From d246d1fee7f42f2526a20a96597c8f01eda31433 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 24 Feb 2021 12:50:30 -0500 Subject: grep: add --jobs support Use multiprocessing to run in parallel. When operating on multiple projects, this can greatly speed things up. Across 1000 repos, it goes from ~40sec to ~16sec with the default -j8. The output processing does not appear to be a significant bottle neck -- it accounts for <1sec out of the ~16sec runtime. Thus we leave it in the main thread to simplify the code. Change-Id: I750b72c7711b0c5d26e65d480738fbaac3a69971 Reviewed-on: https://gerrit-review.googlesource.com/c/git-repo/+/297984 Reviewed-by: Chris Mcdonald Tested-by: Mike Frysinger --- subcmds/grep.py | 118 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 72 insertions(+), 46 deletions(-) (limited to 'subcmds/grep.py') diff --git a/subcmds/grep.py b/subcmds/grep.py index c16d4185..49feaf6b 100644 --- a/subcmds/grep.py +++ b/subcmds/grep.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import functools +import multiprocessing import sys from color import Coloring -from command import PagedCommand +from command import DEFAULT_LOCAL_JOBS, PagedCommand, WORKER_BATCH_SIZE from error import GitError from git_command import GitCommand @@ -61,6 +63,7 @@ contain a line that matches both expressions: repo grep --all-match -e NODE -e Unexpected """ + PARALLEL_JOBS = DEFAULT_LOCAL_JOBS @staticmethod def _carry_option(_option, opt_str, value, parser): @@ -80,6 +83,7 @@ contain a line that matches both expressions: pt.append(value) def _Options(self, p): + super()._Options(p) g = p.add_option_group('Sources') g.add_option('--cached', action='callback', callback=self._carry_option, @@ -152,73 +156,49 @@ contain a line that matches both expressions: action='callback', callback=self._carry_option, help='Show only file names not containing matching lines') - def Execute(self, opt, args): - out = GrepColoring(self.manifest.manifestProject.config) - - cmd_argv = ['grep'] - if out.is_on: - cmd_argv.append('--color') - cmd_argv.extend(getattr(opt, 'cmd_argv', [])) - - if '-e' not in cmd_argv: - if not args: - self.Usage() - cmd_argv.append('-e') - cmd_argv.append(args[0]) - args = args[1:] - - projects = self.GetProjects(args) + def _ExecuteOne(self, cmd_argv, project): + """Process one project.""" + try: + p = GitCommand(project, + cmd_argv, + bare=False, + capture_stdout=True, + capture_stderr=True) + except GitError as e: + return (project, -1, None, str(e)) - full_name = False - if len(projects) > 1: - cmd_argv.append('--full-name') - full_name = True - - have_rev = False - if opt.revision: - if '--cached' in cmd_argv: - print('fatal: cannot combine --cached and --revision', file=sys.stderr) - sys.exit(1) - have_rev = True - cmd_argv.extend(opt.revision) - cmd_argv.append('--') + return (project, p.Wait(), p.stdout, p.stderr) + @staticmethod + def _ProcessResults(out, full_name, have_rev, results): git_failed = False bad_rev = False have_match = False - for project in projects: - try: - p = GitCommand(project, - cmd_argv, - bare=False, - capture_stdout=True, - capture_stderr=True) - except GitError as e: + for project, rc, stdout, stderr in results: + if rc < 0: git_failed = True out.project('--- project %s ---' % project.relpath) out.nl() - out.fail('%s', str(e)) + out.fail('%s', stderr) out.nl() continue - if p.Wait() != 0: + if rc: # no results - # - if p.stderr: - if have_rev and 'fatal: ambiguous argument' in p.stderr: + if stderr: + if have_rev and 'fatal: ambiguous argument' in stderr: bad_rev = True else: out.project('--- project %s ---' % project.relpath) out.nl() - out.fail('%s', p.stderr.strip()) + out.fail('%s', stderr.strip()) out.nl() continue have_match = True # We cut the last element, to avoid a blank line. - # - r = p.stdout.split('\n') + r = stdout.split('\n') r = r[0:-1] if have_rev and full_name: @@ -240,6 +220,52 @@ contain a line that matches both expressions: for line in r: print(line) + return (git_failed, bad_rev, have_match) + + def Execute(self, opt, args): + out = GrepColoring(self.manifest.manifestProject.config) + + cmd_argv = ['grep'] + if out.is_on: + cmd_argv.append('--color') + cmd_argv.extend(getattr(opt, 'cmd_argv', [])) + + if '-e' not in cmd_argv: + if not args: + self.Usage() + cmd_argv.append('-e') + cmd_argv.append(args[0]) + args = args[1:] + + projects = self.GetProjects(args) + + full_name = False + if len(projects) > 1: + cmd_argv.append('--full-name') + full_name = True + + have_rev = False + if opt.revision: + if '--cached' in cmd_argv: + print('fatal: cannot combine --cached and --revision', file=sys.stderr) + sys.exit(1) + have_rev = True + cmd_argv.extend(opt.revision) + cmd_argv.append('--') + + process_results = functools.partial( + self._ProcessResults, out, full_name, have_rev) + # NB: Multiprocessing is heavy, so don't spin it up for one job. + if len(projects) == 1 or opt.jobs == 1: + git_failed, bad_rev, have_match = process_results( + self._ExecuteOne(cmd_argv, x) for x in projects) + else: + with multiprocessing.Pool(opt.jobs) as pool: + results = pool.imap( + functools.partial(self._ExecuteOne, cmd_argv), projects, + chunksize=WORKER_BATCH_SIZE) + git_failed, bad_rev, have_match = process_results(results) + if git_failed: sys.exit(1) elif have_match: -- cgit v1.2.3-54-g00ecf