diff options
author | Josip Sokcevic <sokcevic@chromium.org> | 2024-10-07 17:33:38 +0000 |
---|---|---|
committer | LUCI <gerrit-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2024-10-07 18:44:19 +0000 |
commit | 454fdaf1191c87e5c770ab865a911e10e600e178 (patch) | |
tree | a20af3e4f05b48f28d30346648ab9fa1be7f4f64 /subcmds/sync.py | |
parent | f7f9dd4deb3b92bf175a0411dac60e7b6fdd9cfa (diff) | |
download | git-repo-454fdaf1191c87e5c770ab865a911e10e600e178.tar.gz |
sync: Always use WORKER_BATCH_SIZE (tag: v2.48)
With 551285fa35ccd0836513e9cf64ee8d3372e5e3f4, the comment about number
of workers no longer stands - the dict is shared among worker processes and
real-time information is available.
Using 2.7k projects as the baseline, using chunk size of 4 takes close
to 5 minutes. A chunk size of 32 takes this down to 40s - a reduction of
roughly 8 times which matches the increase.
R=gavinmak@google.com
Bug: b/371638995
Change-Id: Ida5fd8f7abc44b3b82c02aa0f7f7ae01dff5eb07
Reviewed-on: https://gerrit-review.googlesource.com/c/git-repo/+/438523
Commit-Queue: Josip Sokcevic <sokcevic@google.com>
Tested-by: Josip Sokcevic <sokcevic@google.com>
Reviewed-by: Gavin Mak <gavinmak@google.com>
Diffstat (limited to 'subcmds/sync.py')
-rw-r--r-- | subcmds/sync.py | 27 |
1 files changed, 10 insertions, 17 deletions
diff --git a/subcmds/sync.py b/subcmds/sync.py index 0ae59f55..bebe18b9 100644 --- a/subcmds/sync.py +++ b/subcmds/sync.py | |||
@@ -131,6 +131,11 @@ def _SafeCheckoutOrder(checkouts: List[Project]) -> List[List[Project]]: | |||
131 | return res | 131 | return res |
132 | 132 | ||
133 | 133 | ||
134 | def _chunksize(projects: int, jobs: int) -> int: | ||
135 | """Calculate chunk size for the given number of projects and jobs.""" | ||
136 | return min(max(1, projects // jobs), WORKER_BATCH_SIZE) | ||
137 | |||
138 | |||
134 | class _FetchOneResult(NamedTuple): | 139 | class _FetchOneResult(NamedTuple): |
135 | """_FetchOne return value. | 140 | """_FetchOne return value. |
136 | 141 | ||
@@ -819,7 +824,6 @@ later is required to fix a server side protocol bug. | |||
819 | def _Fetch(self, projects, opt, err_event, ssh_proxy, errors): | 824 | def _Fetch(self, projects, opt, err_event, ssh_proxy, errors): |
820 | ret = True | 825 | ret = True |
821 | 826 | ||
822 | jobs = opt.jobs_network | ||
823 | fetched = set() | 827 | fetched = set() |
824 | remote_fetched = set() | 828 | remote_fetched = set() |
825 | pm = Progress( | 829 | pm = Progress( |
@@ -849,6 +853,8 @@ later is required to fix a server side protocol bug. | |||
849 | objdir_project_map.setdefault(project.objdir, []).append(project) | 853 | objdir_project_map.setdefault(project.objdir, []).append(project) |
850 | projects_list = list(objdir_project_map.values()) | 854 | projects_list = list(objdir_project_map.values()) |
851 | 855 | ||
856 | jobs = min(opt.jobs_network, len(projects_list)) | ||
857 | |||
852 | def _ProcessResults(results_sets): | 858 | def _ProcessResults(results_sets): |
853 | ret = True | 859 | ret = True |
854 | for results in results_sets: | 860 | for results in results_sets: |
@@ -888,35 +894,22 @@ later is required to fix a server side protocol bug. | |||
888 | Sync.ssh_proxy = None | 894 | Sync.ssh_proxy = None |
889 | 895 | ||
890 | # NB: Multiprocessing is heavy, so don't spin it up for one job. | 896 | # NB: Multiprocessing is heavy, so don't spin it up for one job. |
891 | if len(projects_list) == 1 or jobs == 1: | 897 | if jobs == 1: |
892 | self._FetchInitChild(ssh_proxy) | 898 | self._FetchInitChild(ssh_proxy) |
893 | if not _ProcessResults( | 899 | if not _ProcessResults( |
894 | self._FetchProjectList(opt, x) for x in projects_list | 900 | self._FetchProjectList(opt, x) for x in projects_list |
895 | ): | 901 | ): |
896 | ret = False | 902 | ret = False |
897 | else: | 903 | else: |
898 | # Favor throughput over responsiveness when quiet. It seems that | 904 | if not opt.quiet: |
899 | # imap() will yield results in batches relative to chunksize, so | ||
900 | # even as the children finish a sync, we won't see the result until | ||
901 | # one child finishes ~chunksize jobs. When using a large --jobs | ||
902 | # with large chunksize, this can be jarring as there will be a large | ||
903 | # initial delay where repo looks like it isn't doing anything and | ||
904 | # sits at 0%, but then suddenly completes a lot of jobs all at once. | ||
905 | # Since this code is more network bound, we can accept a bit more | ||
906 | # CPU overhead with a smaller chunksize so that the user sees more | ||
907 | # immediate & continuous feedback. | ||
908 | if opt.quiet: | ||
909 | chunksize = WORKER_BATCH_SIZE | ||
910 | else: | ||
911 | pm.update(inc=0, msg="warming up") | 905 | pm.update(inc=0, msg="warming up") |
912 | chunksize = 4 | ||
913 | with multiprocessing.Pool( | 906 | with multiprocessing.Pool( |
914 | jobs, initializer=self._FetchInitChild, initargs=(ssh_proxy,) | 907 | jobs, initializer=self._FetchInitChild, initargs=(ssh_proxy,) |
915 | ) as pool: | 908 | ) as pool: |
916 | results = pool.imap_unordered( | 909 | results = pool.imap_unordered( |
917 | functools.partial(self._FetchProjectList, opt), | 910 | functools.partial(self._FetchProjectList, opt), |
918 | projects_list, | 911 | projects_list, |
919 | chunksize=chunksize, | 912 | chunksize=_chunksize(len(projects_list), jobs), |
920 | ) | 913 | ) |
921 | if not _ProcessResults(results): | 914 | if not _ProcessResults(results): |
922 | ret = False | 915 | ret = False |