diff options
author | Josip Sokcevic <sokcevic@chromium.org> | 2024-10-07 17:33:38 +0000 |
---|---|---|
committer | LUCI <gerrit-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2024-10-07 18:44:19 +0000 |
commit | 454fdaf1191c87e5c770ab865a911e10e600e178 (patch) | |
tree | a20af3e4f05b48f28d30346648ab9fa1be7f4f64 /subcmds/sync.py | |
parent | f7f9dd4deb3b92bf175a0411dac60e7b6fdd9cfa (diff) | |
download | git-repo-454fdaf1191c87e5c770ab865a911e10e600e178.tar.gz |
sync: Always use WORKER_BATCH_SIZE (tag: v2.48)
With 551285fa35ccd0836513e9cf64ee8d3372e5e3f4, the comment about number
of workers no longer stands - the dict is shared among worker processes and
real-time information is available.
Using 2.7k projects as the baseline, using chunk size of 4 takes close
to 5 minutes. A chunk size of 32 takes this down to 40s - a reduction of
roughly 8 times which matches the increase.
R=gavinmak@google.com
Bug: b/371638995
Change-Id: Ida5fd8f7abc44b3b82c02aa0f7f7ae01dff5eb07
Reviewed-on: https://gerrit-review.googlesource.com/c/git-repo/+/438523
Commit-Queue: Josip Sokcevic <sokcevic@google.com>
Tested-by: Josip Sokcevic <sokcevic@google.com>
Reviewed-by: Gavin Mak <gavinmak@google.com>
Diffstat (limited to 'subcmds/sync.py')
-rw-r--r-- | subcmds/sync.py | 27 |
1 files changed, 10 insertions, 17 deletions
diff --git a/subcmds/sync.py b/subcmds/sync.py index 0ae59f55..bebe18b9 100644 --- a/subcmds/sync.py +++ b/subcmds/sync.py | |||
@@ -131,6 +131,11 @@ def _SafeCheckoutOrder(checkouts: List[Project]) -> List[List[Project]]: | |||
131 | return res | 131 | return res |
132 | 132 | ||
133 | 133 | ||
134 | def _chunksize(projects: int, jobs: int) -> int: | ||
135 | """Calculate chunk size for the given number of projects and jobs.""" | ||
136 | return min(max(1, projects // jobs), WORKER_BATCH_SIZE) | ||
137 | |||
138 | |||
134 | class _FetchOneResult(NamedTuple): | 139 | class _FetchOneResult(NamedTuple): |
135 | """_FetchOne return value. | 140 | """_FetchOne return value. |
136 | 141 | ||
@@ -819,7 +824,6 @@ later is required to fix a server side protocol bug. | |||
819 | def _Fetch(self, projects, opt, err_event, ssh_proxy, errors): | 824 | def _Fetch(self, projects, opt, err_event, ssh_proxy, errors): |
820 | ret = True | 825 | ret = True |
821 | 826 | ||
822 | jobs = opt.jobs_network | ||
823 | fetched = set() | 827 | fetched = set() |
824 | remote_fetched = set() | 828 | remote_fetched = set() |
825 | pm = Progress( | 829 | pm = Progress( |
@@ -849,6 +853,8 @@ later is required to fix a server side protocol bug. | |||
849 | objdir_project_map.setdefault(project.objdir, []).append(project) | 853 | objdir_project_map.setdefault(project.objdir, []).append(project) |
850 | projects_list = list(objdir_project_map.values()) | 854 | projects_list = list(objdir_project_map.values()) |
851 | 855 | ||
856 | jobs = min(opt.jobs_network, len(projects_list)) | ||
857 | |||
852 | def _ProcessResults(results_sets): | 858 | def _ProcessResults(results_sets): |
853 | ret = True | 859 | ret = True |
854 | for results in results_sets: | 860 | for results in results_sets: |
@@ -888,35 +894,22 @@ later is required to fix a server side protocol bug. | |||
888 | Sync.ssh_proxy = None | 894 | Sync.ssh_proxy = None |
889 | 895 | ||
890 | # NB: Multiprocessing is heavy, so don't spin it up for one job. | 896 | # NB: Multiprocessing is heavy, so don't spin it up for one job. |
891 | if len(projects_list) == 1 or jobs == 1: | 897 | if jobs == 1: |
892 | self._FetchInitChild(ssh_proxy) | 898 | self._FetchInitChild(ssh_proxy) |
893 | if not _ProcessResults( | 899 | if not _ProcessResults( |
894 | self._FetchProjectList(opt, x) for x in projects_list | 900 | self._FetchProjectList(opt, x) for x in projects_list |
895 | ): | 901 | ): |
896 | ret = False | 902 | ret = False |
897 | else: | 903 | else: |
898 | # Favor throughput over responsiveness when quiet. It seems that | 904 | if not opt.quiet: |
899 | # imap() will yield results in batches relative to chunksize, so | ||
900 | # even as the children finish a sync, we won't see the result until | ||
901 | # one child finishes ~chunksize jobs. When using a large --jobs | ||
902 | # with large chunksize, this can be jarring as there will be a large | ||
903 | # initial delay where repo looks like it isn't doing anything and | ||
904 | # sits at 0%, but then suddenly completes a lot of jobs all at once. | ||
905 | # Since this code is more network bound, we can accept a bit more | ||
906 | # CPU overhead with a smaller chunksize so that the user sees more | ||
907 | # immediate & continuous feedback. | ||
908 | if opt.quiet: | ||
909 | chunksize = WORKER_BATCH_SIZE | ||
910 | else: | ||
911 | pm.update(inc=0, msg="warming up") | 905 | pm.update(inc=0, msg="warming up") |
912 | chunksize = 4 | ||
913 | with multiprocessing.Pool( | 906 | with multiprocessing.Pool( |
914 | jobs, initializer=self._FetchInitChild, initargs=(ssh_proxy,) | 907 | jobs, initializer=self._FetchInitChild, initargs=(ssh_proxy,) |
915 | ) as pool: | 908 | ) as pool: |
916 | results = pool.imap_unordered( | 909 | results = pool.imap_unordered( |
917 | functools.partial(self._FetchProjectList, opt), | 910 | functools.partial(self._FetchProjectList, opt), |
918 | projects_list, | 911 | projects_list, |
919 | chunksize=chunksize, | 912 | chunksize=_chunksize(len(projects_list), jobs), |
920 | ) | 913 | ) |
921 | if not _ProcessResults(results): | 914 | if not _ProcessResults(results): |
922 | ret = False | 915 | ret = False |