Delay entering TrialRunner context until run_trial (#970)

bpkroth · web-flow · commit 19028d81d006 · 2025-05-12T13:42:21.000-07:00
# Pull Request ## Title Delay entering `TrialRunner` context until `run_trial`. ______________________________________________________________________ ## Description This is part of an attempt to try and see if can work around issues with `multiprocessing.Pool` needing to pickle certain objects when forking. For instance, if the Environment is using an SshServer, we need to start an EventLoopContext in the background to handle the SSH connections and threads are not picklable. Nor are file handles, DB connections, etc., so there may be other things we also need to adjust to make this work. See Also #967 ______________________________________________________________________ ## Type of Change - 🛠️ Bug fix - 🔄 Refactor ______________________________________________________________________ ## Testing - Light so far (still in draft mode) - Just basic existing CI tests (seems to not break anything) ______________________________________________________________________ ## Additional Notes (optional) I think this is incomplete. To support forking inside the Scheduler and *then* entering the context of the given TrialRunner, we may also need to do something about the Scheduler's Storage object. That was true, those PRs are now forthcoming. See Also #971 For now this is a draft PR to allow @jsfreischuetz and I to play with alternative organizations of #967. ______________________________________________________________________
diff --git a/mlos_bench/mlos_bench/schedulers/base_scheduler.py b/mlos_bench/mlos_bench/schedulers/base_scheduler.py
@@ -200,8 +200,9 @@ def __enter__(self) -> "Scheduler":
         _LOG.debug("Scheduler START :: %s", self)
         assert self.experiment is None
         assert not self._in_context
-        for trial_runner in self._trial_runners.values():
-            trial_runner.__enter__()
+        # NOTE: We delay entering the context of trial_runners until it's time
+        # to run the trial in order to avoid incompatibilities with
+        # multiprocessing.Pool.
         self._optimizer.__enter__()
         # Start new or resume the existing experiment. Verify that the
         # experiment configuration is compatible with the previous runs.
@@ -235,7 +236,8 @@ def __exit__(
         self._experiment.__exit__(ex_type, ex_val, ex_tb)
         self._optimizer.__exit__(ex_type, ex_val, ex_tb)
         for trial_runner in self._trial_runners.values():
-            trial_runner.__exit__(ex_type, ex_val, ex_tb)
+            # TrialRunners should have already exited their context after running the Trial.
+            assert not trial_runner._in_context  # pylint: disable=protected-access
         self._experiment = None
         self._in_context = False
         return False  # Do not suppress exceptions
@@ -267,7 +269,8 @@ def teardown(self) -> None:
         if self._do_teardown:
             for trial_runner in self._trial_runners.values():
                 assert not trial_runner.is_running
-                trial_runner.teardown()
+                with trial_runner:
+                    trial_runner.teardown()
 
     def get_best_observation(self) -> tuple[dict[str, float] | None, TunableGroups | None]:
         """Get the best observation from the optimizer."""
diff --git a/mlos_bench/mlos_bench/schedulers/sync_scheduler.py b/mlos_bench/mlos_bench/schedulers/sync_scheduler.py
@@ -39,5 +39,6 @@ def run_trial(self, trial: Storage.Trial) -> None:
         super().run_trial(trial)
         # In the sync scheduler we run each trial on its own TrialRunner in sequence.
         trial_runner = self.get_trial_runner(trial)
-        trial_runner.run_trial(trial, self.global_config)
-        _LOG.info("QUEUE: Finished trial: %s on %s", trial, trial_runner)
+        with trial_runner:
+            trial_runner.run_trial(trial, self.global_config)
+            _LOG.info("QUEUE: Finished trial: %s on %s", trial, trial_runner)