
Commit efe2967

Fix interface for Pendulum-v1 and new gym versions
1 parent 231ad37

File tree

5 files changed: +35, -33 lines

pyproject.toml
src/pytorch_mppi/mppi.py
tests/pendulum.py
tests/pendulum_approximate.py
tests/pendulum_approximate_continuous.py

Diff for: pyproject.toml (+2, -2)

@@ -1,6 +1,6 @@
 [project]
 name = "pytorch_mppi"
-version = "0.7.2"
+version = "0.7.3"
 description = "Model Predictive Path Integral (MPPI) implemented in pytorch"
 readme = "README.md"  # Optional
 
@@ -73,7 +73,7 @@ tune = [
 ]
 test = [
     "pytest",
-    'gym<=0.20',
+    'gym',
     'pygame',
     'pyglet==1.5.27',
     'window-recorder',

Diff for: src/pytorch_mppi/mppi.py (+12, -5)

@@ -203,6 +203,14 @@ def _dynamics(self, state, u, t):
     def _running_cost(self, state, u, t):
         return self.running_cost(state, u, t) if self.step_dependency else self.running_cost(state, u)
 
+    def shift_nominal_trajectory(self):
+        """
+        Shift the nominal trajectory forward one step
+        """
+        # shift command 1 time step
+        self.U = torch.roll(self.U, -1, dims=0)
+        self.U[-1] = self.u_init
+
     def command(self, state, shift_nominal_trajectory=True):
         """
         :param state: (nx) or (K x nx) current state, or samples of states (for propagating a distribution of states)
@@ -211,9 +219,7 @@ def command(self, state, shift_nominal_trajectory=True):
         :returns action: (nu) best action
         """
         if shift_nominal_trajectory:
-            # shift command 1 time step
-            self.U = torch.roll(self.U, -1, dims=0)
-            self.U[-1] = self.u_init
+            self.shift_nominal_trajectory()
 
         return self._command(state)
 
@@ -360,11 +366,12 @@ def run_mppi(mppi, env, retrain_dynamics, retrain_after_iter=50, iter=1000, rend
     dataset = torch.zeros((retrain_after_iter, mppi.nx + mppi.nu), dtype=mppi.U.dtype, device=mppi.d)
     total_reward = 0
     for i in range(iter):
-        state = env.state.copy()
+        state = env.unwrapped.state.copy()
         command_start = time.perf_counter()
        action = mppi.command(state)
         elapsed = time.perf_counter() - command_start
-        s, r, _, _ = env.step(action.cpu().numpy())
+        res = env.step(action.cpu().numpy())
+        s, r = res[0], res[1]
         total_reward += r
         logger.debug("action taken: %.4f cost received: %.4f time taken: %.5fs", action, -r, elapsed)
         if render:
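
Context for the `res = env.step(...)` change above: gym 0.26 and later return a 5-tuple (obs, reward, terminated, truncated, info) from step() instead of the old 4-tuple (obs, reward, done, info), so unpacking only the first two elements keeps run_mppi working on both. A minimal sketch of version-agnostic unpacking; the helper name step_env is illustrative and not part of the library:

def step_env(env, action):
    """Step an env and return (obs, reward, done) under both old and new gym step APIs."""
    res = env.step(action)
    if len(res) == 5:  # gym >= 0.26: obs, reward, terminated, truncated, info
        obs, reward, terminated, truncated, _ = res
        done = terminated or truncated
    else:  # older gym: obs, reward, done, info
        obs, reward, done, _ = res
    return obs, reward, done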

Diff for: tests/pendulum.py (+8, -11)

@@ -4,7 +4,7 @@
 import logging
 import math
 from pytorch_mppi import mppi
-from gym import wrappers, logger as gym_log
+from gym import logger as gym_log
 
 gym_log.set_level(gym_log.INFO)
 logger = logging.getLogger(__name__)
@@ -13,7 +13,7 @@
                     datefmt='%m-%d %H:%M:%S')
 
 if __name__ == "__main__":
-    ENV_NAME = "Pendulum-v0"
+    ENV_NAME = "Pendulum-v1"
     TIMESTEPS = 15  # T
     N_SAMPLES = 100  # K
     ACTION_LOW = -2.0
@@ -40,9 +40,9 @@ def dynamics(state, perturbed_action):
        u = perturbed_action
        u = torch.clamp(u, -2, 2)
 
-        newthdot = thdot + (-3 * g / (2 * l) * np.sin(th + np.pi) + 3. / (m * l ** 2) * u) * dt
+        newthdot = thdot + (3 * g / (2 * l) * np.sin(th) + 3.0 / (m * l ** 2) * u) * dt
+        newthdot = np.clip(newthdot, -8, 8)
         newth = th + newthdot * dt
-        newthdot = torch.clamp(newthdot, -8, 8)
 
         state = torch.cat((newth, newthdot), dim=1)
         return state
@@ -65,18 +65,15 @@ def train(new_data):
 
 
     downward_start = True
-    env = gym.make(ENV_NAME).env  # bypass the default TimeLimit wrapper
-    env.reset()
-    if downward_start:
-        env.state = [np.pi, 1]
+    env = gym.make(ENV_NAME, render_mode="human")
 
-    env = wrappers.Monitor(env, '/tmp/mppi/', force=True)
     env.reset()
     if downward_start:
-        env.env.state = [np.pi, 1]
+        env.state = env.unwrapped.state = [np.pi, 1]
 
     nx = 2
     mppi_gym = mppi.MPPI(dynamics, running_cost, nx, noise_sigma, num_samples=N_SAMPLES, horizon=TIMESTEPS,
-                         lambda_=lambda_)
+                         lambda_=lambda_, u_min=torch.tensor(ACTION_LOW, device=d),
+                         u_max=torch.tensor(ACTION_HIGH, device=d), device=d)
     total_reward = mppi.run_mppi(mppi_gym, env, train)
     logger.info("Total reward %f", total_reward)

Diff for: tests/pendulum_approximate.py (+5, -6)

@@ -4,7 +4,7 @@
 import logging
 import math
 from pytorch_mppi import mppi
-from gym import wrappers, logger as gym_log
+from gym import logger as gym_log
 
 gym_log.set_level(gym_log.INFO)
 logger = logging.getLogger(__name__)
@@ -13,7 +13,7 @@
                     datefmt='%m-%d %H:%M:%S')
 
 if __name__ == "__main__":
-    ENV_NAME = "Pendulum-v0"
+    ENV_NAME = "Pendulum-v1"
     TIMESTEPS = 30  # T
     N_SAMPLES = 1000  # K
     ACTION_LOW = -2.0
@@ -168,10 +168,10 @@ def train(new_data):
 
 
     downward_start = True
-    env = gym.make(ENV_NAME).env  # bypass the default TimeLimit wrapper
+    env = gym.make(ENV_NAME, render_mode="human").env  # bypass the default TimeLimit wrapper
     env.reset()
     if downward_start:
-        env.state = [np.pi, 1]
+        env.state = env.unwrapped.state = [np.pi, 1]
 
     # bootstrap network with random actions
     if BOOT_STRAP_ITER:
@@ -188,10 +188,9 @@ def train(new_data):
        train(new_data)
        logger.info("bootstrapping finished")
 
-    env = wrappers.Monitor(env, '/tmp/mppi/', force=True)
     env.reset()
     if downward_start:
-        env.env.state = [np.pi, 1]
+        env.state = env.unwrapped.state = [np.pi, 1]
 
     mppi_gym = mppi.MPPI(dynamics, running_cost, nx, noise_sigma, num_samples=N_SAMPLES, horizon=TIMESTEPS,
                          lambda_=lambda_, device=d, u_min=torch.tensor(ACTION_LOW, dtype=torch.double, device=d),
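
The render_mode="human" argument added to gym.make reflects another interface change: in gym 0.26+ the render mode is declared once at construction rather than passed per frame to env.render(mode=...), and the classic-control environments then handle window rendering themselves as they step. A small sketch of the newer pattern, assuming gym >= 0.26:

import gym

# render mode is fixed at construction; env.render(mode=...) is no longer used
env = gym.make("Pendulum-v1", render_mode="human")
obs, info = env.reset()
for _ in range(200):
    action = env.action_space.sample()  # random torque, just to drive the window
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        obs, info = env.reset()
env.close()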

Diff for: tests/pendulum_approximate_continuous.py (+8, -9)

@@ -8,7 +8,7 @@
 import logging
 import math
 from pytorch_mppi import mppi
-from gym import wrappers, logger as gym_log
+from gym import logger as gym_log
 
 gym_log.set_level(gym_log.INFO)
 logger = logging.getLogger(__name__)
@@ -17,7 +17,7 @@
                     datefmt='%m-%d %H:%M:%S')
 
 if __name__ == "__main__":
-    ENV_NAME = "Pendulum-v0"
+    ENV_NAME = "Pendulum-v1"
     TIMESTEPS = 15  # T
     N_SAMPLES = 100  # K
     ACTION_LOW = -2.0
@@ -87,9 +87,9 @@ def true_dynamics(state, perturbed_action):
        u = perturbed_action
        u = torch.clamp(u, -2, 2)
 
-        newthdot = thdot + (-3 * g / (2 * l) * torch.sin(th + np.pi) + 3. / (m * l ** 2) * u) * dt
+        newthdot = thdot + (3 * g / (2 * l) * torch.sin(th) + 3.0 / (m * l ** 2) * u) * dt
+        newthdot = torch.clip(newthdot, -8, 8)
         newth = th + newthdot * dt
-        newthdot = torch.clamp(newthdot, -8, 8)
 
         state = torch.cat((newth, newthdot), dim=1)
         return state
@@ -176,10 +176,10 @@ def train(new_data):
 
 
     downward_start = True
-    env = gym.make(ENV_NAME).env  # bypass the default TimeLimit wrapper
+    env = gym.make(ENV_NAME, render_mode="human").env  # bypass the default TimeLimit wrapper
     env.reset()
     if downward_start:
-        env.state = [np.pi, 1]
+        env.state = env.unwrapped.state = [np.pi, 1]
 
     # bootstrap network with random actions
     if BOOT_STRAP_ITER:
@@ -196,12 +196,11 @@ def train(new_data):
        train(new_data)
        logger.info("bootstrapping finished")
 
-    env = wrappers.Monitor(env, '/tmp/mppi/', force=True)
     env.reset()
     if downward_start:
-        env.env.state = [np.pi, 1]
+        env.state = env.unwrapped.state = [np.pi, 1]
 
-    mppi_gym = mppi.MPPI(dynamics, running_cost, nx, noise_sigma, num_samples=N_SAMPLES, horizon=TIMESTEPS,
+    mppi_gym = mppi.MPPI(true_dynamics, running_cost, nx, noise_sigma, num_samples=N_SAMPLES, horizon=TIMESTEPS,
                          lambda_=lambda_, device=d, u_min=torch.tensor(ACTION_LOW, dtype=torch.double, device=d),
                          u_max=torch.tensor(ACTION_HIGH, dtype=torch.double, device=d))
     total_reward, data = mppi.run_mppi(mppi_gym, env, train)
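
On the dynamics/true_dynamics edits in these test files: since sin(th + pi) = -sin(th), the gravity term is algebraically unchanged; the substantive change is clipping the angular velocity to +/-8 before integrating the angle, matching the Pendulum-v1 update order. A hedged torch sketch of that update, batched over K rollout states, with constants matching gym's pendulum source:

import torch

# pendulum constants from the gym env (g is 10.0 there, not 9.81)
g, m, l, dt = 10.0, 1.0, 1.0, 0.05
max_speed, max_torque = 8.0, 2.0


def pendulum_step(state, u):
    """One Euler step of the Pendulum-v1 update rule over a (K x 2) batch of [th, thdot] states."""
    th, thdot = state[:, 0].view(-1, 1), state[:, 1].view(-1, 1)
    u = torch.clamp(u, -max_torque, max_torque)
    newthdot = thdot + (3 * g / (2 * l) * torch.sin(th) + 3.0 / (m * l ** 2) * u) * dt
    newthdot = torch.clamp(newthdot, -max_speed, max_speed)  # clip before integrating the angle
    newth = th + newthdot * dt
    return torch.cat((newth, newthdot), dim=1)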

0 commit comments
