My Python code keeps getting killed by OOM [closed]

This is my code to train a policy for a simulated walking robot (ARS on Gym's BipedalWalker-v3):
import os
import numpy as np
import gym
from gym import wrappers
# import pybullet_envs
import gym_recorder
from gym.wrappers.record_video import RecordVideo

ENV_NAME = 'BipedalWalker-v3'
# ENV_NAME = 'HalfCheetahBulletEnv-v0'


class Hp():
    # Hyperparameters
    def __init__(self,
                 nb_steps=1000,
                 episode_length=2000,
                 learning_rate=0.02,
                 num_deltas=16,
                 num_best_deltas=16,
                 noise=0.03,
                 seed=1,
                 env_name='BipedalWalker-v3',
                 record_every=50):
        self.nb_steps = nb_steps
        self.episode_length = episode_length
        self.learning_rate = learning_rate
        self.num_deltas = num_deltas
        self.num_best_deltas = num_best_deltas
        assert self.num_best_deltas <= self.num_deltas
        self.noise = noise
        self.seed = seed
        self.env_name = env_name
        self.record_every = record_every


class Normalizer():
    # Normalizes the inputs
    def __init__(self, nb_inputs):
        self.n = np.zeros(nb_inputs)
        self.mean = np.zeros(nb_inputs)
        self.mean_diff = np.zeros(nb_inputs)
        self.var = np.zeros(nb_inputs)

    def observe(self, x):
        self.n += 1.0
        last_mean = self.mean.copy()
        self.mean += (x - self.mean) / self.n
        self.mean_diff += (x - last_mean) * (x - self.mean)
        self.var = (self.mean_diff / self.n).clip(min=1e-2)

    def normalize(self, inputs):
        obs_mean = self.mean
        obs_std = np.sqrt(self.var)
        return (inputs - obs_mean) / obs_std


class Policy():
    def __init__(self, input_size, output_size, hp):
        self.theta = np.zeros((output_size, input_size))
        self.hp = hp

    def evaluate(self, input, delta=None, direction=None):
        if direction is None:
            return self.theta.dot(input)
        elif direction == "+":
            return (self.theta + self.hp.noise * delta).dot(input)
        elif direction == "-":
            return (self.theta - self.hp.noise * delta).dot(input)

    def sample_deltas(self):
        return [np.random.randn(*self.theta.shape) for _ in range(self.hp.num_deltas)]

    def update(self, rollouts, sigma_rewards):
        # sigma_rewards is the standard deviation of the rewards
        step = np.zeros(self.theta.shape)
        for r_pos, r_neg, delta in rollouts:
            step += (r_pos - r_neg) * delta
        self.theta += self.hp.learning_rate / (self.hp.num_best_deltas * sigma_rewards) * step


class ArsTrainer():
    def __init__(self,
                 hp=None,
                 input_size=None,
                 output_size=None,
                 normalizer=None,
                 policy=None,
                 monitor_dir=None):
        self.hp = hp or Hp()
        np.random.seed(self.hp.seed)
        self.env = gym.make(self.hp.env_name, render_mode="rgb_array")
        if monitor_dir is not None:
            should_record = lambda i: self.record_video
            self.env = RecordVideo(self.env, monitor_dir, episode_trigger=should_record)
        self.hp.episode_length = self.env.spec.max_episode_steps or self.hp.episode_length
        self.input_size = input_size or self.env.observation_space.shape[0]
        self.output_size = output_size or self.env.action_space.shape[0]
        self.normalizer = normalizer or Normalizer(self.input_size)
        self.policy = policy or Policy(self.input_size, self.output_size, self.hp)
        self.record_video = False

    # Explore the policy on one specific direction and over one episode
    def explore(self, direction=None, delta=None):
        state = self.env.reset()[0]
        done = False
        num_plays = 0.0
        sum_rewards = 0.0
        while not done and num_plays < self.hp.episode_length:
            self.normalizer.observe(state)
            state = self.normalizer.normalize(state)
            action = self.policy.evaluate(state, delta, direction)
            state, reward, done, _, _ = self.env.step(action)
            reward = max(min(reward, 1), -1)
            sum_rewards += reward
            num_plays += 1
        return sum_rewards

    def train(self):
        for step in range(self.hp.nb_steps):
            # Initialize the random noise deltas and the positive/negative rewards
            deltas = self.policy.sample_deltas()
            positive_rewards = [0] * self.hp.num_deltas
            negative_rewards = [0] * self.hp.num_deltas
            # Play an episode each with positive deltas and negative deltas, collect rewards
            for k in range(self.hp.num_deltas):
                positive_rewards[k] = self.explore(direction="+", delta=deltas[k])
                negative_rewards[k] = self.explore(direction="-", delta=deltas[k])
            # Compute the standard deviation of all rewards
            sigma_rewards = np.array(positive_rewards + negative_rewards).std()
            # Sort the rollouts by max(r_pos, r_neg) and select the deltas with the best rewards
            scores = {k: max(r_pos, r_neg) for k, (r_pos, r_neg) in enumerate(zip(positive_rewards, negative_rewards))}
            order = sorted(scores.keys(), key=lambda x: scores[x], reverse=True)[:self.hp.num_best_deltas]
            rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]
            # Update the policy
            self.policy.update(rollouts, sigma_rewards)
            # Only record video during evaluation, every n steps
            if step % self.hp.record_every == 0:
                self.record_video = True
            # Play an episode with the new weights and print the score
            reward_evaluation = self.explore()
            print('Step: ', step, 'Reward: ', reward_evaluation)
            self.record_video = False


def mkdir(base, name):
    path = os.path.join(base, name)
    if not os.path.exists(path):
        os.makedirs(path)
    return path


# Main code
if __name__ == '__main__':
    videos_dir = mkdir('.', 'videos')
    monitor_dir = mkdir(videos_dir, ENV_NAME)
    hp = Hp(seed=1946, env_name=ENV_NAME)
    trainer = ArsTrainer(hp=hp, monitor_dir=monitor_dir)
    trainer.train()
I am running this code in a Xubuntu (Linux) virtual machine. The process gets killed after around 300 training steps; the terminal just prints "Killed". When I check the syslog, it shows the following:
Jan 22 16:18:46 snigdh-VirtualBox kernel: [24929.974828] Out of memory: Killed process 58156 (python3) total-vm:2734864kB, anon-rss:2325768kB, file-rss:0kB, shmem-rss:4kB, UID:1000 pgtables:5000kB oom_score_adj:0
Jan 22 16:18:46 snigdh-VirtualBox systemd[1]: session-c1.scope: A process of this unit has been killed by the OOM killer.
To check memory usage, I print the resident memory of the process on each step with
print(psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2)
but it never shows a value above about 900 MiB.
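For reference, a self-contained version of that check (with the psutil and os imports, which are not shown in the listing above; the helper name is just mine for illustration):

    import os
    import psutil

    def print_rss_mib():
        # Print the resident set size (RSS) of the current process, in MiB
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
        print(f"RSS: {rss_bytes / 1024 ** 2:.1f} MiB")

I call it once per training step.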
I also tried lowering the precision of the Normalizer buffers, using

    self.mean = np.zeros(nb_inputs, dtype=np.float16)

instead of

    self.mean = np.zeros(nb_inputs)

but that causes the reward to stop increasing (probably because the low precision keeps the reinforcement learning model from learning properly). So I need help: what should I do?
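For completeness, the lower-precision Normalizer.__init__ I tried looked roughly like this (assuming the same dtype on all four buffers; the rest of the class is unchanged):

    class Normalizer():
        # Same normalizer as above, but with float16 buffers to cut memory use
        def __init__(self, nb_inputs):
            self.n = np.zeros(nb_inputs, dtype=np.float16)
            self.mean = np.zeros(nb_inputs, dtype=np.float16)
            self.mean_diff = np.zeros(nb_inputs, dtype=np.float16)
            self.var = np.zeros(nb_inputs, dtype=np.float16)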