My Python code keeps getting killed by OOM [closed]

This is my code to train a walking robot (ARS on BipedalWalker-v3):

import os
import numpy as np
import gym
from gym import wrappers
# import pybullet_envs
import gym_recorder
from gym.wrappers.record_video import RecordVideo

ENV_NAME = 'BipedalWalker-v3'
# ENV_NAME = 'HalfCheetahBulletEnv-v0'

class Hp():
    # Hyperparameters
    def __init__(self,
                 nb_steps=1000,
                 episode_length=2000,
                 learning_rate=0.02,
                 num_deltas=16,
                 num_best_deltas=16,
                 noise=0.03,
                 seed=1,
                 env_name='BipedalWalker-v3',
                 record_every=50):

        self.nb_steps = nb_steps
        self.episode_length = episode_length
        self.learning_rate = learning_rate
        self.num_deltas = num_deltas
        self.num_best_deltas = num_best_deltas
        assert self.num_best_deltas <= self.num_deltas
        self.noise = noise
        self.seed = seed
        self.env_name = env_name
        self.record_every = record_every


class Normalizer():
    # Normalizes the inputs
    def __init__(self, nb_inputs):
        self.n = np.zeros(nb_inputs)
        self.mean = np.zeros(nb_inputs)
        self.mean_diff = np.zeros(nb_inputs)
        self.var = np.zeros(nb_inputs)

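    # Welford-style online update of the running mean and variance (O(1) memory per feature)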
    def observe(self, x):
        self.n += 1.0
        last_mean = self.mean.copy()        
        self.mean += (x - self.mean) / self.n     
        self.mean_diff += (x - last_mean) * (x - self.mean)       
        self.var = (self.mean_diff / self.n).clip(min = 1e-2)

    def normalize(self, inputs):
        obs_mean = self.mean
        obs_std = np.sqrt(self.var)
        return (inputs - obs_mean) / obs_std


class Policy():
    def __init__(self, input_size, output_size, hp):
        self.theta = np.zeros((output_size, input_size))
        self.hp = hp

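    # Linear policy: action = theta.dot(state); "+"/"-" evaluate theta perturbed by +/- noise*delta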
    def evaluate(self, input, delta = None, direction = None):
        if direction is None:
            return self.theta.dot(input)
        elif direction == "+":
            return (self.theta + self.hp.noise * delta).dot(input)
        elif direction == "-":
            return (self.theta - self.hp.noise * delta).dot(input)

    def sample_deltas(self):
        return [np.random.randn(*self.theta.shape) for _ in range(self.hp.num_deltas)]

    def update(self, rollouts, sigma_rewards):
        # sigma_rewards is the standard deviation of the rewards
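        # ARS update: theta += lr / (num_best_deltas * sigma_rewards) * sum_k (r_k+ - r_k-) * delta_k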
        step = np.zeros(self.theta.shape)
        for r_pos, r_neg, delta in rollouts:
            step += (r_pos - r_neg) * delta
        self.theta += self.hp.learning_rate / (self.hp.num_best_deltas * sigma_rewards) * step


class ArsTrainer():
    def __init__(self,
                 hp=None,
                 input_size=None,
                 output_size=None,
                 normalizer=None,
                 policy=None,
                 monitor_dir=None):

        self.hp = hp or Hp()
        np.random.seed(self.hp.seed)
        self.env = gym.make(self.hp.env_name, render_mode = "rgb_array")
        if monitor_dir is not None:
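            # the trigger is evaluated lazily per episode, so referencing self.record_video
            # before it is assigned below is safe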
            should_record = lambda i: self.record_video
            self.env = RecordVideo(self.env, monitor_dir, episode_trigger=should_record)
        self.hp.episode_length = self.env.spec.max_episode_steps or self.hp.episode_length
        self.input_size = input_size or self.env.observation_space.shape[0] 
        self.output_size = output_size or self.env.action_space.shape[0]
        self.normalizer = normalizer or Normalizer(self.input_size)
        self.policy = policy or Policy(self.input_size, self.output_size, self.hp)
        self.record_video = False

    # Explore the policy on one specific direction and over one episode
    def explore(self, direction=None, delta=None):
        state = self.env.reset()[0]
        done = False
        num_plays = 0.0
        sum_rewards = 0.0
        while not done and num_plays < self.hp.episode_length:
            self.normalizer.observe(state)
            state = self.normalizer.normalize(state)
            action = self.policy.evaluate(state, delta, direction)
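            # gym >= 0.26 API: step() returns (obs, reward, terminated, truncated, info)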
            state, reward, done, _, _ = self.env.step(action)
            reward = max(min(reward, 1), -1)
            sum_rewards += reward
            num_plays += 1
        return sum_rewards

    def train(self):
        for step in range(self.hp.nb_steps):
            # initialize the random noise deltas and the positive/negative rewards
            deltas = self.policy.sample_deltas()
            positive_rewards = [0] * self.hp.num_deltas
            negative_rewards = [0] * self.hp.num_deltas

            # play an episode each with positive deltas and negative deltas, collect rewards
            for k in range(self.hp.num_deltas):
                positive_rewards[k] = self.explore(direction="+", delta=deltas[k])
                negative_rewards[k] = self.explore(direction="-", delta=deltas[k])
                
            # Compute the standard deviation of all rewards
            sigma_rewards = np.array(positive_rewards + negative_rewards).std()

            # Sort the rollouts by the max(r_pos, r_neg) and select the deltas with best rewards
            scores = {k: max(r_pos, r_neg) for k, (r_pos, r_neg) in enumerate(zip(positive_rewards, negative_rewards))}
            order = sorted(scores.keys(), key=lambda x: scores[x], reverse=True)[:self.hp.num_best_deltas]
            rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]

            # Update the policy
            self.policy.update(rollouts, sigma_rewards)

            # Only record video during evaluation, every n steps
            if step % self.hp.record_every == 0:
                self.record_video = True
            # Play an episode with the new weights and print the score
            reward_evaluation = self.explore()
            print('Step: ', step, 'Reward: ', reward_evaluation)
            self.record_video = False


def mkdir(base, name):
    path = os.path.join(base, name)
    if not os.path.exists(path):
        os.makedirs(path)
    return path

# Main code
if __name__ == '__main__':
    videos_dir = mkdir('.', 'videos')
    monitor_dir = mkdir(videos_dir, ENV_NAME)
    hp = Hp(seed=1946, env_name=ENV_NAME)
    trainer = ArsTrainer(hp=hp, monitor_dir=monitor_dir)
    trainer.train()

I am running this code in a Xubuntu (Linux) virtual machine. After around 300 steps the process gets killed and the terminal just prints "Killed". When I check the syslog, it shows the following:

Jan 22 16:18:46 snigdh-VirtualBox kernel: [24929.974828] Out of memory: Killed process 58156 (python3) total-vm:2734864kB, anon-rss:2325768kB, file-rss:0kB, shmem-rss:4kB, UID:1000 pgtables:5000kB oom_score_adj:0
Jan 22 16:18:46 snigdh-VirtualBox systemd[1]: session-c1.scope: A process of this unit has been killed by the OOM killer.

I tried printing the resident set size on each step with

print(psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2)

but it never shows a value greater than about 900 MB, even though the kernel log above reports roughly 2.2 GB resident (anon-rss) at the time of the kill.
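In full, the per-step check looked roughly like this (the exact placement inside train() is approximate; only the print line is quoted above):

import psutil

# at the end of each iteration of ArsTrainer.train()
rss_mb = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2
print('Step:', step, 'RSS MB:', round(rss_mb, 1))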
I also tried using

self.mean = np.zeros(nb_inputs, dtype=np.float16)

instead of

self.mean = np.zeros(nb_inputs)

but then the reward stops increasing (probably because float16 is too low-precision for the running statistics, so the model stops learning properly). What should I do?
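For context, here is that experiment in Normalizer.__init__; only the self.mean line is quoted above, so extending the dtype to the other statistics arrays is my assumption:

class Normalizer():
    # attempted low-memory variant: running statistics in half precision
    def __init__(self, nb_inputs):
        self.n = np.zeros(nb_inputs)
        self.mean = np.zeros(nb_inputs, dtype=np.float16)       # the change quoted above
        self.mean_diff = np.zeros(nb_inputs, dtype=np.float16)  # analogous change (my assumption)
        self.var = np.zeros(nb_inputs, dtype=np.float16)        # analogous change (my assumption)

(np.float32 would be a middle ground: half the footprint of the default float64 with far less precision loss than float16, though I have not tested whether it avoids the OOM.)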
