Commit 594fd937 authored by Sebastian Miller

Changed the reward to per-step negative absolute error. Added the continuous action space env and a corresponding experiment. Plugged reward logging into Neptune.ai.
parent de0f412c
@@ -65,8 +65,4 @@ pytest -s
## 5. Visualisation
todo: write about Neptune here
(Temporarily we are using tensorboard)
```
tensorboard --logdir tmp/ppo_tmp/
```
\ No newline at end of file
Experiment data is automatically logged to Neptune.ai.
\ No newline at end of file
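For reference, a minimal sketch of creating a Neptune run and logging a metric by hand (the workspace/project name and API token below are placeholders; the experiment code in this commit does the equivalent through NeptuneCallback):
```
import neptune.new as neptune

# Placeholder project and token -- substitute your own.
run = neptune.init(project='<workspace>/<project>', api_token='<API_TOKEN>')
run["train/episode_reward"].log(-0.5)  # appears as a chart in the Neptune UI
run.stop()
```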
@@ -6,6 +6,8 @@ import gym
from gym import spaces
import numpy as np
from abc import abstractmethod
# todo: maybe move this inside that class, together with _int_to_bool_array ??
# ans: imo no, because these are constants independent of the class, and
# _int_to_bool_array is a class-independent util as well
@@ -30,7 +32,7 @@ class EnsemblatorEnv(gym.Env):
Todo doc
"""
def __init__(self, data):
def __init__(self, data: np.ndarray):
"""
todo doc
:param data: todo
@@ -38,20 +40,17 @@ class EnsemblatorEnv(gym.Env):
super(EnsemblatorEnv, self).__init__()
self.current_episode_len = 0
self.current_episode_predictions = []
self.current_prediction = 0
self.current_state = []
self.current_reward = 0
self.metadata = {'render.modes': ['human']}
# Data from the .csv file
self.data_predictions = data[:, FIRST_PRED_COL: FIRST_PRED_COL + INPUT_SIZE]
self.data_y = data[:, Y_COL]
# Action
# Selection of the forecasters whose predictions are averaged.
# The selection corresponds to the binary representation
# of (action + 1).
self.action_space = spaces.Discrete(2 ** INPUT_SIZE - 1)
self.data_predictions: np.ndarray = data[:, FIRST_PRED_COL: FIRST_PRED_COL + INPUT_SIZE]
self.data_predictions = self.data_predictions.astype(float)
self.data_y: np.ndarray = data[:, Y_COL]
self.data_y = self.data_y.astype(float)
# State
# The predictions supplied by forecasters
@@ -60,7 +59,7 @@ class EnsemblatorEnv(gym.Env):
self.total_step_num = len(data)
self.current_base_step = 0
def _next_observation(self):
def _next_observation(self) -> np.ndarray:
"""
Extracts the current state from data.
:return: an array with current state (predictions)
@@ -69,15 +68,9 @@ class EnsemblatorEnv(gym.Env):
self.current_episode_len) % \
self.total_step_num]
@abstractmethod
def _take_action(self, action):
"""
Calculates the mean of selected predictions from the current
state and appends it to the data from the current episode.
:param action: a number determining the choice of forecasters
"""
self.current_episode_predictions.append(
np.mean(self.current_state[_int_to_bool_array(action + 1)])
)
raise NotImplementedError
def step(self, action):
"""
@@ -86,36 +79,72 @@ class EnsemblatorEnv(gym.Env):
# todo: I get a warning: Type 'object' doesn't have expected attribute '__add__'
self._take_action(action)
self.current_episode_len += 1
observed_so_far = self.data_y[self.current_base_step : \
self.current_base_step + \
self.current_episode_len]
# Negative AE from current step
self.current_reward = -abs(self.current_prediction - \
self.data_y[self.current_base_step + self.current_episode_len])
# Negative MAE from timesteps in this episode so far
# todo: I get a warning: Class 'list' does not define '__sub__', so the '-' operator cannot be used on its instances
# ans: you get that warning in PyCharm because it doesn't know that observed_so_far is already
# an np.ndarray (or at least it should be), so it does have the subtraction operator
reward = -np.mean(np.abs(self.current_episode_predictions - observed_so_far))
self.current_episode_len += 1
# Reset after EPISODE_LENGTH steps
done = self.current_episode_len == EPISODE_LENGTH
self.current_state = self._next_observation()
return self.current_state, reward, done, {}
return self.current_state, self.current_reward, done, {}
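As a quick sanity check of the new per-step reward (toy numbers, not tied to the dataset): a blended prediction of 11.5 against an observed value of 10.0 yields a reward of -|11.5 - 10.0| = -1.5 for that step, regardless of earlier steps in the episode:
```
# Toy illustration of the per-step negative absolute error reward.
current_prediction = 11.5
observed_y = 10.0
reward = -abs(current_prediction - observed_y)  # -1.5
```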
def reset(self):
self.current_base_step = (self.current_base_step + \
EPISODE_LENGTH) % self.total_step_num
self.current_episode_len = 0
self.current_episode_predictions = []
self.current_state = self._next_observation()
return self.current_state
def render(self, mode='human', close=False):
print(f'Step: {self.current_base_step + self.current_episode_len}')
#print(f'Step: {self.current_base_step + self.current_episode_len}')
pass
def scroll_to_beginning(self):
"""
Sets the 0-th timestep as current step ("resets time")
"""
self.current_base_step = 0
self.current_episode_len = 0
class EnsemblatorConEnv(EnsemblatorEnv):
def __init__(self, data: np.ndarray):
super().__init__(data)
# Action
# Weights assigned to the forecasters
self.action_space = spaces.Box(low=0.0, high=1.0, shape=(INPUT_SIZE,))
def _take_action(self, action):
"""
Produces a weighted mean of the predictions.
If all weights are equal to 0, the previous
prediction is kept as the current one.
"""
weight_sum = np.sum(action)
if weight_sum > 0:
self.current_prediction = np.sum((action / weight_sum) * self.current_state)
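A small numeric illustration of the weighted-mean action (three hypothetical forecasters and made-up weights; with an all-zero action the previous prediction is simply kept):
```
import numpy as np

# Hypothetical predictions from three forecasters and an action (weights).
current_state = np.array([10.0, 12.0, 14.0])
action = np.array([0.5, 0.0, 0.5])

weight_sum = np.sum(action)
if weight_sum > 0:
    # Normalised weights [0.5, 0.0, 0.5] give 0.5 * 10 + 0.5 * 14 = 12.0
    current_prediction = np.sum((action / weight_sum) * current_state)
```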
class EnsemblatorDiscEnv(EnsemblatorEnv):
def __init__(self, data: np.ndarray):
super().__init__(data)
# Action
# Selection of the forecasters whose predictions are averaged.
# The selection corresponds to the binary representation
# of (action + 1).
self.action_space = spaces.Discrete(2 ** INPUT_SIZE - 1)
def _take_action(self, action):
"""
Calculates the mean of the selected predictions from the current
state and stores it as the current prediction.
:param action: a number determining the choice of forecasters
"""
self.current_prediction = np.mean(self.current_state[_int_to_bool_array(action + 1)])
\ No newline at end of file
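The discrete variant relies on the _int_to_bool_array util, which is defined elsewhere in EnsemblatorEnv.py and is not shown in this diff. The sketch below is only an assumed shape of that helper (the real signature may differ): it maps action + 1 to a boolean mask over the INPUT_SIZE forecasters.
```
import numpy as np

# Assumed sketch of _int_to_bool_array (not part of this diff): the bits of n
# select which forecasters enter the mean, e.g. n = 0b101 selects 0 and 2.
def _int_to_bool_array(n: int, size: int = 4) -> np.ndarray:
    return np.array([(n >> i) & 1 for i in range(size)], dtype=bool)
```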
import numpy as np
from src.env.EnsemblatorEnv import EnsemblatorEnv
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import BaseCallback
import neptune.new as neptune
model = SAC("MlpPolicy", "Pendulum-v0", tensorboard_log="/tmp/sac/", verbose=1)
class NeptuneCallback(BaseCallback):
"""
Custom callback for plotting rewards in Neptune.
"""
def __init__(self, episode_len: int, env: EnsemblatorEnv, run: neptune.Run, verbose=0):
"""
:param episode_len: The length of an episode in the env.
:param env: The training environment.
"""
super(NeptuneCallback, self).__init__(verbose)
self.episode_len = episode_len
self.env = env
self.run = run
self.current_episode_reward_sum = 0
def _on_step(self) -> bool:
self.current_episode_reward_sum += self.env.current_reward
if self.env.current_episode_len == self.episode_len - 1:
episode_reward = self.current_episode_reward_sum / self.episode_len
self.run["train/episode_reward"].log(episode_reward)
self.current_episode_reward_sum = 0
return True
\ No newline at end of file
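Condensed usage sketch of the callback (data loading, project name, and API token are placeholders; the experiment file below wires it up the same way):
```
import pandas as pd
import neptune.new as neptune
from stable_baselines3 import PPO
from src.env.EnsemblatorEnv import EnsemblatorConEnv, EPISODE_LENGTH
from src.logging.NeptuneCallback import NeptuneCallback

data = pd.read_csv('predictions.csv').values  # placeholder path
run = neptune.init(project='<workspace>/<project>', api_token='<API_TOKEN>')

env = EnsemblatorConEnv(data)
model = PPO('MlpPolicy', env, verbose=1, n_steps=64)
model.learn(total_timesteps=10_000,
            callback=NeptuneCallback(EPISODE_LENGTH, env, run))
run.stop()
```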
from src.env.EnsemblatorEnv import INPUT_SIZE, EnsemblatorEnv, EPISODE_LENGTH
import neptune.new as neptune
from stable_baselines3 import PPO
import random
def run_agent(model: PPO, env: EnsemblatorEnv, run: neptune.Run, example_num: int, trained: bool):
obs = env.reset()
episode_reward_sum = 0
for i in range(example_num):
action, _states = model.predict(obs)
obs, rewards, done, info = env.step(action)
episode_reward_sum += rewards
if done:
mean_episode_reward = episode_reward_sum / EPISODE_LENGTH
if trained:
run["rewards/trained"].log(mean_episode_reward)
else:
run["rewards/random"].log(mean_episode_reward)
episode_reward_sum = 0
env.reset()
"""
def random_run(env: EnsemblatorEnv, run: neptune.Run, example_num: int):
obs = env.reset()
episode_reward_sum = 0
for i in range(example_num):
action = random.randint(1, 2**INPUT_SIZE - 1)
obs, rewards, done, info = env.step(action)
episode_reward_sum += rewards
if done:
mean_episode_reward = episode_reward_sum / EPISODE_LENGTH
episode_reward_sum = 0
env.reset()
"""
from src.models.dqn.DQN import DQN
from definitions import DQN_TMP_DIR
from stable_baselines3.common.logger import configure
#from src.models.dqn.DQN import DQN
#from definitions import DQN_TMP_DIR
#from stable_baselines3.common.logger import configure
def test_simple():
logger = configure(DQN_TMP_DIR, ["stdout", "csv", "tensorboard"])
dqn = DQN('predictions.csv', logger=logger)
dqn.learn()
dqn.check()
#def test_simple():
# logger = configure(DQN_TMP_DIR, ["stdout", "csv", "tensorboard"])
# dqn = DQN('predictions.csv', logger=logger)
# dqn.learn()
#dqn.check()
@@ -3,18 +3,17 @@ import os
import pandas as pd
import numpy as np
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import PPO
from src.env.EnsemblatorEnv import EnsemblatorEnv
from definitions import EXTERNAL_DATA_DIR, ROOT_DIR
from src.env.EnsemblatorEnv import EnsemblatorDiscEnv, EnsemblatorConEnv, EPISODE_LENGTH
from src.logging.NeptuneCallback import NeptuneCallback
from src.utils.experiment_utils import run_agent
from definitions import EXTERNAL_DATA_DIR
import neptune.new as neptune
from stable_baselines3.common.logger import configure
SPLIT_COLUMN = 8
def test_simple():
# todo: things like this should be moved to some utils module that parses the data and returns it nicely parsed
df = pd.read_csv(os.path.join(EXTERNAL_DATA_DIR, 'predictions.csv'))
@@ -22,21 +21,31 @@ def test_simple():
example_num = len(data)
training_examples = np.count_nonzero(data[:, SPLIT_COLUMN] == 'not_test')
tmp_path = os.path.join(ROOT_DIR, 'tmp/ppo_tmp')
new_logger = configure(tmp_path, ["stdout", "csv", "tensorboard"])
# Vectorized environment required by PPO2
env = DummyVecEnv([lambda: EnsemblatorEnv(data)])
for i in range(2):
run = neptune.init(
api_token='[[[REDACTED]]]',
project='sebastianmiller/ensemblator-test',
name='Discrete action space' if i == 0 else 'Continuous action space'
)
# Run on the discrete environment version first.
# Then on the continuous action space version.
if i == 0:
env = EnsemblatorDiscEnv(data)
else:
env = EnsemblatorConEnv(data)
model = PPO('MlpPolicy', env, verbose=1, n_steps=128, gamma=0.01)
run_agent(model, env, run, example_num, trained=False)
env.scroll_to_beginning()
model = PPO('MlpPolicy', env, verbose=1, n_steps=64)
# Train the model
model.learn(total_timesteps=3*training_examples,
callback=NeptuneCallback(EPISODE_LENGTH, env, run))
env.scroll_to_beginning()
# Train the model
model.set_logger(new_logger)
model.learn(total_timesteps=training_examples)
run_agent(model, env, run, example_num, trained=True)
obs = env.reset()
for i in range(example_num):
action, _states = model.predict(obs)
obs, rewards, done, info = env.step(action)
env.render()
print(rewards)
run.stop()