Skip to content

Commit

Permalink
upd
Browse files Browse the repository at this point in the history
  • Loading branch information
Ilija Vukotic committed Apr 30, 2020
1 parent 9c60a98 commit 9027048
Show file tree
Hide file tree
Showing 5 changed files with 196 additions and 22 deletions.
20 changes: 16 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,31 @@ install it:
python -m pip install --user -e .
</code>

then import it like this:
import it like this:

<code>
import gym
gym.make('gym_cache:Cache-v0')
</code>

There are two discrete action environments (*Cache-v0* and *Cache-large-v0*) and one continuous action environment (*Cache-continuous-v0*).


observation space has following variables:
* six tokens (integers)
* size \[kB\]
* file size \[kB\]
* how full is the cache at that moment

There are two discrete action environments (*Cache-v0* and *Cache-large-v0*) and one continuous action environment (*Cache-continuous-v0*).



## Data extractions and preprocessing
This is a two step procedure:
* *extract raw data* _data/extract_data.py_ - change PQ, date range
* *process raw data* _data/process_data.py_ - tokenizes filenames, generates unique fileIDs, sorts by access time.

Processed data should be copied to the directory where actor runs.
It is an hdf5 file with one dataframe:
* index - access time (sorted)
* six tokens derived from the filename ('1', '2', '3', '4', '5', '6')
* filesize ('kB')
* unique file identifier ('fID')
17 changes: 1 addition & 16 deletions TODO.md
Original file line number Diff line number Diff line change
@@ -1,18 +1,3 @@
reset should return first access (first state)
add visualization on cache hit rate.
add actor cleaning step:
* move from hwm/lwm cleanup to cleanup for each file and have hwm:98%
* ask actor for decision to remove or not. List files in order of LRU. Signal to not learn is given by listing file size as negative value.
* ask actor for decision to remove or not. List files in order of LRU. Signal to not learn is given in comment dict.

# input generation
values:
1. filesize
2. scope index
3. dataset index
4. filename index
5. 10 tokens
* remove scope tokens from dataset and filename tokens
* remove dataset tokens from filename tokens


WHY IS INSPECTOR SO F... SLOW?
2 changes: 1 addition & 1 deletion gym_cache/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
entry_point='gym_cache.envs:CacheContinousEnv',
kwargs={
'InputData': 'data/ANALY_MWT2_UCORE_processed',
'CacheSize': 1024 * 1024 * 1024
'CacheSize': 100 * 1024 * 1024 * 1024
},
# reward_threshold=1.0,
max_episode_steps=20000000,
Expand Down
4 changes: 3 additions & 1 deletion gym_cache/envs/cache_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ class CacheEnv(gym.Env):

def __init__(self, InputData, CacheSize):

self.name = '100TB_LRU'
# self.name = '100TB_LRU'
self.name = '100TB_DDQN'
# self.name = 'InfiniteCache_DDQN'

self.accesses_filename = InputData + '.h5'

Expand Down
175 changes: 175 additions & 0 deletions gym_cache/envs/cache_env_full.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
import gym
from gym import spaces
# from gym import error, utils
from gym.utils import seeding
import pandas as pd
import numpy as np

import logging
logger = logging.getLogger(__name__)


class CacheEnv(gym.Env):
    """Gym environment simulating a file cache with agent-driven eviction.

    Each step presents one file access from a pre-processed access log.
    The observation is 8 integers: six filename-derived tokens, the file
    size in kB, and the cache fill percentage (0-100). The discrete
    action (0/1) is the agent's guess whether the file is in the cache /
    should be kept; the reward is the file size, negated when the guess
    disagrees with the actual cache state. When the cache passes its
    high water mark the environment switches into a cleanup mode where
    files are presented in LRU order and action 0 evicts them, until the
    low water mark is reached.
    """

    metadata = {'render.modes': ['human']}
    actions_num = 1  # best guess if the file is in the cache/should be kept in cache

    def __init__(self, InputData, CacheSize):
        """Load the access log and initialize cache bookkeeping.

        Parameters
        ----------
        InputData : str
            Path prefix of the processed access log; ``InputData + '.h5'``
            must be an HDF5 file containing a 'data' dataframe.
        CacheSize : int
            Cache capacity in kB.
        """
        # self.name = '100TB_LRU'
        self.name = '100TB_DDQN'
        # self.name = 'InfiniteCache_DDQN'

        self.accesses_filename = InputData + '.h5'

        self.load_access_data()
        self.seed()  # probably not needed

        self.cache_value_weight = 1.0  # applies only on files already in cache

        self.cache_size = CacheSize
        self.cache_hwm = .95 * self.cache_size  # high water mark: cleanup starts above this
        self.cache_lwm = .90 * self.cache_size  # low water mark: cleanup stops below this
        self.cache_kbytes = 0
        self.cache_content = {}  # fID -> (last access index, file size in kB)
        self.files_processed = 0
        self.data_processed = 0

        self.cleanup = False  # True while the environment is in eviction (cleanup) mode
        self.clean_list = None  # LRU-ordered eviction candidates during cleanup
        self.clean_counter = 0  # position in clean_list

        self.monitoring = []  # per-step [kB, cache size, cache hit, reward] records

        self.weight = 0  # reward magnitude for the current file, delivered in next cycle
        self.found_in_cache = False  # cache-hit flag from previous cycle

        self.viewer = None

        maxes = self.accesses.max()
        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(
            # first 6 are tokens, 7th is filesize, 8th is how full is cache at the moment
            low=np.array([0, 0, 0, 0, 0, 0, 0, 0]),
            high=np.array([maxes[0], maxes[1], maxes[2], maxes[3], maxes[4], maxes[5], maxes[6], 100]),
            dtype=np.int32
        )
        print('environment loaded! cache size [kB]:', self.cache_size)

    def load_access_data(self):
        """Load the processed access log into ``self.accesses``."""
        # last variable is the fileID.
        with pd.HDFStore(self.accesses_filename) as hdf:
            # print("keys in file:", self.accesses_filename, ':', hdf.keys())
            self.accesses = hdf.select('data')
        print("accesses loaded:", self.accesses.shape[0])

    def save_monitoring_data(self):
        """Persist the per-step monitoring records to results/<name>.h5."""
        mdata = pd.DataFrame(self.monitoring, columns=['kB', 'cache size', 'cache hit', 'reward'])
        mdata.to_hdf('results/' + self.name + '.h5', key=self.name, mode='w', complevel=1)

    def seed(self, seed=None):
        """Seed the environment RNG; returns the list of seeds used."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _cache_cleanup(self, action):
        """One eviction step: apply *action* to the previously presented
        candidate, then present the next LRU candidate.

        Returns the usual (state, reward, done, info) tuple with
        ``info['cleanup'] == True``; reward is always 0 during cleanup.
        """
        # NOTE(review): on the very first cleanup step *action* was chosen
        # for the last regular file, yet it is applied to clean_list[0]
        # here — looks like an off-by-one in the protocol; confirm intent.
        if action == 0:
            print('cleaning previous file')
            row = self.clean_list.iloc[self.clean_counter, :]
            self.cache_kbytes -= row['fs']
            del self.cache_content[row.name]

        self.clean_counter += 1
        if self.clean_counter == self.clean_list.shape[0]:
            print('failed to cleanup enough data.')
            # BUG FIX: np.array() with no argument raises TypeError;
            # return an empty observation for the terminal state instead.
            return np.array([]), 0, True, {'cleanup': True}

        row = self.clean_list.iloc[self.clean_counter, :]
        # BUG FIX: clean_list rows carry only ('accNo', 'fs'); the token
        # columns live in the access log, so look them up by access number.
        acc = self.accesses.iloc[int(row['accNo']), :]
        state = [acc['1'], acc['2'], acc['3'], acc['4'], acc['5'], acc['6'],
                 row['fs'], self.cache_kbytes * 100 // self.cache_size]

        # check if cleaning needs to be stopped.
        # The last file will still be asked about.
        if self.cache_kbytes < self.cache_lwm:
            del self.clean_list
            self.cleanup = False

        return np.array(state), 0, False, {'cleanup': True}

    def _init_cleanup(self):
        """Enter cleanup mode: order cached files by last access (LRU first)."""
        self.cleanup = True
        # order files by access instance
        self.clean_list = pd.DataFrame.from_dict(
            self.cache_content, orient='index', columns=['accNo', 'fs']
        ).sort_values(by='accNo', axis=0)
        self.clean_counter = 0

    def step(self, action):
        """Process one access (or one eviction decision in cleanup mode).

        Parameters
        ----------
        action : int
            0 or 1 — the agent's guess / eviction decision.

        Returns
        -------
        tuple
            (state ndarray, reward int, done bool, info dict with 'cleanup').
        """
        # check if we are in regular running or cleanup mode
        if self.cleanup:
            return self._cache_cleanup(action)

        if self.cache_kbytes > self.cache_hwm:
            # print('cache cleanup on access:', self.files_processed, 'cache filled:', self.cache_kbytes)
            self._init_cleanup()

        # reward for the PREVIOUS file: its weight, negated when the
        # agent's guess disagreed with whether it actually was cached
        reward = self.weight
        if (self.found_in_cache and action == 0) or (not self.found_in_cache and action == 1):
            reward = -reward

        row = self.accesses.iloc[self.files_processed, :]
        fID = row['fID']
        fs = row['kB']
        # print(row['1'], row['2'], row['3'], row['4'], row['5'], row['6'], row['kB'], row['fID'])

        self.found_in_cache = fID in self.cache_content
        # print('found in cache', self.found_in_cache, fID, self.cache_content)
        if self.found_in_cache:
            # print('cache hit - 5%')
            self.weight = fs * self.cache_value_weight
        else:
            # print('cache miss - 100%')
            self.weight = fs
            self.cache_kbytes += fs

        # record/refresh the access instance so cleanup sees LRU order
        self.cache_content[fID] = (self.files_processed, fs)

        self.monitoring.append([fs, self.cache_kbytes, self.found_in_cache, int(reward)])

        self.files_processed += 1
        self.data_processed += fs

        state = [row['1'], row['2'], row['3'], row['4'], row['5'], row['6'],
                 fs, self.cache_kbytes * 100 // self.cache_size]

        return np.array(state), int(reward), False, {'cleanup': False}

    def reset(self):
        """Reset the episode and return the first observation."""
        self.files_processed = 0
        self.cache_content = {}
        self.cache_kbytes = 0
        self.monitoring = []
        # BUG FIX: also clear state that could leak from a previous
        # episode (an episode may end mid-cleanup or mid-reward cycle).
        self.data_processed = 0
        self.cleanup = False
        self.clean_list = None
        self.clean_counter = 0
        self.weight = 0
        self.found_in_cache = False

        return self.step(0)[0]

    def render(self, mode='human'):
        """Rendering is not implemented; kept for the gym.Env interface."""
        # screen_width = 600
        # screen_height = 400
        # if self.viewer is None:  # creation of entities.
        #     from gym.envs.classic_control import rendering
        #     self.viewer = rendering.Viewer(screen_width, screen_height)
        #     l, r, t, b = -20 / 2, 20 / 2, 40 / 2, -40 / 2
        #     cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
        #     self.carttrans = rendering.Transform()
        #     cart.add_attr(self.carttrans)
        #     self.viewer.add_geom(cart)
        # return self.viewer.render(return_rgb_array=mode == 'rgb_array')
        return

    def close(self):
        """Flush monitoring data to disk and release the viewer, if any."""
        self.save_monitoring_data()
        if self.viewer:
            self.viewer.close()
            self.viewer = None

0 comments on commit 9027048

Please sign in to comment.