Binary file added .DS_Store
Binary file not shown.
31 changes: 31 additions & 0 deletions README.MD
@@ -0,0 +1,31 @@
# Final Project

## Description
This codebase is for my final project in CS 138. The project explores the Policy Space Response Oracle (PSRO) framework in Normative Markov Games.
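
For context, the PSRO outer loop this project builds on looks roughly like the sketch below. `train_best_response`, `evaluate`, and `initial_policies` stand in for project-specific pieces, and the zero-sum meta-game passed to Nashpy is only an assumption made for this illustration.

```python
# Rough schematic of the PSRO outer loop (illustration only, not this repo's
# actual code). `train_best_response` and `evaluate` are hypothetical helpers;
# the zero-sum meta-game is assumed just for the sketch.
import numpy as np
import nashpy as nash

def psro(train_best_response, evaluate, initial_policies, generations=5):
    """Grow each player's policy population by one best response per generation."""
    populations = [list(initial_policies[0]), list(initial_policies[1])]
    meta_strategies = None
    for _ in range(generations):
        # Empirical payoff matrix for player 0 over the current populations.
        payoffs = np.array([[evaluate(p0, p1) for p1 in populations[1]]
                            for p0 in populations[0]])
        # Solve the restricted meta-game for a mixed equilibrium.
        meta_game = nash.Game(payoffs, -payoffs)
        meta_strategies = next(meta_game.support_enumeration())
        # Train approximate best responses against the opponent's meta-strategy,
        # then add them to the populations.
        br0 = train_best_response(player=0, opponents=populations[1],
                                  opponent_mix=meta_strategies[1])
        br1 = train_best_response(player=1, opponents=populations[0],
                                  opponent_mix=meta_strategies[0])
        populations[0].append(br0)
        populations[1].append(br1)
    return populations, meta_strategies
```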

## Versions
- **Python:** 3.10.9
- **NumPy:** 1.23.5
- **Matplotlib:** 3.7.0
- **SciPy:** 3.7.0
- **Nashpy:** 0.0.41
- **TensorFlow:** 2.15.0
- **scikit-learn:** 1.3.2
- **PyTorch:** 2.3.0
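
To compare the installed versions against this list, a quick printout can be run:

```python
# Print installed package versions to compare against the list above.
from importlib.metadata import version

for pkg in ["numpy", "matplotlib", "scipy", "nashpy",
            "tensorflow", "scikit-learn", "torch"]:
    print(pkg, version(pkg))
```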

## Instructions for Setting Up MA-Gym
Follow the instructions [here](https://github.com/koulanurag/ma-gym) to set up the necessary packages for MA-Gym:
- Focus only on the "Setup (important)" section.
- After setup, navigate to the ma-gym directory and execute `pip install -e .`. This command installs all dependencies in the environment. Note: I made significant modifications to the MA-Gym codebase, which is why it is included in my submission. A quick smoke test is sketched right after this list.
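
The following smoke test assumes the stock `Switch2-v0` environment; names and behavior may differ in the modified MA-Gym codebase included here.

```python
# Quick post-install smoke test (assumes the stock Switch2-v0 environment;
# names may differ in the modified MA-Gym included with this submission).
import gym
import ma_gym  # noqa: F401 -- importing registers the multi-agent environments

env = gym.make("Switch2-v0")
obs_n = env.reset()                                   # one observation per agent
obs_n, reward_n, done_n, info = env.step(env.action_space.sample())
env.close()
```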

## Instructions to Use
Run the experiments notebook to execute all of the experiments.

### Notes on Running
Key hyperparameters are defined at the top of `main.py`; please adjust these as necessary. They are currently set to the configuration I used for my experiments, which took approximately 4 hours in total to run. A sketch of this block appears after the list below.

#### Hyperparameters
- **episodes:** Controls the number of episodes in the training process.
- **generations:** Specifies the number of generations in the PSRO simulation.
- **max_steps:** Determines the maximum number of steps allowed after training the agent in the environment.
- **runs:** Controls the number of simulation runs completed post-training.
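
For orientation, the block at the top of `main.py` looks roughly like this; the names mirror the list above, but the values shown are placeholders rather than the settings used for the reported experiments.

```python
# Placeholder values only -- set these to match your own experiment budget.
episodes = 100      # number of episodes in the training process
generations = 5     # number of generations in the PSRO simulation
max_steps = 50      # cap on environment steps after training the agent
runs = 10           # number of post-training simulation runs
```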
Binary file added RL_Final_Project.pdf
Binary file not shown.
Binary file modified __pycache__/dqn.cpython-311.pyc
Binary file not shown.
Binary file modified __pycache__/main.cpython-311.pyc
Binary file not shown.
Binary file modified __pycache__/ppo.cpython-311.pyc
Binary file not shown.
12 changes: 4 additions & 8 deletions dqn.py
@@ -5,7 +5,9 @@
import random
import matplotlib.pyplot as plt


'''
Deep Q-Learning Network (DQN) Agent
'''
class DQNAgent:
def __init__(self, state_size, action_size=2):
self.state_size = state_size
@@ -19,7 +21,7 @@ def __init__(self, state_size, action_size=2):
self.model = self._create()
self.model_target = self._create()
self.losses = []
self.history = None #array of tuples of actions and rewards
self.history = None
self.action = None

def _create(self):
@@ -55,22 +57,16 @@ def replay(self, batch_size=5):
state = np.array(state, dtype=np.float32).reshape(1, self.state_size)
next_state = np.array(next_state, dtype=np.float32).reshape(1, self.state_size)

# Calculate the target value for the current state
if not done:
# Calculate the future discounted reward
target = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
else:
# If the state is terminal, then the reward is the final target
target = reward

# Fetch the current predictions for the state and update the predictions with the target
target_f = self.model.predict(state)
target_f[0][action[0]] = target

# Perform a single update on the model with the target values
history = self.model.fit(state, target_f, epochs=1, verbose=0)
self.losses.append(history.history["loss"][0]) #
# Decay epsilon after each replay step
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
