@JesseMalnik / Neural Net 3 (Python)

main.py
import tensorflow as tf
import gym
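
# NOTE: the constants below are referenced later in this file but are not defined
# in this snippet. The values here are assumed placeholders (typical for a small
# continuous-control task such as Pendulum); replace them with the values used in
# the full script.
S_DIM = 3       # assumed state (observation) dimension
A_DIM = 1       # assumed action dimension
C_LR = 0.0002   # assumed critic learning rate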

class PPO(object):

  # init: build the TF graph (critic, actor, sampling/update ops, PPO loss)
  def __init__(self):
    self.sess = tf.Session()
    self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')  # batch of states

    # critic (value network)
    with tf.variable_scope('critic'):
      l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
      # state-value estimate
      self.v = tf.layers.dense(l1, 1)
      # placeholder for the discounted reward (return)
      self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
      # advantage = discounted return minus predicted value
      self.advantage = self.tfdc_r - self.v
      self.closs = tf.reduce_mean(tf.square(self.advantage))
      self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)


    # actor (policy network): current policy pi and a frozen copy oldpi
    pi, pi_params = self.__build_anet('pi', trainable=True)
    oldpi, oldpi_params = self.__build_anet('oldpi', trainable=False)

    # op to sample an action from pi, and ops to copy pi's weights into oldpi
    with tf.variable_scope('sample_action'):
      self.sample_op = tf.squeeze(pi.sample(1), axis=0)  # choose action
    with tf.variable_scope('update_oldpi'):
      self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]

      # placeholders for the taken action and its estimated advantage
      self.tfa = tf.placeholder(tf.float32, [None, A_DIM], 'action')
      self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')

      # PPO loss function
      with tf.variable_scope('loss'):
        with tf.variable_scope('surrogate'):
          # probability ratio of the taken action under pi vs. oldpi
          ratio = pi.prob(self.tfa) / oldpi.prob(self.tfa)
          # surrogate objective: ratio weighted by the advantage
          surr = ratio * self.tfadv

        # KL penalty