
#
# This file is part of the supplementary material for the paper
#               "Limits of End-to-End Learning"
# submitted to ACML'2017, written by the anonymous authors.
#
# Anyone is free to copy, modify, publish, use, compile, sell, or
# distribute this software, either in source code form or as a compiled
# binary, for any purpose, commercial or non-commercial, and by any
# means.
#
# In jurisdictions that recognize copyright laws, the author or authors
# of this software dedicate any and all copyright interest in the
# software to the public domain. We make this dedication for the benefit
# of the public at large and to the detriment of our heirs and
# successors. We intend this dedication to be an overt act of
# relinquishment in perpetuity of all present and future rights to this
# software under copyright law.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
# For more information, please refer to <http://unlicense.org/>
#


from sys import argv
import math
import numpy as np
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Activation, Flatten, Reshape, Lambda, add, concatenate, Convolution2D, MaxPooling2D, Dropout
from keras.optimizers import SGD
from keras.constraints import Constraint
from keras.callbacks import Callback, History, EarlyStopping, ModelCheckpoint
from keras import backend as K

# directions: unit steps for the four headings, indexed by heading number.
# The board is addressed as board[y][x] with y growing downwards, so
# heading 0 points north, 1 east, 2 south and 3 west.
dy = [-1, 0, 1, 0]
dx = [0, 1, 0, -1]

# create the RoboRally task
def createTask():
	"""Build the tabular RoboRally MDP.

	A state encodes a board cell and a heading as ``4 * cell + heading``,
	where ``cell`` indexes ``pos`` and ``heading`` indexes ``dx``/``dy``.
	Actions: 0 = move forward, 1 = turn left, 2 = turn right,
	3 = do nothing (penalized).

	Returns the tuple ``(board, P, R, T, S, s0)``: the board rows, the
	transition array ``P[s, a, s']``, the reward array ``R[s, a]``, the
	action sequence length, the number of states and the initial state.
	"""
	# robo rally board, addressed as board[y][x]
	# ground: ' '
	# wall: #
	# abyss: x
	# transport north: |
	# laser: l
	# goal: g
	board = [
		'#######',
		'#######',
		'# x####',
		'#  g###',
		'#l|####',
		'# |####',
		'#######',
	]

	T = 5      # action sequence length
	S = 32     # number of states, but only 24 of them are reachable
	s0 = 24    # initial state

	# cell number -> board position (x, y)
	pos = [
		(1, 2),
		(1, 3),
		(2, 3),
		(3, 3),   # goal
		(1, 4),
		(2, 4),
		(1, 5),   # start
		(2, 2),   # pit
	]

	# board position (x, y) -> cell number; only the cells listed in pos
	# can ever be occupied after a step
	index = {cell: number for number, cell in enumerate(pos)}

	# tiles on which the robot no longer reacts to actions
	terminal_tiles = ('x', 'g')

	# reward collected for ending a step on a tile:
	# abyss (terminal), laser, goal (terminal); every other tile is neutral
	tile_reward = {'x': -10, 'l': -1, 'g': 10}

	P = np.zeros((S, 4, S))   # transition matrix, indexed by [s, a, s']
	R = np.zeros((S, 4))      # reward function, indexed by [s, a]
	for state in range(S):
		cell, start_heading = divmod(state, 4)
		start_x, start_y = pos[cell]
		frozen = board[start_y][start_x] in terminal_tiles

		for action in range(4):
			x, y, heading = start_x, start_y, start_heading

			# execute action
			if not frozen:
				if action == 0:
					# move forward unless a wall blocks the way
					ahead_x = x + dx[heading]
					ahead_y = y + dy[heading]
					if board[ahead_y][ahead_x] != '#':
						x, y = ahead_x, ahead_y
				elif action == 1:
					# turn left
					heading = (heading + 3) % 4
				elif action == 2:
					# turn right
					heading = (heading + 1) % 4

			# transport: a conveyor tile pushes the robot one cell north
			tile = board[y][x]
			if tile == '|':
				y -= 1
				tile = board[y][x]

			# subsequent state
			successor = 4 * index[(x, y)] + heading

			# reward: tile effect, plus a penalty for inactivity
			reward = tile_reward.get(tile, 0)
			if action == 3:
				reward -= 1

			P[state, action, successor] = 1.0
			R[state, action] = reward
	return board, P, R, T, S, s0


# Instantiate the task once; the functions and the experiment loop below
# read these module-level values.
# board:               list of strings (lines), addressed as board[y][x]
# transition matrix P: array of shape (S, 4, S); P[s, a] is the (deterministic) successor state distribution
# reward function R:   array of shape (S, 4); R[s, a] is the reward
# time horizon T:      number of actions to take
# state space S:       number of states
# s0:                  initial state
board, P, R, T, S, s0 = createTask()


# helpers
def onehot(a, n):
	"""Return a length-n float vector that is 1.0 at index a and 0.0 elsewhere."""
	indicator = np.zeros(n)
	indicator[a] = 1.0
	return indicator

def softmax(a):
	"""Return the softmax of a, normalized over all of its entries.

	The maximum is subtracted before exponentiating. This leaves the
	result unchanged mathematically but keeps np.exp from overflowing
	(and the result from turning into NaN) for large inputs.
	An empty input yields an empty array.
	"""
	a = np.asarray(a)
	if a.size == 0:
		# nothing to normalize; np.max would raise on an empty array
		return np.exp(a)
	ret = np.exp(a - np.max(a))
	ret /= np.sum(ret)
	return ret


# Execute the "program" by propagating the distributions through P and R.
# The result captures the expectation over all execution paths.
def executeDistribution(commands):
	"""Roll the action distributions forward through the MDP for T steps.

	commands is indexed as commands[t, :]; row t holds the distribution
	over the four actions at step t.

	Returns a list of four arrays with one row per step: the state
	distribution before the step (T x S), the action distribution (T x 4),
	the successor state distribution (T x S) and the expected reward (T x 1).
	"""
	states = np.zeros((T, S))
	actions = np.zeros((T, 4))
	successors = np.zeros((T, S))
	rewards = np.zeros((T, 1))

	# start from the (deterministic) initial state
	belief = onehot(s0, S)

	# forward model, T times
	for step in range(T):
		action_dist = commands[step, :]
		# contract the state axis first, then the action axis
		next_belief = P.T.dot(belief).dot(action_dist)
		expected_reward = R.T.dot(belief).dot(action_dist)

		states[step, :] = belief
		actions[step, :] = action_dist
		successors[step, :] = next_belief
		rewards[step, 0] = expected_reward

		belief = next_belief

	return [states, actions, successors, rewards]

# Execute the "program" by sampling actions in each step.
# The result captures a single execution path.
def executeSample(commands):
	"""Sample one trajectory of T steps from the action distributions.

	commands is indexed as commands[t, :]; row t holds the distribution
	over the four actions at step t. Exactly one number is drawn from
	np.random.rand() per step.

	Returns a list of four arrays with one row per step: the state before
	the step (T x S), the one-hot sampled action (T x 4), the successor
	state (T x S) and the reward (T x 1).
	"""
	states = np.zeros((T, S))
	actions = np.zeros((T, 4))
	successors = np.zeros((T, S))
	rewards = np.zeros((T, 1))

	# start from the initial state
	current = onehot(s0, S)

	# forward model, T times
	for step in range(T):
		probs = commands[step, :]

		# inverse-CDF sampling: subtract probabilities from a uniform draw
		# until it is used up; falls through to the last action otherwise
		remaining = np.random.rand()
		for chosen in range(4):
			remaining -= probs[chosen]
			if remaining <= 0:
				break
		action = onehot(chosen, 4)

		following = P.T.dot(current).dot(action)
		reward = R.T.dot(current).dot(action)

		states[step, :] = current
		actions[step, :] = action
		successors[step, :] = following
		rewards[step, 0] = reward

		current = following

	return [states, actions, successors, rewards]


# minimize negative reward
def reward_loss(y_true, y_pred):
	"""Loss that is the negated prediction; y_true is ignored."""
	negated = -y_pred
	return negated

# output shape of the "sa" layer
def output_shape_sa(input_shape):
	"""Map a rank-2 input shape to the shape of the joint state/action layer.

	The first dimension is kept and the last one becomes 4 * S.
	"""
	dims = tuple(input_shape)
	assert len(dims) == 2
	return (dims[0], 4 * S)

# probability simplex constraint for a dense layer:
# weights are non-negative, and the weights leaving each input node
# (one row of the kernel) sum to one
class ForceProbability(Constraint):
	"""Project every row of a 2-D weight matrix onto the probability simplex.

	Negative entries are clipped to zero, then each row is divided by its
	sum. Only backend-agnostic Keras functions are used, so the constraint
	does not rely on Theano-specific tensor methods.
	"""
	def __call__(self, p):
		# clip negative weights to zero
		p = p * K.cast(K.greater_equal(p, 0.), K.floatx())
		# normalize each row; the lower bound only matters when a whole row
		# was clipped to zero, where it avoids a 0/0 = NaN
		row_sum = K.sum(p, axis=1, keepdims=True)
		return p / K.maximum(row_sum, K.epsilon())

# force weights to zero, e.g., disable the bias
class ForceZero(Constraint):
	"""Constraint that replaces the weights by zeros of the same shape."""
	def __call__(self, w):
		zeroed = K.zeros_like(w)
		return zeroed

# custom function creating a dense layer, possibly with pre-defined weight matrices
def createDense(name, nIn, nOut, activation, weights = 'default'):
	"""Create a Dense layer with nIn inputs and nOut outputs.

	weights selects the initialization:
	  'default'      leave the initialization to Dense
	  'zero'         all-zero kernel and bias
	  'probability'  constrain the kernel with ForceProbability and the
	                 bias with ForceZero
	  anything else  is passed on as the layer's initial weights
	"""
	# arguments shared by all variants
	options = {'activation': activation, 'input_shape': (nIn,), 'name': name}

	if weights == 'default':
		return Dense(nOut, **options)

	if weights == 'zero':
		options['weights'] = [np.zeros((nIn, nOut)), np.zeros((nOut,))]
	elif weights == 'probability':
		options['kernel_constraint'] = ForceProbability()
		options['bias_constraint'] = ForceZero()
	else:
		options['weights'] = weights
	return Dense(nOut, **options)

# statistics over runs, updated at the end of every run of the main loop
runs_positive_reward = 0
runs_negative_reward = 0

# what to train?
#   pretrainMDP:  fit the MDP model on all (state, action) pairs before the interleaved training
#   trainMDP:     learn the transition/reward layers (otherwise they are filled from P and R and frozen)
#   trainActions: let the planner adapt the action layer
pretrainMDP = False
trainMDP = True
trainActions = True

# initialize uniformly or optimally?
# (False: all-zero action weights, True: hand-coded action weights)
initOptimalActions = False

# main loop running 100 independent experiments
#
# Each run builds three models that share the layers l0..l3:
#   policy:  time step -> action distribution (forward only)
#   mdp:     (state, action) -> (successor state, reward)
#   planner: the MDP unrolled T times, driven by the policy, output = summed reward
# The MDP model is trained on sampled episodes, the planner on the negated
# total reward, and the resulting policy is evaluated on the true P and R.
for run in range(100):
	print("")
	print("run " + str(run + 1))
	print("")
	# a different seed for NumPy's global generator in every run
	np.random.seed(42 + run)

	# create a network representing transition function and reward
	input_t = Input(shape=(T,))
	input_s = Input(shape=(S,))
	input_a = Input(shape=(4,))
	eye = np.eye(T)   # row t is the one-hot encoding of time step t

	# layer l0 represents the action sequence
	# map one-hot encoded time to a distribution over actions
	if initOptimalActions:
		# large logits make the softmax (almost) deterministic:
		# turn right, then move forward three times
		w = np.zeros((T, 4))
		b = np.zeros((4,))
		w[0, 2] = 30
		w[1, 0] = 30
		w[2, 0] = 30
		w[3, 0] = 30
		# last step: anything but the penalized "do nothing" action
		w[4, 3] = -30
		l0 = createDense('l0', T, 4, 'softmax', [w, b])
	else:
		# all-zero logits, i.e. a uniform distribution over the actions
		l0 = createDense('l0', T, 4, 'softmax', 'zero')

	# policy model (only forward, no trainable parameters)
	# NOTE(review): the trainable flag is toggled around the compile calls;
	# presumably this relies on Keras fixing the set of trainable weights at
	# compile time - confirm for the Keras version in use
	l0.trainable = False
	policy = Model(inputs=(input_t), outputs=l0(input_t))
	policy.compile(loss=['categorical_crossentropy'], optimizer='adadelta')

	# layer l1 computes joint state+action probabilities
	# map distributions over s and a to the joint distribution:
	# the input is the concatenation [s, a]; the outer product of both parts
	# is flattened to length 4*S, matching the 4*s+a row layout used for the
	# weights of l2 and l3 below
	l1 = Lambda(
		lambda x: K.reshape(
			K.batch_dot(
				K.reshape(x[:,0:S], (x.shape[0],S,1)),
				K.reshape(x[:,S:S+4], (x.shape[0],4,1)),
				axes=2),
			(x.shape[0], 4*S)),
		output_shape=output_shape_sa)

	# layer l2 computes the successor state distribution
	# map the joint distribution over s and a to a distribution over s'
	if trainMDP:
		l2 = createDense('l2', 4*S, S, 'linear', 'probability')
	else:
		# use the true transition matrix and keep it fixed
		w = np.zeros((4*S, S))
		for s in range(S):
			for a in range(4):
				for s_prime in range(S):
					w[4*s+a, s_prime] = P[s, a, s_prime]
		b = np.zeros((S,))
		l2 = createDense('l2', 4*S, S, 'linear', [w, b])
		l2.trainable = False

	# layer l3 computes the reward
	# map the joint distribution over s and a to the expected reward
	if trainMDP:
		l3 = createDense('l3', 4*S, 1, 'linear', 'zero')
	else:
		# use the true reward function and keep it fixed
		w = np.zeros((4*S, 1))
		b = np.zeros((1,))
		for s in range(S):
			for a in range(4):
				w[4*s+a] = R[s,a]
		l3 = createDense('l3', 4*S, 1, 'linear', [w, b])
		l3.trainable = False

	# MDP model
	sa = l1(concatenate([input_s, input_a]))
	output_s_prime = l2(sa)
	output_r = l3(sa)
	mdp = Model(inputs=[input_s, input_a], outputs=[output_s_prime, output_r])
	mdp.compile(loss=['mean_squared_error', 'mean_squared_error'], optimizer=SGD(lr=0.5))

	# planner model: only the action layer may be trained here
	l0.trainable = trainActions
	l1.trainable = False
	l2.trainable = False
	l3.trainable = False
	s = Input(shape=(S,))        # state layer
	inputlayers = [s]
	inputdata = [onehot(s0, S).reshape(1, S)]   # initial state
	r = None   # running sum of the per-step rewards
	for t_ in range(T):
		# "time input"
		t = Input(shape=(T,))
		inputlayers.append(t)
		inputdata.append(eye[t_,:].reshape(1, T))
		a = l0(t)
		sa = l1(concatenate([s, a]))
		s = l2(sa)
		immediate_r = l3(sa)
		# identity test: r is a backend tensor after the first step, and
		# comparing a tensor with == is not a reliable None check
		if r is None:
			r = immediate_r
		else:
			r = add([r, immediate_r])
	planner = Model(inputs=inputlayers, outputs=r)
	planner.compile(loss=reward_loss, optimizer='adadelta')

	if pretrainMDP:
		print('MDP pre-training')
		assert T == 5
		# one training sample per (state, action) pair, labelled with the
		# true successor distribution and reward
		x0 = np.zeros((4*S, S))
		x1 = np.zeros((4*S, 4))
		x2 = np.zeros((4*S, S))
		x3 = np.zeros((4*S, 1))
		for s in range(S):
			for a in range(4):
				i = 4 * s + a
				x0[i] = onehot(s, S)
				x1[i] = onehot(a, 4)
				x2[i] = np.dot(np.dot(np.transpose(P), x0[i]), x1[i])
				x3[i] = np.dot(np.dot(np.transpose(R), x0[i]), x1[i])
		mdp.fit([x0, x1], [x2, x3], epochs=10000, batch_size=4*S, verbose=0)

	# interleaved training of MDP and planner
	for iteration in range(10000):
		# train the MDP model on one episode sampled from the current policy
		A = policy.predict(eye, batch_size=T)
		episode = executeSample(A)
		mdp.fit([episode[0], episode[1]], [episode[2], episode[3]], epochs=1, batch_size=T, verbose=0)

		# train the planner model (the target is ignored by reward_loss)
		planner.fit(inputdata, np.zeros((1,)), epochs=1, verbose=0)

	# report
	A = policy.predict(eye, batch_size=T)
	print('policy:')
	print(A)

	# evaluate the learned policy in expectation on the true task
	episode = executeDistribution(A)
	reward = np.sum(episode[3])
	print('total reward: ' + str(reward))

	# a total reward of exactly zero is counted as negative
	if reward <= 0:
		runs_negative_reward += 1
	else:
		runs_positive_reward += 1

# overall evaluation: how many runs ended with a non-positive / positive
# total reward (a reward of exactly zero is counted as negative)
negative_summary = '# runs with negative reward: ' + str(runs_negative_reward)
positive_summary = '# runs with positive reward: ' + str(runs_positive_reward)
print(negative_summary)
print(positive_summary)
