package aima.learning.reinforcement;

import aima.probability.decision.MDP;
import aima.probability.decision.MDPPerception;
import aima.probability.decision.MDPPolicy;
import aima.probability.decision.MDPUtilityFunction;
import aima.util.FrequencyCounter;

/**
 * A passive temporal-difference learning agent: it follows a fixed policy and
 * learns the utility of each state from the rewards observed along its trials,
 * using the TD update U(s) <- U(s) + alpha * (r + gamma * U(s') - U(s)).
 *
 * @author Ravi Mohan
 */
public class PassiveTDAgent<STATE_TYPE, ACTION_TYPE> extends
		MDPAgent<STATE_TYPE, ACTION_TYPE> {

	private MDPPolicy<STATE_TYPE, ACTION_TYPE> policy;

	private MDPUtilityFunction<STATE_TYPE> utilityFunction;

	private FrequencyCounter<STATE_TYPE> stateCount;

	private Double previousReward;

	public PassiveTDAgent(MDP<STATE_TYPE, ACTION_TYPE> mdp,
			MDPPolicy<STATE_TYPE, ACTION_TYPE> policy) {
		super(mdp.emptyMdp());
		this.policy = policy;
		this.utilityFunction = new MDPUtilityFunction<STATE_TYPE>();
		this.stateCount = new FrequencyCounter<STATE_TYPE>();
	}

	@Override
	public ACTION_TYPE decideAction(MDPPerception<STATE_TYPE> perception) {
		// If the perceived state is new, initialize its utility to the
		// observed reward and record that reward in the internal MDP model.
		if (!(utilityFunction.hasUtilityFor(perception.getState()))) {
			utilityFunction.setUtility(perception.getState(), perception
					.getReward());
			mdp.setReward(perception.getState(), perception.getReward());
		}
		// Apply the TD update for the transition previousState -> currentState.
		if (previousState != null) {
			stateCount.incrementFor(previousState);
			utilityFunction = updateUtilityFunction(1.0);
		}

		if (mdp.isTerminalState(currentState)) {
			previousState = null;
			previousAction = null;
			previousReward = null;
		} else {
			previousState = currentState;
			previousAction = policy.getAction(currentState);
			previousReward = currentReward;
		}
		return previousAction;
	}

	/**
	 * Returns a copy of the utility function with the TD update
	 * U(s) <- U(s) + alpha * (r + gamma * U(s') - U(s)) applied to the
	 * previous state, where the learning rate alpha is taken from the
	 * relative visit frequency of that state.
	 */
	private MDPUtilityFunction<STATE_TYPE> updateUtilityFunction(double gamma) {
		MDPUtilityFunction<STATE_TYPE> uf = utilityFunction.copy();
		double u_s = utilityFunction.getUtility(previousState);
		double gammaUtilDiff = ((gamma * utilityFunction
				.getUtility(currentState)) - utilityFunction
				.getUtility(previousState));
		double alphaTerm = stateCount.probabilityOf(previousState)
				* (previousReward + gammaUtilDiff);
		uf.setUtility(previousState, u_s + alphaTerm);
		return uf;
	}

	public MDPUtilityFunction<STATE_TYPE> getUtilityFunction() {
		return utilityFunction;
	}
}
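
/*
 * Usage sketch (illustrative, not part of the original class): one way a
 * PassiveTDAgent could be driven over repeated trials to estimate state
 * utilities for a fixed policy on a grid-world MDP. The helper names used
 * below (MDPFactory.createFourByThreeMDP, executeTrial, JavaRandomizer,
 * CellWorldPosition) are assumed from the surrounding aima codebase and may
 * differ in name or signature.
 *
 *   MDP<CellWorldPosition, String> mdp = MDPFactory.createFourByThreeMDP();
 *   MDPPolicy<CellWorldPosition, String> policy = ...; // the fixed policy being evaluated
 *   PassiveTDAgent<CellWorldPosition, String> agent =
 *           new PassiveTDAgent<CellWorldPosition, String>(mdp, policy);
 *   Randomizer r = new JavaRandomizer();
 *   for (int i = 0; i < 200; i++) {
 *       agent.executeTrial(r); // one episode following the policy
 *   }
 *   MDPUtilityFunction<CellWorldPosition> utilities = agent.getUtilityFunction();
 */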