package aima.learning.reinforcement;

import aima.probability.decision.MDP;
import aima.probability.decision.MDPPerception;
import aima.probability.decision.MDPPolicy;
import aima.probability.decision.MDPUtilityFunction;
import aima.util.FrequencyCounter;

/**
 * A passive reinforcement learning agent that estimates state utilities with
 * temporal-difference (TD) updates while following a fixed policy, as
 * described in AIMA Chapter 21.
 * 
 * @author Ravi Mohan
 */

public class PassiveTDAgent<STATE_TYPE, ACTION_TYPE> extends
		MDPAgent<STATE_TYPE, ACTION_TYPE> {

	private MDPPolicy<STATE_TYPE, ACTION_TYPE> policy;

	private MDPUtilityFunction<STATE_TYPE> utilityFunction;

	private FrequencyCounter<STATE_TYPE> stateCount;

	private Double previousReward;

	public PassiveTDAgent(MDP<STATE_TYPE, ACTION_TYPE> mdp,
			MDPPolicy<STATE_TYPE, ACTION_TYPE> policy) {
		// Start with an empty copy of the MDP; rewards for individual states
		// are recorded as they are first perceived in decideAction().
		super(mdp.emptyMdp());
		this.policy = policy;
		this.utilityFunction = new MDPUtilityFunction<STATE_TYPE>();
		this.stateCount = new FrequencyCounter<STATE_TYPE>();
	}

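	/**
	 * Processes the latest percept: records the reward of a newly observed
	 * state, applies a TD update to the utility of the previously visited
	 * state, and returns the action the fixed policy prescribes for the
	 * current state (or null when the current state is terminal).
	 */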
	@Override
	public ACTION_TYPE decideAction(MDPPerception<STATE_TYPE> perception) {
		// If the perceived state is new, initialize its utility and reward
		// with the observed reward.
		if (!(utilityFunction.hasUtilityFor(perception.getState()))) {
			utilityFunction.setUtility(perception.getState(), perception
					.getReward());
			mdp.setReward(perception.getState(), perception.getReward());
		}
		// Apply the TD update to the previously visited state (gamma = 1.0).
		if (previousState != null) {
			stateCount.incrementFor(previousState);
			utilityFunction = updateUtilityFunction(1.0);
		}
		// At a terminal state the trial ends; otherwise remember the current
		// state, the policy's action for it, and the current reward.
		if (mdp.isTerminalState(currentState)) {
			previousState = null;
			previousAction = null;
			previousReward = null;
		} else {
			previousState = currentState;
			previousAction = policy.getAction(currentState);
			previousReward = currentReward;
		}
		return previousAction;
	}

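	/**
	 * Applies the temporal-difference update
	 * 
	 *     U(s) <- U(s) + alpha * (r + gamma * U(s') - U(s))
	 * 
	 * where s is the previous state, s' the current state, r the reward
	 * received in s, and alpha is taken from the relative visit frequency of
	 * s maintained by the FrequencyCounter.
	 */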
	private MDPUtilityFunction<STATE_TYPE> updateUtilityFunction(double gamma) {
		MDPUtilityFunction<STATE_TYPE> uf = utilityFunction.copy();
		double u_s = utilityFunction.getUtility(previousState);
		// gamma * U(s') - U(s); adding the reward r below gives the TD error
		double gammaUtilDiff = ((gamma * utilityFunction
				.getUtility(currentState)) - utilityFunction
				.getUtility(previousState));
		// alpha * (r + gamma * U(s') - U(s)), with alpha taken from the
		// relative visit frequency of the previous state
		double alphaTerm = stateCount.probabilityOf(previousState)
				* (previousReward + gammaUtilDiff);
		uf.setUtility(previousState, u_s + alphaTerm);
		return uf;
	}

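	/**
	 * Returns the agent's current utility estimates, typically inspected
	 * after a number of training trials.
	 */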
	public MDPUtilityFunction<STATE_TYPE> getUtilityFunction() {
		return utilityFunction;
	}
}