# A RockSample lower bound estimation procedure for solvers that use
# weighted particle belief representation (defined in types.jl)
# TODO: think how to make this more generic
#
# Strategy: Compute a representative state by setting the status of
# each rock to whichever value (good or bad) carries more weight in the particle set.
# Then compute the best sequence of actions for the resulting
# state. Apply this sequence of actions to each particle and average
# to get a lower bound.
#
# Possible improvement: If a rock is sampled while replaying the action
# sequence, use dynamic programming to look forward in the action
# sequence to determine if it would be a better idea to first sense the
# rock instead. (sensing eliminates the bad rocks in the particle set)
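#
# Usage sketch (illustrative only; assumes a RockSample pomdp, a DESPOTConfig,
# a weighted particle set, and per-state upper-bound actions are already built
# elsewhere in the solver -- none of those names are defined in this file):
#
#   lb = RockSampleParticleLB(pomdp)
#   init_bound(lb, pomdp, config)
#   value = lower_bound(lb, pomdp, particles, ub_actions, config)
#   best = lb.best_action  # action recommended alongside the bound value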
mutable struct RockSampleParticleLB
weight_sum_of_state::Vector{Float64}
best_action::RockSampleAction
function RockSampleParticleLB(pomdp::RockSample)
this = new()
this.weight_sum_of_state = Array{Float64}(pomdp.n_states)
fill!(this.weight_sum_of_state, -Inf)
this.best_action = RockSampleAction(-1)
return this
end
end
function init_bound(lb::RockSampleParticleLB,
pomdp::RockSample,
config::DESPOTConfig)
# nothing to do for now
end
function lower_bound(lb::RockSampleParticleLB,
pomdp::RockSample,
particles::Vector{DESPOTParticle{RockSampleState}},
ub_actions::Vector{RockSampleAction},
config::DESPOTConfig)
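    # weight_sum accumulates the total particle weight; state_seen maps an
    # insertion index to the state index of each distinct state encountered.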
weight_sum::Float64 = 0.0
state_seen::Dict{Int64,Int64} = Dict{Int64,Int64}()
s_index::Int64 = -1
    # The rover's cell is deterministic and shared by all particles, so checking
    # any one particle (here the first) for terminality is sufficient.
if length(particles) > 0
if isterminal(pomdp, particles[1].state)
            return 0.0 # terminal belief: the lower bound value is 0
end
end
# The expected value of sampling a rock, over all particles
expected_sampling_value::Array{Float64} = fill(0.0, pomdp.n_rocks)
seen_ptr::Int64 = 0
# Compute the expected sampling value of each rock. Instead of factoring
# the weight of each particle, we first record the weight of each state.
# This is so that the inner loop that updates the expected value of each
# rock runs once per state seen, instead of once per particle seen. If
# there are lots of common states between particles, this gives a
# significant speedup to the search because the lower bound is the
# bottleneck.
for p in particles
        if lb.weight_sum_of_state[state_index(pomdp,p.state)+1] == -Inf # state not yet seen in this call
lb.weight_sum_of_state[state_index(pomdp,p.state)+1] = p.weight
state_seen[seen_ptr] = state_index(pomdp,p.state)
seen_ptr += 1
else
            lb.weight_sum_of_state[state_index(pomdp,p.state)+1] += p.weight
end
end
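    # Accumulate the total particle weight and, for each rock, the weighted value
    # of sampling it: +10 if the rock is good in that state, -10 otherwise.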
for i in 0:(seen_ptr-1)
s_index = state_seen[i]
weight_sum += lb.weight_sum_of_state[s_index+1]
for j in 0:pomdp.n_rocks-1
expected_sampling_value[j+1] += lb.weight_sum_of_state[s_index+1] * (rock_status(j, s_index) ? 10.0 : -10.0)
end
end
# Reset for next use
fill!(lb.weight_sum_of_state, -Inf)
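    # Normalize each rock's value to a weighted average and threshold it to build
    # the most likely rock configuration (bit i set means rock i is assumed good).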
most_likely_rock_set = 0
for i in 0:pomdp.n_rocks-1
expected_sampling_value[i+1] /= weight_sum
# Threshold the average to good or bad
if expected_sampling_value[i+1] > -config.tiny
most_likely_rock_set |= (1 << i)
end
if abs(expected_sampling_value[i+1]) < config.tiny
expected_sampling_value[i+1] = 0.0
end
end
# Since for this problem the cell that the rover is in is deterministic, picking pretty much
# any particle state is ok
s = RockSampleState(
make_state_index(pomdp, cell_of(pomdp, particles[1].state), most_likely_rock_set))
# Sequence of actions taken in the optimal policy
optimal_policy = Vector{RockSampleAction}()
ret::Float64 = 0.0
reward::Float64 = 0.0
    prev_cell_coord = [0,0] # placeholder; always overwritten before the backward pass below
next_s::RockSampleState = RockSampleState()
r::Float64 = 0.0
trans_distribution = create_transition_distribution(pomdp)
rng = DESPOTRandomNumber(0) # dummy RNG
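    # Forward-simulate from the representative state, following the upper-bound
    # action for each state until a terminal state or the search depth is reached,
    # and record the action sequence taken.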
while true
a = ub_actions[state_index(pomdp,s)+1]
trans_distribution.state = s
trans_distribution.action = a
next_s = POMDPs.rand(rng, trans_distribution, next_s)
if isterminal(pomdp, next_s)
prev_cell_coord[1] = pomdp.cell_to_coords[cell_of(pomdp, s)+1][1]
prev_cell_coord[2] = pomdp.cell_to_coords[cell_of(pomdp, s)+1][2]
ret = 10.0
break
end
push!(optimal_policy,a)
if length(optimal_policy) == config.search_depth
prev_cell_coord[1] = pomdp.cell_to_coords[cell_of(pomdp, next_s)+1][1]
prev_cell_coord[2] = pomdp.cell_to_coords[cell_of(pomdp, next_s)+1][2]
ret = 0.0
break
end
s = next_s #TODO: deepcopy?
end
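    # The first action of the recorded sequence is the recommended action; if the
    # sequence is empty, fall back to the action with index 2.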
lb.best_action = (length(optimal_policy) == 0) ? RockSampleAction(2) : optimal_policy[1]
# Execute the sequence backwards to allow using the DP trick
for i = length(optimal_policy):-1:1
act = optimal_policy[i]
ret *= pomdp.discount
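        # Action index 4 is the sample action: add the particle-weighted expected
        # value of sampling the rock in the current cell (if there is one) to the return.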
if action_index(pomdp,act) == 4
rock = pomdp.rock_at_cell[cell_num(pomdp, prev_cell_coord[1], prev_cell_coord[2])+1]
if rock != -1
                ret = expected_sampling_value[rock+1] + ret # add the averaged sampling value for this rock
end
continue
end
# Move in the opposite direction since we're going backwards
if action_index(pomdp, act) == 0
prev_cell_coord[1] += 1
elseif action_index(pomdp, act) == 1
prev_cell_coord[1] -= 1
elseif action_index(pomdp, act) == 2
prev_cell_coord[2] -= 1
elseif action_index(pomdp, act) == 3
prev_cell_coord[2] += 1
else
@assert(false)
end
end
return ret
end