-
Notifications
You must be signed in to change notification settings - Fork 35
/
Copy pathgpuSplitVertices.h
139 lines (114 loc) · 3.9 KB
/
gpuSplitVertices.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h
#define RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h
#include <algorithm>
#include <cmath>
#include <cstdint>
#include "CUDACore/HistoContainer.h"
#include "CUDACore/cuda_assert.h"
#include "gpuVertexFinder.h"
namespace gpuVertexFinder {
__device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
constexpr bool verbose = false; // in principle the compiler should optmize out if false
auto& __restrict__ data = *pdata;
auto& __restrict__ ws = *pws;
auto nt = ws.ntrks;
float const* __restrict__ zt = ws.zt;
float const* __restrict__ ezt2 = ws.ezt2;
float* __restrict__ zv = data.zv;
float* __restrict__ wv = data.wv;
float const* __restrict__ chi2 = data.chi2;
uint32_t& nvFinal = data.nvFinal;
int32_t const* __restrict__ nn = data.ndof;
int32_t* __restrict__ iv = ws.iv;
assert(pdata);
assert(zt);
// one vertex per block
for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) {
if (nn[kv] < 4)
continue;
if (chi2[kv] < maxChi2 * float(nn[kv]))
continue;
constexpr int MAXTK = 512;
assert(nn[kv] < MAXTK);
if (nn[kv] >= MAXTK)
continue; // too bad FIXME
__shared__ uint32_t it[MAXTK]; // track index
__shared__ float zz[MAXTK]; // z pos
__shared__ uint8_t newV[MAXTK]; // 0 or 1
__shared__ float ww[MAXTK]; // z weight
__shared__ uint32_t nq; // number of track for this vertex
nq = 0;
__syncthreads();
// copy to local
for (auto k = threadIdx.x; k < nt; k += blockDim.x) {
if (iv[k] == int(kv)) {
auto old = atomicInc(&nq, MAXTK);
zz[old] = zt[k] - zv[kv];
newV[old] = zz[old] < 0 ? 0 : 1;
ww[old] = 1.f / ezt2[k];
it[old] = k;
}
}
__shared__ float znew[2], wnew[2]; // the new vertices
__syncthreads();
assert(int(nq) == nn[kv] + 1);
int maxiter = 20;
// kt-min....
bool more = true;
while (__syncthreads_or(more)) {
more = false;
if (0 == threadIdx.x) {
znew[0] = 0;
znew[1] = 0;
wnew[0] = 0;
wnew[1] = 0;
}
__syncthreads();
for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
auto i = newV[k];
atomicAdd(&znew[i], zz[k] * ww[k]);
atomicAdd(&wnew[i], ww[k]);
}
__syncthreads();
if (0 == threadIdx.x) {
znew[0] /= wnew[0];
znew[1] /= wnew[1];
}
__syncthreads();
for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
auto d0 = fabs(zz[k] - znew[0]);
auto d1 = fabs(zz[k] - znew[1]);
auto newer = d0 < d1 ? 0 : 1;
more |= newer != newV[k];
newV[k] = newer;
}
--maxiter;
if (maxiter <= 0)
more = false;
}
// avoid empty vertices
if (0 == wnew[0] || 0 == wnew[1])
continue;
// quality cut
auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]);
auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]);
if (verbose && 0 == threadIdx.x)
printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]);
if (chi2Dist < 4)
continue;
// get a new global vertex
__shared__ uint32_t igv;
if (0 == threadIdx.x)
igv = atomicAdd(&ws.nvIntermediate, 1);
__syncthreads();
for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
if (1 == newV[k])
iv[it[k]] = igv;
}
} // loop on vertices
}
__global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
splitVertices(pdata, pws, maxChi2);
}
} // namespace gpuVertexFinder
#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h