-
Notifications
You must be signed in to change notification settings - Fork 35
/
Copy pathgpuFitVertices.h
113 lines (93 loc) · 3.01 KB
/
gpuFitVertices.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h
#define RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h
#include <algorithm>
#include <cmath>
#include <cstdint>
#include "CUDACore/HistoContainer.h"
#include "CUDACore/cuda_assert.h"
#include "gpuVertexFinder.h"
namespace gpuVertexFinder {
__device__ __forceinline__ void fitVertices(ZVertices* pdata,
WorkSpace* pws,
float chi2Max // for outlier rejection
) {
constexpr bool verbose = false; // in principle the compiler should optmize out if false
auto& __restrict__ data = *pdata;
auto& __restrict__ ws = *pws;
auto nt = ws.ntrks;
float const* __restrict__ zt = ws.zt;
float const* __restrict__ ezt2 = ws.ezt2;
float* __restrict__ zv = data.zv;
float* __restrict__ wv = data.wv;
float* __restrict__ chi2 = data.chi2;
uint32_t& nvFinal = data.nvFinal;
uint32_t& nvIntermediate = ws.nvIntermediate;
int32_t* __restrict__ nn = data.ndof;
int32_t* __restrict__ iv = ws.iv;
assert(pdata);
assert(zt);
assert(nvFinal <= nvIntermediate);
nvFinal = nvIntermediate;
auto foundClusters = nvFinal;
// zero
for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) {
zv[i] = 0;
wv[i] = 0;
chi2[i] = 0;
}
// only for test
__shared__ int noise;
if (verbose && 0 == threadIdx.x)
noise = 0;
__syncthreads();
// compute cluster location
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
if (iv[i] > 9990) {
if (verbose)
atomicAdd(&noise, 1);
continue;
}
assert(iv[i] >= 0);
assert(iv[i] < int(foundClusters));
auto w = 1.f / ezt2[i];
atomicAdd(&zv[iv[i]], zt[i] * w);
atomicAdd(&wv[iv[i]], w);
}
__syncthreads();
// reuse nn
for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) {
assert(wv[i] > 0.f);
zv[i] /= wv[i];
nn[i] = -1; // ndof
}
__syncthreads();
// compute chi2
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
if (iv[i] > 9990)
continue;
auto c2 = zv[iv[i]] - zt[i];
c2 *= c2 / ezt2[i];
if (c2 > chi2Max) {
iv[i] = 9999;
continue;
}
atomicAdd(&chi2[iv[i]], c2);
atomicAdd(&nn[iv[i]], 1);
}
__syncthreads();
for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x)
if (nn[i] > 0)
wv[i] *= float(nn[i]) / chi2[i];
if (verbose && 0 == threadIdx.x)
printf("found %d proto clusters ", foundClusters);
if (verbose && 0 == threadIdx.x)
printf("and %d noise\n", noise);
}
__global__ void fitVerticesKernel(ZVertices* pdata,
WorkSpace* pws,
float chi2Max // for outlier rejection
) {
fitVertices(pdata, pws, chi2Max);
}
} // namespace gpuVertexFinder
#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h