-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhive_udf.cson
215 lines (160 loc) · 6.57 KB
/
hive_udf.cson
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
'.source.SQL':
hive_udf.castarray 将array 中的值转换成指定的基础数据类型
select sort_array(hive_udf.castarray(split(truck_length,','),'double'))[0];
hive_udf.arrayintersect
-- 交集自带去重
-- 针对 array<struct> 有缺陷。 只适合于简单的array,int ,string;
select arrayintersect(a,b) from (
select split('a,b,c,d',',') as a, split('b,c,d,e,b',',') as b from dim.dim_city limit 1) AA
hive_udf.collect_where
_FUNC_(value, condition) - aggregate the values which satisfy the condition into an array
select collect_where(id,prov_id=110000)
,collect_where(id,prov_id=120000)
,collect_where(id,prov_id=130000)
from (
select id,prov_id from dim.dim_city where deep in (1,2) and id >=110000 and id <150000
) A
hive_udf.count_where
# Compute a COUNT of row items in which a condition is true;
# "true" means "not false and also not null."
# This sounds like it is the same as COUNT(1) with a WHERE or GROUP BY, but
# it allows for multiple columns to be tracked separately within a single
# aggregation. This is faster and cleaner than
# SUM(CAST(example = foo AS INT)), and also appropriately returns zero
# when the item in question is NULL, unlike COUNT(1) which doesn't know what
# is and isn't NULL.
select count_where(prov_id=110000),count_where(prov_id=120000),count_where(prov_id=130000) from (
select id,prov_id from dim.dim_city where deep in (1,2) and id >=110000 and id <150000
) A
hive_udf.combine_unique
# Aggregate function to combine several lists together to return a list of unique values
select type,hive_udf.combine_unique(aa) from (
select split('a,b,d',',') aa ,1 type from dim.dim_city limit 1
union all
select split('a,b,c',',') aa,1 type from dim.dim_city limit 1) AA
group by type;
select hive_udf.combine_unique(split('a,b,d,a',',') )
from dim.dim_city limit 1
– ["d","b","a"]
hive_udf.sum_where
# Compute the SUM of row items for which a condition is true;
# "true" means "not false and also not null."
select sum_where(id,prov_id=110000),collect_where(id,prov_id=120000),count_where(prov_id=130000) from (
select id,prov_id from dim.dim_city where deep in (1,2) and id >=110000 and id <150000
) A
-- double
hive_udf.udaf_to_order_map
select prov_id,hive_udf.udaf_to_order_map(name,id) map_set
from
dim.dim_city where id>=1 and id <200000
group by prov_id
return
110000 {"东城区":110101,"丰台区":110106,"北京市":110100,"大兴区":110115,"宣武区":110104}
hive_udf.map_remove_keys
select map('a',4,'b',5,'c',4),hive_udf.map_remove_keys(map('a',4,'b',5,'c',4),array('a'))
return
{"a":4,"b":5,"c":4} {"b":5,"c":4}
hive_udf.json_as_map
select id,hive_udf.json_as_map(k)["a"]
from
(
select id,hive_udf.json_as_array('[{"a":1},{"a":2}]') as json_as_array from (select 1 as id from (select 0) t) t
)t
lateral view explode(json_as_array) tf as k;
hive_udf.json_to_map
select hive_udf.json_to_map ('{"a":4,"b":5,"c":4}')['a']
select id,k,v
from
(
select id,'{"a":4,"b":5,"c":4}' map_json
from dim.dim_city limit 1
)t
lateral view explode(hive_udf.json_as_map (map_json)) tf as k,v;
hive_udf.json_as_map
select hive_udf.json_as_map ("{'a':4,'b':5,'c':4}")['a']
select hive_udf.json_as_map ('{"a":4,"b":5,"c":4}')['a']
select id,k,v
from
(
select id,'{"a":4,"b":5,"c":4}' map_json
from dim.dim_city limit 1
)t
lateral view explode(hive_udf.json_as_map (map_json)) tf as k,v;
hive_udf.daydiff
note : 参数不能等于0
select hive_udf.daydiff(20180525,20180504) from dim.dim_city limit 1;
hive_udf.arrayunion
select hive_udf.arrayunion(split('a,b,d',',') ,split('a,b,c',',')) from dim.dim_city limit 1;
return ["d","b","c","a"]
select concat_ws(',',hive_udf.arrayunion(split('a,b,d',',') ,split('a,b,c',','))) from dim.dim_city limit 1;
return a,b,c,d
hive_udf.udtf_explode_map
select hive_udf.udtf_explode_map(hive_udf.json_as_map ("{'a':4,'b':5,'c':4}") )
group_longest
select group_longest(short_name),group_longest(area)
from ( select area,short_name from dim.dim_city where deep in (1,2) and id >=110000 and id <150000) AA
-- bug 当string 长度一致时,返回其中一个
select "dwd.web_log_monitor_delta_daily_hourly"
,$date
,collect_set(cast(t1.col as string))
from
(select day,hour
from dwd.web_log_monitor_delta_daily_hourly
where day=20181031
group by day, hour ) t
full join (select explode(array(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)) as col ) t1
on t.hour = t1.col
where t.hour is null
group by "dwd.web_log_monitor_delta_daily_hourly",
$date;
--
select * from (
select tf.col from (select 0) t lateral view explode(array('A','B','C')) tf as col ) A
where array_contains(Array('A','b'),col) = TRUE
hive_udf.pmax -- 返回行中最大值 DOUBLE
select hive_udf.pmax(1,6,'a') from (select 0 ) A -- 6
select cast(hive_udf.pmax(1,20190123) as int ) -- 须加cast 进行转换为 int
select hive_udf.pmax('a') -- null
select sort_array(array(1,'',' ','汉字',null,'null')) from from (select 0 ) A
WHERE sort_array(array(1,'',' ','汉字',null,'null'))[0] is null
-- [null,""," ","1","null","汉子"]
```
{"country":"US","page":227,"data":{"ad":{"impressions":{"s":10,"o":10}}}}
select v1.Country, v1.Page, v4.impressions_s, v4.impressions_o
from hive_parsing_json_table hpjp
LATERAL VIEW json_tuple(hpjp.json, 'country', 'page', 'data') v1
as Country, Page, data
LATERAL VIEW json_tuple(v1.data, 'ad') v2
as Ad
LATERAL VIEW json_tuple(v2.Ad, 'impressions') v3
as Impressions
LATERAL VIEW json_tuple(v3.Impressions, 's' , 'o') v4
as impressions_s,impressions_o;
```
``` SQL
-- collect_list 有序
SELECT -- concat_ws(',',collect_set(enum_value)) over ( order by enum_key desc )
-- ,
concat_ws(',',collect_list(enum_val) over ( order by enum_key desc rows between unbounded preceding and unbounded following))
from dw.dim_enums
```
```sql
select time ,user_id,search_method,
group_flag,
FIRST_VALUE(search_method) OVER(PARTITION BY group_flag ORDER BY time) AS last_search_method
from (
SELECT time,user_id,search_method,
row_number() OVER(PARTITION BY user_id ORDER BY time) AS rn,
sum(if(search_method ='', 1,0)) OVER(PARTITION BY user_id ORDER BY time) AS sum_cnt,
row_number() OVER(PARTITION BY user_id ORDER BY time) -sum(if(search_method ='', 1,0)) OVER(PARTITION BY user_id ORDER BY time) as group_flag
FROM oyo_behavior_dw.dw_sensor_events_data
where date_time = '2019-07-29'
and user_id = '828598249005366892'
order by `time` ) A
```
```SQL
-- 匹配版本号
select tb.*,regexp_extract(array_string, '(\\d+(\\.+\\d){1,})',1) from (
select array('Android 5.0.1 6.3','Android 5.3','8.1.3') as array_strings
from (select 1) a ) A LATERAL VIEW explode(array_strings) tb as array_string;
```