-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathretract.py
executable file
·199 lines (175 loc) · 5.43 KB
/
retract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Stephen Po-Chedley 19 August 2020
Script to archive xmls linked to retracted data, flag the corresponding
paths as ignored, and un-retract paths whose data is no longer retracted.
@author: pochedls
"""
import os
import sqlite3
import fx
import datetime
import time
import glob
# define convenience function
def execQuery(sqlDb, q):
    """Run a single SQL statement against a SQLite database.

    Parameters
    ----------
    sqlDb : str
        Path of the SQLite database file to connect to.
    q : str
        SQL statement to execute.

    Returns
    -------
    names : list
        Column names of the result set; empty when the statement produces
        no result set.
    x : list
        All rows returned by the statement (as given by ``fetchall``).
    """
    conn = sqlite3.connect(sqlDb)
    try:
        cursor = conn.cursor().execute(q)
        rows = cursor.fetchall()
        # description is None for statements that return no result set
        description = cursor.description or ()
        names = [column[0] for column in description]
    finally:
        # release the connection even when the query itself fails
        conn.close()
    return names, rows
# databases and directories used by this script
sqlDb = '/p/css03/painter/db/sdt6.db'  # queried for retracted datasets
xaggDb = 'xml.db'  # holds the paths table that is read and updated
cmipMeta = 'data/cmipMeta.pkl'  # handed to fx.lookupCMIPMetadata as dictObj
retractDir = '/p/user_pub/xclim/retracted/'  # archive for retracted xmls
testDir = '/p/user_pub/xclim/CMIP6/CMIP/amip/atmos/mon/tas/'

# sanity check: fewer than 100 xmls in testDir is treated as an unmounted disk
files = glob.glob(testDir + '*.xml')
if not len(files) >= 100:
    raise ValueError('It appears a disk is not mounted.')
# Abort if a lock is already present (a concurrent run, or an earlier run
# that ended with an unresolved error); otherwise set the lock for this run.
if fx.runLock('check'):
    raise ValueError('Lock is on. xagg is running or encountered an error.')
fx.runLock('on')
# fetch every dataset row whose status ends in "retracted"
print('Get retracted files', time.ctime(), '', sep='\n')
q = "SELECT * FROM dataset WHERE status LIKE '%retracted';"
names, retractedSet = execQuery(sqlDb, q)
# index every active (not retired, not ignored) path by its key:
#     xaggKeys[keyid][path] -> xmlfile
print('Get xagg files', time.ctime(), '', sep='\n')
q = 'select keyid, path, xmlfile from paths where retired = 0 and ignored = 0;'
x, allKeys = execQuery(xaggDb, q)
xaggKeys = {}
for key, path, xmlfile in allKeys:
    xaggKeys.setdefault(key, {})[path] = xmlfile
# get xagg paths that are already flagged as retracted:
#     xaggRetractedKeys[keyid][path] -> xmlfile
print('Get xagg retracted files')
print(time.ctime())
print()
# 'retracted' is single-quoted so it is always a SQL string literal; SQLite
# reads double-quoted text as an identifier first and only falls back to a
# string as a legacy quirk
q = "select keyid, path, xmlfile from paths where ignored = 1 and error = 'retracted';"
x, allKeys = execQuery(xaggDb, q)
xaggRetractedKeys = {}
for key, path, xmlfile in allKeys:
    xaggRetractedKeys.setdefault(key, {})[path] = xmlfile
# get paths that need to be retracted
print('Get paths to ignore')
print(time.ctime())
print()
retractList = []
deleteList = {}  # current xml location -> destination inside retractDir
datalist = []  # update rows for matching paths that have no xml file
datalistXml = []  # update rows for matching paths that have an xml file
esgfRetractedKeys = []  # every key built from a retracted dataset row
for row in retractedSet:
    # the second column is a dot-delimited dataset identifier
    meta = row[1].split('.')
    (mip, activity, institution, model, experiment,
     realization, table, variableId, grid, version) = meta[:10]
    try:
        frequency, realm, dimensions = fx.lookupCMIPMetadata(
            mip, table, variableId, dictObj=cmipMeta)
        gridLabel = fx.createGridLabel(mip, realm, table, grid, dimensions)
    except Exception:
        # lookup failed: use placeholder values. Exception (not a bare
        # except) lets KeyboardInterrupt/SystemExit stop the run.
        realm = 'unk'
        frequency = 'unk'
        gridLabel = 'unk'
    # create key
    key = '.'.join([mip, activity, institution, model, experiment,
                    realization, table, realm, frequency, variableId,
                    grid, gridLabel, version])
    esgfRetractedKeys.append(key)
    # queue an update for every active xagg path stored under this key
    for rpath, fn in xaggKeys.get(key, {}).items():
        ignoretime = fx.toSQLtime(datetime.datetime.now())
        if fn is None:
            datalist.append([None, None, 'retracted', 1, ignoretime, rpath])
        else:
            # the xml keeps its file name but moves into retractDir
            xfnn = retractDir + fn.split('/')[-1]
            deleteList[fn] = xfnn
            datalistXml.append([xfnn, 'retracted', 1, ignoretime, rpath])
print('Retract files', time.ctime(), '', sep='\n')
# Paths without an xml file: each datalist row carries one value per column
# below, followed by the path (presumably the value matched against the
# 'path' column -- confirm against fx.sqlUpdate)
columns = [
    'xmlFile',
    'xmlwritedatetime',
    'error',
    'ignored',
    'ignored_datetime',
]
fx.sqlUpdate(xaggDb, 'paths', columns, 'path', datalist)
# Paths with an xml file: same update, but xmlwritedatetime is not touched
columnsXml = [
    'xmlFile',
    'error',
    'ignored',
    'ignored_datetime',
]
fx.sqlUpdate(xaggDb, 'paths', columnsXml, 'path', datalistXml)
print('Archive files', time.ctime(), '', sep='\n')
# move each xml that still exists to its archive destination and count it
deleteCount = 0
for fn, xfnn in deleteList.items():
    if not os.path.exists(fn):
        continue
    os.rename(fn, xfnn)
    deleteCount += 1
# find keys that xagg has flagged as retracted but that are no longer among
# the keys built from the retracted dataset rows
print('Find data that should be un-retracted', time.ctime(), '', sep='\n')
esgfRetractedKeys = set(esgfRetractedKeys)
deleteList = []  # xml files recorded for paths being un-retracted
unretract = []  # update rows that reset those paths
unretractCount = 0
for key, pathMap in xaggRetractedKeys.items():
    if key in esgfRetractedKeys:
        continue
    for dpath, fn in pathMap.items():
        unretract.append([None, None, None, 0, None, dpath])
        unretractCount += 1
        if fn is not None:
            deleteList.append(fn)
# reset the xml/error/ignored fields of every path being un-retracted
print('Un-retract data', time.ctime(), '', sep='\n')
columns = [
    'xmlFile',
    'xmlwritedatetime',
    'error',
    'ignored',
    'ignored_datetime',
]
fx.sqlUpdate(xaggDb, 'paths', columns, 'path', unretract)
# delete the xmls recorded for those paths, skipping any already gone
for fn in deleteList:
    if not os.path.exists(fn):
        continue
    os.remove(fn)
# run summary
# NOTE(review): the "Ignored" count covers datalist only; rows queued in
# datalistXml are not included -- confirm this is intended
print('Ignored ', len(datalist), ' paths', sep='')
print('Archived ', deleteCount, ' xml files', sep='')
print('Un-ignored ', unretractCount, ' paths', sep='')
print(time.ctime(), '', sep='\n')
# release the run lock now that the script has finished
fx.runLock('off')