dataset_contents.py
#!/usr/local/cdat/bin/python
"""Get the status of datasets and of their files, from the esgcet database.
The main purpose is to identify those datasets which can be completed by
downloading just a few more files."""
# This is straightforward but really slow...
# Usage:
# 1. export PYTHONPATH=$PYTHONPATH:/export/home/painter/src/esgf-contrib/estani/python/
# 2. Set the hard-coded inputs just below.  Review the code, as it is intended to be
#    changed as needed.
# 3. If you choose to make download lists, concatenate them and edit them as needed
#    (see the example commands at the end of this file).
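# For example, a typical session might look like this (a sketch only; the path is
# the one from step 1, and the output filename is arbitrary):
#   export PYTHONPATH=$PYTHONPATH:/export/home/painter/src/esgf-contrib/estani/python/
#   ./dataset_contents.py > dataset_status.txt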
# Here are the most important inputs. Change them here:
like = "'%GFDL%'" # the "LIKE" part of the query to identify datasets to work on
make_dl_lists = True  # Set to True to make download lists, which is useful, but very much slower
                      # and confuses the output.  Otherwise set to False.
# imports copied from harvest_cmip5.py, maybe not all needed...
import sqlalchemy
import os,sys
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, Float, String, Boolean, DateTime, sql, ForeignKeyConstraint, orm
import time, datetime
# imports added as needed...
from esgcet.config import loadConfig
sql_ds20 = "SELECT name FROM replica.datasets WHERE name LIKE "+like+" AND status=20;"
sql_nfa1 = "SELECT COUNT(*) FROM replica.files WHERE dataset_name='"
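# For example, with like = "'%GFDL%'" the dataset query above expands to
#   SELECT name FROM replica.datasets WHERE name LIKE '%GFDL%' AND status=20;
# and sql_nfa1 is completed per dataset in the loop below, e.g. (the dataset name
# here is just a placeholder):
#   SELECT COUNT(*) FROM replica.files WHERE dataset_name='<dataset>' AND status=100;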
config = loadConfig(None)
engine = sqlalchemy.create_engine(config.get('replication', 'esgcet_db'), echo=False, pool_recycle=3600)
datasets = engine.execute( sql.text( sql_ds20 ) ).fetchall()
fmt = "%72s %12s %12s %12s %12s %12s %12s"
print fmt % ("dataset","files","status 0/10","status 20","status 30","status 100","status -1")
print fmt % (" ", " ", "no attempt","chose to dl","downloaded","verified","error")
for ds in datasets:
    dstr = ds[0]
    nfiles = engine.execute(sql.text( sql_nfa1+ds[0]+"';" ) ).fetchall()[0][0]
    nfiles010 = engine.execute(sql.text( sql_nfa1+ds[0]+"' AND status>=0 AND status<=10;" ) ).fetchall()[0][0]
    nfiles20 = engine.execute(sql.text( sql_nfa1+ds[0]+"' AND status=20;" ) ).fetchall()[0][0]
    nfiles30 = engine.execute(sql.text( sql_nfa1+ds[0]+"' AND status=30;" ) ).fetchall()[0][0]
    nfiles100 = engine.execute(sql.text( sql_nfa1+ds[0]+"' AND status=100;" ) ).fetchall()[0][0]
    nfiles_m1 = engine.execute(sql.text( sql_nfa1+ds[0]+"' AND status=-1;" ) ).fetchall()[0][0]
    print fmt % (dstr,nfiles,nfiles010,nfiles20,nfiles30,nfiles100,nfiles_m1)
    if not make_dl_lists:
        continue
    if nfiles100<nfiles and nfiles100>0.5*nfiles:
        if dstr.find("mon.atmos")>0:
            if dstr.find(".historical.")>0 or dstr.find(".rcp85.")>0 or dstr.find(".rcp45.")>0\
               or dstr.find(".piControl.")>0:
                dpri = 10
                print ' '*72,"...urgent download candidate"
            else:
                dpri = 8
                print ' '*72,"...excellent download candidate"
        else:
            dpri = 6
            print ' '*72,"...good download candidate"
    elif nfiles100<nfiles:
        dpri = 2
    else:
        dpri = 0
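    # Summary of the priorities assigned above (read off from the conditions):
    #   10 = more than half verified, monthly atmosphere, historical/rcp85/rcp45/piControl
    #    8 = more than half verified, monthly atmosphere, other experiment
    #    6 = more than half verified, other realm or frequency
    #    2 = incomplete, and half or less of the files verified
    #    0 = everything verified; nothing left to download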
    if dpri>2:
        # Create a download list for just this dataset.
        import replica_manager
        replica_manager.dataset_match = dstr
        # Use the time to make the output filename unique: tt is the tuple of
        # struct_time fields (year, month, day, hour, ...), joined with dots.
        tt = time.localtime().__reduce__()[1][0]
        ts = '.'.join(str(i) for i in tt)
        outfile = "download_"+dstr+'_'+ts
        print "creating download list",outfile
        dataset_type = 'list.repo.pcmdi' # You have to 'know' this one!  pcmdi is most common
        replica_manager.create_download_lists( outfile, dataset_type )
        # Probably the gsiftp fields of the download list will have to be edited to http-something.
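# Follow-up (step 3 of the usage notes above), sketched only; the filenames are
# arbitrary and the exact URL rewrite depends on the actual HTTP endpoints:
#   cat download_* > all_downloads
#   # then edit the gsiftp URLs in all_downloads to the corresponding http URLs,
#   # by hand or with sed, before feeding the list to your downloader.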