20
20
supported_formats = ['.json' ]
21
21
22
22
23
- def load_PDF_into_db (dbname , pdfname , hddata : dict , rv : list , show_path = True ):
24
- """Load an entry consisting of PDF header and base data into a database file.
25
-
26
- Requires hdata and rv generated from loadData.
27
-
28
- dbname -- name of the database file to load an entry into.
29
- pdfname -- name of the PDF file.
30
- hddata -- Dictionary of PDF metadata generated by loadData.
31
- rv -- List of PDF (r, gr) pairs generated by loadData.
32
- show_path -- include a PDF_path element in the database entry (default True).
23
+ def serialize_data (filename , hdata : dict , data_table : list , show_path = True , dt_colnames = None , serial_file = None ):
24
+ """Serialize file data into a dictionary. Can also save dictionary into a serial language file.
25
+ Dictionary is formatted as {filename: data}.
26
+
27
+ Requires hdata and data_table generated from loadData.
28
+
29
+ filename -- name of the file whose data is being serialized.
30
+ hdata -- Dictionary of PDF metadata generated by loadData.
31
+ data_table -- List storing the data table parsed by loadData.
32
+ dt_colnames -- List containing names of each column in data_table. Every name in
33
+ dt_colnames will be put into the Dictionary as a key with a value
34
+ of that column in data_table (stored as a List). Put None for
35
+ columns without names. If dt_colnames has fewer non-None entries
36
+ than columns in data_table, the pair {'data table': data_table} will be put
37
+ in the dictionary. (Default None: only entry {'data table': data_table}
38
+ will be added to dictionary.)
39
+ show_path -- include a path element in the database entry (default True).
40
+ If 'path' is not included in hdata, extract path from filename.
41
+ serial_file -- serial language file to dump dictionary into.
33
42
34
43
Returns the dictionary loaded from/into the updated database file.
35
44
"""
36
- # new file or update
37
- existing = False
38
- if pathlib .Path .is_file (dbname ):
39
- existing = True
40
-
41
- # collect entry
42
- with open (pdfname , 'r' ) as grfile :
43
- data = {}
44
-
45
- # add path
46
- grpath = grfile .name
47
- if show_path :
48
- data .update ({'PDF_path' : grpath })
49
45
50
- # add r, gr, and header metadata
51
- data .update ({'r' : list (rv [:, 0 ]), 'gr' : list (rv [:, 1 ])})
52
- data .update (hddata )
53
-
54
- # parse name using pathlib and generate json entry
55
- name = pathlib .Path (grpath ).name
56
- entry = {name : data }
46
+ # compile data_table and hdata together
47
+ data = {}
57
48
49
+ # handle getting name of file for variety of filename types
50
+ with open (filename , 'r' ) as file_path :
51
+ abs_path = pathlib .Path (file_path .name ).resolve ()
52
+ # add path to start of data if requested
53
+ if show_path and 'path' not in hdata .keys ():
54
+ data .update ({'path' : abs_path .name })
55
+ # title the entry with name of file (taken from end of path)
56
+ title = abs_path .name
57
+
58
+ # first add named columns in dt_colnames
59
+ num_columns = [len (row ) for row in data_table ]
60
+ max_columns = max (num_columns )
61
+ num_col_names = len (dt_colnames )
62
+ if max_columns < num_col_names : # assume numpy.loadtxt gives non-irregular array
63
+ raise Exception ("More entries in dt_colnames than columns in data_table." )
64
+ named_columns = 0
65
+ for idx in range (num_col_names ):
66
+ colname = dt_colnames [idx ]
67
+ if colname is not None :
68
+ data .update ({colname : list (data_table [:, idx ])})
69
+ named_columns += 1
70
+
71
+ # second add data in hdata dict
72
+ data .update (hdata )
73
+
74
+ # finally add data_table as an entry named 'data table' if not all columns were parsed
75
+ if named_columns < max_columns :
76
+ if 'data table' not in data .keys ():
77
+ data .update ({'data table' : data_table })
78
+ else : # if 'data table' is already a key, keep adding primes to the end
79
+ dt_name = 'data table'
80
+ while dt_name in data .keys ():
81
+ dt_name += " prime"
82
+ data .update ({dt_name : data_table })
83
+
84
+ # parse name using pathlib and generate dictionary entry
85
+ entry = {title : data }
86
+
87
+ # no save
88
+ if serial_file is None :
89
+ return entry
90
+
91
+ # saving/updating file
58
92
# check if supported type
59
- extension = pathlib .Path (dbname ).suffix
93
+ extension = pathlib .Path (serial_file ).suffix
60
94
if extension not in supported_formats :
61
- raise Exception (f"Format of { dbname } is not supported." )
95
+ raise Exception (f"Format of { serial_file } is not supported." )
96
+
97
+ # new file or update
98
+ existing = False
99
+ try :
100
+ open (serial_file )
101
+ existing = True
102
+ except FileNotFoundError :
103
+ pass
62
104
63
105
# json
64
106
if extension == '.json' :
65
107
# dump if non-existing
66
108
if not existing :
67
- with open (dbname , 'w' ) as jsonfile :
68
- pdfs = entry # for return
69
- json .dump (pdfs , jsonfile , indent = 2 )
109
+ with open (serial_file , 'w' ) as jsonfile :
110
+ file_data = entry # for return
111
+ json .dump (file_data , jsonfile , indent = 2 )
70
112
71
113
# update if existing
72
114
else :
73
- with open (dbname , 'r' ) as json_read :
74
- pdfs = json .load (json_read )
75
- pdfs .update (entry )
76
- with open (dbname , 'w' ) as json_write :
115
+ with open (serial_file , 'r' ) as json_read :
116
+ file_data = json .load (json_read )
117
+ file_data .update (entry )
118
+ with open (serial_file , 'w' ) as json_write :
77
119
# dump to string first for formatting
78
- json .dump (pdfs , json_write , indent = 2 )
120
+ json .dump (file_data , json_write , indent = 2 )
79
121
80
- return pdfs
122
+ return file_data
81
123
82
124
83
- def load_from_db (filename ):
84
- """Load a dictionary from a database file.
125
+ def deserialize_data (filename ):
126
+ """Load a dictionary from a serial file.
85
127
86
128
filename -- database file to load from.
87
129
@@ -101,40 +143,7 @@ def load_from_db(filename):
101
143
return j_dict
102
144
103
145
104
- def markup_PDF (hddata : dict , rv : list , muname = None ):
105
- # FIXME: may be better suited for REST API package, not diffpy.utils
106
- """Put PDF file information into a dictionary.
107
-
108
- hddata -- Dictionary of metadata.
109
- rv -- List of (r, gr) pairs.
110
- muname -- file to save into (default None, no saving occurs).
111
-
112
- Returns the dictionary loaded from/into markup file.
113
- """
114
-
115
- # gather data
116
- data = {}
117
- data .update ({'r' : list (rv [:, 0 ]), 'gr' : list (rv [:, 1 ])})
118
- data .update (hddata )
119
-
120
- # return directly
121
- if muname is None :
122
- return data
123
-
124
- # save to disk when enabled
125
- extension = pathlib .Path (muname ).suffix
126
- if extension not in supported_formats :
127
- raise Exception (f"Format of { muname } is not supported." )
128
-
129
- # dumps into file, automatically overwrites
130
- if extension == '.json' :
131
- with open (muname , 'w' ) as json_write :
132
- json .dump (data , json_write , indent = 2 )
133
-
134
- return data
135
-
136
-
137
- def markup_oneline (filename ):
146
+ def serial_oneline (filename ):
138
147
"""Reformat lists in markup languages to take up only one line.
139
148
140
149
Works well when only lists are surrounded by square brackets and no other data is comma and newline separated.
0 commit comments