14
14
15
15
from delphi_utils .archive import ArchiveDiffer , GitArchiveDiffer , S3ArchiveDiffer ,\
16
16
archiver_from_params
17
+ from delphi_utils .nancodes import Nans
17
18
18
# Column dtypes handed to ``pd.read_csv(..., dtype=CSV_DTYPES)`` whenever the
# tests read an exported CSV (or ``.diff`` file) back in for comparison.
# The value columns are floats; the three ``missing_*`` columns are read as
# ints (presumably holding ``Nans`` codes -- confirm against
# delphi_utils.nancodes).
CSV_DTYPES = {
    "geo_id": str,
    "val": float,
    "se": float,
    "sample_size": float,
    "missing_val": int,
    "missing_se": int,
    "missing_sample_size": int,
}
19
23
20
24
# Fixture frames keyed by CSV base name, representing the "before" state that
# the tests diff against (presumably written into the cache directory by the
# test setup -- confirm against the test bodies).
# Column order is significant: the frames are compared with
# ``assert_frame_equal`` after a CSV round trip.
CSVS_BEFORE = {
    # Common
    "csv0": pd.DataFrame({
        "geo_id": ["1", "2", "3"],
        "val": [1.000000001, 2.00000002, 3.00000003],
        "se": [0.1, 0.2, 0.3],
        "sample_size": [10.0, 20.0, 30.0],
        "missing_val": [Nans.NOT_MISSING] * 3,
        "missing_se": [Nans.NOT_MISSING] * 3,
        "missing_sample_size": [Nans.NOT_MISSING] * 3,
    }),

    "csv1": pd.DataFrame({
        "geo_id": ["1", "2", "3"],
        "val": [1.0, 2.0, 3.0],
        "se": [np.nan, 0.20000002, 0.30000003],
        "sample_size": [10.0, 20.0, 30.0],
        "missing_val": [Nans.NOT_MISSING] * 3,
        "missing_se": [Nans.NOT_MISSING] * 3,
        "missing_sample_size": [Nans.NOT_MISSING] * 3,
    }),

    # Deleted
    "csv2": pd.DataFrame({
        "geo_id": ["1"],
        "val": [1.0],
        "se": [0.1],
        "sample_size": [10.0],
        "missing_val": [Nans.NOT_MISSING],
        "missing_se": [Nans.NOT_MISSING],
        "missing_sample_size": [Nans.NOT_MISSING],
    }),

    # Common, but updated with missing columns
    # (this "before" frame deliberately has no ``missing_*`` columns)
    "csv4": pd.DataFrame({
        "geo_id": ["1"],
        "val": [1.0],
        "se": [0.1],
        "sample_size": [10.0],
    }),

    # Common, but missing columns removed
    # (this "before" frame still carries the ``missing_*`` columns)
    "csv5": pd.DataFrame({
        "geo_id": ["1"],
        "val": [1.0],
        "se": [0.1],
        "sample_size": [10.0],
        "missing_val": [Nans.NOT_MISSING],
        "missing_se": [Nans.NOT_MISSING],
        "missing_sample_size": [Nans.NOT_MISSING],
    }),
}
41
76
42
77
CSVS_AFTER = {
45
80
"geo_id" : ["1" , "2" , "3" ],
46
81
"val" : [1.0 , 2.0 , 3.0 ],
47
82
"se" : [0.10000001 , 0.20000002 , 0.30000003 ],
48
- "sample_size" : [10.0 , 20.0 , 30.0 ]}),
83
+ "sample_size" : [10.0 , 20.0 , 30.0 ],
84
+ "missing_val" : [Nans .NOT_MISSING ] * 3 ,
85
+ "missing_se" : [Nans .NOT_MISSING ] * 3 ,
86
+ "missing_sample_size" : [Nans .NOT_MISSING ] * 3 ,
87
+ }),
49
88
50
89
"csv1" : pd .DataFrame ({
51
90
"geo_id" : ["1" , "2" , "4" ],
52
91
"val" : [1.0 , 2.1 , 4.0 ],
53
92
"se" : [np .nan , 0.21 , np .nan ],
54
- "sample_size" : [10.0 , 21.0 , 40.0 ]}),
93
+ "sample_size" : [10.0 , 21.0 , 40.0 ],
94
+ "missing_val" : [Nans .NOT_MISSING ] * 3 ,
95
+ "missing_se" : [Nans .NOT_MISSING ] * 3 ,
96
+ "missing_sample_size" : [Nans .NOT_MISSING ] * 3 ,
97
+ }),
55
98
56
99
# Added
57
100
"csv3" : pd .DataFrame ({
58
101
"geo_id" : ["2" ],
59
102
"val" : [2.0000002 ],
60
103
"se" : [0.2 ],
61
- "sample_size" : [20.0 ]}),
104
+ "sample_size" : [20.0 ],
105
+ "missing_val" : [Nans .NOT_MISSING ],
106
+ "missing_se" : [Nans .NOT_MISSING ],
107
+ "missing_sample_size" : [Nans .NOT_MISSING ],
108
+ }),
109
+
110
+ # Common, but updated with missing columns
111
+ "csv4" : pd .DataFrame ({
112
+ "geo_id" : ["1" ],
113
+ "val" : [1.0 ],
114
+ "se" : [0.1 ],
115
+ "sample_size" : [10.0 ],
116
+ "missing_val" : [Nans .NOT_MISSING ],
117
+ "missing_se" : [Nans .NOT_MISSING ],
118
+ "missing_sample_size" : [Nans .NOT_MISSING ],
119
+ }),
120
+
121
+ # Common, but missing columns removed
122
+ "csv5" : pd .DataFrame ({
123
+ "geo_id" : ["1" ],
124
+ "val" : [1.0 ],
125
+ "se" : [0.1 ],
126
+ "sample_size" : [10.0 ]
127
+ }),
62
128
}
63
129
64
-
65
130
class TestArchiveDiffer :
66
131
67
132
def test_stubs (self ):
@@ -80,10 +145,14 @@ def test_diff_and_filter_exports(self, tmp_path):
80
145
mkdir (export_dir )
81
146
82
147
csv1_diff = pd .DataFrame ({
83
- "geo_id" : ["2" , "4" ],
84
- "val" : [2.1 , 4.0 ],
85
- "se" : [0.21 , np .nan ],
86
- "sample_size" : [21.0 , 40.0 ]})
148
+ "geo_id" : ["3" , "2" , "4" ],
149
+ "val" : [np .nan , 2.1 , 4.0 ],
150
+ "se" : [np .nan , 0.21 , np .nan ],
151
+ "sample_size" : [np .nan , 21.0 , 40.0 ],
152
+ "missing_val" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
153
+ "missing_se" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
154
+ "missing_sample_size" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
155
+ })
87
156
88
157
arch_diff = ArchiveDiffer (cache_dir , export_dir )
89
158
@@ -106,15 +175,18 @@ def test_diff_and_filter_exports(self, tmp_path):
106
175
# Check return values
107
176
assert set (deleted_files ) == {join (cache_dir , "csv2.csv" )}
108
177
assert set (common_diffs .keys ()) == {
109
- join (export_dir , f ) for f in ["csv0.csv" , "csv1.csv" ]}
178
+ join (export_dir , f ) for f in ["csv0.csv" , "csv1.csv" , "csv4.csv" , "csv5.csv" ]}
110
179
assert set (new_files ) == {join (export_dir , "csv3.csv" )}
111
180
assert common_diffs [join (export_dir , "csv0.csv" )] is None
112
181
assert common_diffs [join (export_dir , "csv1.csv" )] == join (
113
182
export_dir , "csv1.csv.diff" )
114
183
115
184
# Check filesystem for actual files
116
185
assert set (listdir (export_dir )) == {
117
- "csv0.csv" , "csv1.csv" , "csv1.csv.diff" , "csv3.csv" }
186
+ "csv0.csv" , "csv1.csv" , "csv1.csv.diff" ,
187
+ "csv3.csv" , "csv4.csv" , "csv4.csv.diff" ,
188
+ "csv5.csv" , "csv5.csv.diff"
189
+ }
118
190
assert_frame_equal (
119
191
pd .read_csv (join (export_dir , "csv1.csv.diff" ), dtype = CSV_DTYPES ),
120
192
csv1_diff )
@@ -132,7 +204,7 @@ def test_diff_and_filter_exports(self, tmp_path):
132
204
arch_diff .filter_exports (common_diffs )
133
205
134
206
# Check exports directory just has incremental changes
135
- assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" }
207
+ assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" , "csv4.csv" , "csv5.csv" }
136
208
assert_frame_equal (
137
209
pd .read_csv (join (export_dir , "csv1.csv" ), dtype = CSV_DTYPES ),
138
210
csv1_diff )
@@ -259,12 +331,16 @@ def test_run(self, tmp_path, s3_client):
259
331
assert_frame_equal (pd .read_csv (body , dtype = CSV_DTYPES ), df )
260
332
261
333
# Check exports directory just has incremental changes
262
- assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" }
334
+ assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" , "csv4.csv" , "csv5.csv" }
263
335
csv1_diff = pd .DataFrame ({
264
- "geo_id" : ["2" , "4" ],
265
- "val" : [2.1 , 4.0 ],
266
- "se" : [0.21 , np .nan ],
267
- "sample_size" : [21.0 , 40.0 ]})
336
+ "geo_id" : ["3" , "2" , "4" ],
337
+ "val" : [np .nan , 2.1 , 4.0 ],
338
+ "se" : [np .nan , 0.21 , np .nan ],
339
+ "sample_size" : [np .nan , 21.0 , 40.0 ],
340
+ "missing_val" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
341
+ "missing_se" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
342
+ "missing_sample_size" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
343
+ })
268
344
assert_frame_equal (
269
345
pd .read_csv (join (export_dir , "csv1.csv" ), dtype = CSV_DTYPES ),
270
346
csv1_diff )
@@ -346,7 +422,11 @@ def test_diff_exports(self, tmp_path):
346
422
"geo_id" : ["1" , "2" , "3" ],
347
423
"val" : [1.0 , 2.0 , 3.0 ],
348
424
"se" : [0.1 , 0.2 , 0.3 ],
349
- "sample_size" : [10.0 , 20.0 , 30.0 ]})
425
+ "sample_size" : [10.0 , 20.0 , 30.0 ],
426
+ "missing_val" : [Nans .NOT_MISSING ] * 3 ,
427
+ "missing_se" : [Nans .NOT_MISSING ] * 3 ,
428
+ "missing_sample_size" : [Nans .NOT_MISSING ] * 3 ,
429
+ })
350
430
351
431
# Write exact same CSV into cache and export, so no diffs expected
352
432
csv1 .to_csv (join (cache_dir , "csv1.csv" ), index = False )
@@ -383,7 +463,11 @@ def test_archive_exports(self, tmp_path):
383
463
"geo_id" : ["1" , "2" , "3" ],
384
464
"val" : [1.0 , 2.0 , 3.0 ],
385
465
"se" : [0.1 , 0.2 , 0.3 ],
386
- "sample_size" : [10.0 , 20.0 , 30.0 ]})
466
+ "sample_size" : [10.0 , 20.0 , 30.0 ],
467
+ "missing_val" : [Nans .NOT_MISSING ] * 3 ,
468
+ "missing_se" : [Nans .NOT_MISSING ] * 3 ,
469
+ "missing_sample_size" : [Nans .NOT_MISSING ] * 3 ,
470
+ })
387
471
388
472
# csv1.csv is now a dirty edit in the repo, and to be exported too
389
473
csv1 .to_csv (join (cache_dir , "csv1.csv" ), index = False )
@@ -460,12 +544,16 @@ def test_run(self, tmp_path):
460
544
original_branch .checkout ()
461
545
462
546
# Check exports directory just has incremental changes
463
- assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" }
547
+ assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" , "csv4.csv" , "csv5.csv" }
464
548
csv1_diff = pd .DataFrame ({
465
- "geo_id" : ["2" , "4" ],
466
- "val" : [2.1 , 4.0 ],
467
- "se" : [0.21 , np .nan ],
468
- "sample_size" : [21.0 , 40.0 ]})
549
+ "geo_id" : ["3" , "2" , "4" ],
550
+ "val" : [np .nan , 2.1 , 4.0 ],
551
+ "se" : [np .nan , 0.21 , np .nan ],
552
+ "sample_size" : [np .nan , 21.0 , 40.0 ],
553
+ "missing_val" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
554
+ "missing_se" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
555
+ "missing_sample_size" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
556
+ })
469
557
assert_frame_equal (
470
558
pd .read_csv (join (export_dir , "csv1.csv" ), dtype = CSV_DTYPES ),
471
559
csv1_diff )
0 commit comments