data_diff.py
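"""Fetch records from a configured data source, compare them (via SHA-256
hashes of their string representations) against a locally archived JSON copy,
archive any new entries, and return only the entries not seen before."""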
import io
import json
from copy import deepcopy
from hashlib import sha256

import pandas as pd
import requests

from data_extractor import modify


def csv_data2df(csv_data):
    """Parse a CSV string into a DataFrame."""
    return pd.read_csv(io.StringIO(csv_data))


def df2list(df):
    """Convert a DataFrame into a list of row dicts, replacing NaN with None
    and tagging each row with its 1-based row number."""
    data = []
    columns = list(df.columns)
    for i in range(len(df)):
        new_dict = {}
        for col in columns:
            value = df[col].iloc[i]
            new_dict[col] = value if not pd.isna(value) else None
        new_dict['row_num'] = i + 1
        data.append(new_dict)
    return data


def get_data(data_source, filenames=None):
    """Fetch raw records for the given data source."""
    if data_source == "coronasafe":
        data = []
        for filename in filenames:
            r = requests.get("https://life-api.coronasafe.network/data/{}.json".format(filename))
            data += r.json()["data"]
    elif data_source == "nlp_supply":
        r = requests.get("https://docs.google.com/spreadsheets/d/1z8quqkyMubvIJDY6mRVugZeecgY__dH8I-ZmS0_FL14/export?format=csv&id=1z8quqkyMubvIJDY6mRVugZeecgY__dH8I-ZmS0_FL14&gid=0")
        data = df2list(csv_data2df(r.content.decode('utf-8')))
    elif data_source == "nlp_demand":
        r = requests.get("https://docs.google.com/spreadsheets/d/1TwD2-0vjloYurPHa86-PDd8rQSsVrRn7YtB90FXDgJw/export?format=csv&id=1TwD2-0vjloYurPHa86-PDd8rQSsVrRn7YtB90FXDgJw&gid=0")
        data = df2list(csv_data2df(r.content.decode('utf-8')))
    else:
        raise ValueError("Unknown data source: {}".format(data_source))
    return data


def hash_data(data):
    """Return the SHA-256 hex digest of each entry's string representation."""
    return [sha256(str(d).encode('utf-8')).hexdigest() for d in data]


def save_to_file(data, data_source):
    """Write the archived data to <data_source>_extracted.json."""
    with open("{}_extracted.json".format(data_source), "w") as extracted_file_obj:
        json.dump({"data": data}, extracted_file_obj, indent=4, default=str)


def get_extracted_data(data_source):
    """Read previously archived data; return an empty list on the first run,
    when no archive file exists yet."""
    try:
        with open("{}_extracted.json".format(data_source), "r") as extracted_file_obj:
            return json.load(extracted_file_obj)["data"]
    except FileNotFoundError:
        return []


def get_diff(data_source, filenames=None):
    """Return the entries fetched from the source that are not yet archived,
    and append them to the archive file."""
    new_data = []
    data = modify(get_data(data_source, filenames), data_source)
    data_copy = deepcopy(data)
    # Convert timestamp objects to strings so their hashes are comparable
    # with the hashes of the stringified entries read back from the archive.
    for entry in data_copy:
        if entry.get("last_verified_on"):
            entry['last_verified_on'] = str(entry['last_verified_on'])
        if entry.get("created_on"):
            entry['created_on'] = str(entry['created_on'])
    # Read the archived data: all entries already present in the database.
    extracted_data = get_extracted_data(data_source)
    # Hash the archived data (a set gives O(1) membership checks).
    hashed_data = set(hash_data(extracted_data))
    # Compare hashes to keep only entries not seen before.
    for d, orig in zip(data_copy, data):
        d_hashed = sha256(str(d).encode('utf-8')).hexdigest()
        if d_hashed not in hashed_data:
            new_data.append(orig)
    # Append the new entries to the archive and write it back.
    extracted_data += new_data
    save_to_file(extracted_data, data_source)
    return new_data
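
# Minimal usage sketch (an illustration, not part of the original script):
# pull the "nlp_supply" sheet and report how many entries are new since the
# last run. On the first run every entry is reported as new, because no
# archive file exists yet.
if __name__ == "__main__":
    new_entries = get_diff("nlp_supply")
    print("{} new entries found".format(len(new_entries)))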