-
Notifications
You must be signed in to change notification settings - Fork 5.7k
/
Copy pathexplore_enron_data.py
executable file
·88 lines (66 loc) · 4.33 KB
/
explore_enron_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/python3
"""
Starter code for exploring the Enron dataset (emails + finances);
loads up the dataset (pickled dict of dicts).
The dataset has the form:
enron_data["LASTNAME FIRSTNAME MIDDLEINITIAL"] = { features_dict }
{features_dict} is a dictionary of features associated with that person.
You should explore features_dict as part of the mini-project,
but here's an example to get you started:
enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000
"""
from tkinter.font import names
import joblib
enron_data = joblib.load(open("./final_project/final_project_dataset.pkl", "rb"))
# Print the first 5 items in the enron_data dictionary
for i, (key, value) in enumerate(enron_data.items()):
if i >= 5:
break
print(f"{key}: {value}\n")
# Print the number of data points (people) in the dataset
print(f"Number of data points: {len(enron_data)}")
# Print the number of features for each person in the dataset
print(f"Number of features: {len(list(enron_data.values())[0])}")
# Count the number of POIs in the dataset
print(f"Number of POIs: {sum([1 for person in enron_data.values() if person['poi']== 1])}") # == 1 or == True or only if person['poi'] without == 1
# form the list of POIs names in /final_project/poi_names.txt and print the number of POIs
poi_names = [name for name in open("./final_project/poi_names.txt").read().split("\n") if name not in ["", "http://usatoday30.usatoday.com/money/industries/energy/2005-12-28-enron-participants_x.htm"]]
print(f"Number of POIs: {len(poi_names)}")
print(poi_names)
# List of features for each person in the dataset
first_person_features = list(enron_data.values())[0]
# List of names in the dataset
names_enron_data = list(enron_data.keys())
# Print the total value of the stock belonging to James Prentice
stock_features = [feature for feature in first_person_features.keys() if 'stock' in feature.lower() and 'total' in feature.lower()]
james_prentice_name = [name for name in names_enron_data if 'james' in name.lower() and 'prentice' in name.lower()]
# print(f"Stock features: {stock_features}")
# print(f"Name of the person: {james_prentice_name}")
# Print the total value of the stock belonging to James Prentice
print(f"Total stock value of James Prentice: {enron_data[james_prentice_name[0]][stock_features[0]]}")
# Print the total value of the stock belonging to Wesley Colwell
wesley_colwell_name = [name for name in names_enron_data if 'wesley' in name.lower() and 'colwell' in name.lower()]
# print(f"Name of the person: {wesley_colwell_name}")
# Print Number of emails sent from Wesley Colwell to POIs
print(f"Total Number of emails of Wesley Colwell: {enron_data[wesley_colwell_name[0]]['from_this_person_to_poi']}")
# Print the value of stock options belonging to Jeffrey K Skillin
jeffrey_skillin_name = [name for name in names_enron_data if 'jeffrey' in name.lower() and 'skillin' in name.lower()]
# print(f"Name of the person: {wesley_colwell_name}")
# Print the value of stock options belonging to Jeffrey K Skillin
print(f"The value of stock options belonging to Jeffrey K Skillin: {enron_data[jeffrey_skillin_name[0]]['exercised_stock_options']}")
# Print the value of total payments to Lay, Skilling and Fastow
# Lay_Skilling_Fastow_names = [name for name in names_enron_data if 'jeffrey' in name.lower() or 'lay' in name.lower() or 'fastow' in name.lower()]
Lay_Skilling_Fastow_names = ['LAY KENNETH L', 'FASTOW ANDREW S', 'SKILLING JEFFREY K']
print(f"Name of the person: {Lay_Skilling_Fastow_names}")
total_payments = {name: enron_data[name]['total_payments'] for name in Lay_Skilling_Fastow_names}
# Print the name of the person with the highest total payments
max_total_payments = max(total_payments, key=total_payments.get)
print(f"Name of the person with the highest total payments: {max_total_payments}")
# Print the number of people with a quantified salary
people_with_quantified_salary = [person for person in enron_data.values() if person['salary'] != 'NaN']
people_with_known_emails = [person for person in enron_data.values() if person['email_address'] != 'NaN']
print(f'number of persons with known salarry:', len(people_with_quantified_salary))
print(f'number of persons with known emails:', len(people_with_known_emails))
sys.path.append("./tools/")
from feature_format import featureFormat
enron_data_array = featureFormat(enron_data, first_person_features)