-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtitanic.py
85 lines (54 loc) · 1.88 KB
/
titanic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# -*- coding: utf-8 -*-
"""titanic.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1aQVuC2_03D598qcnhCgEaPUTu_WeZN2-
"""
import pandas as pd
import zipfile
import os
# Location of the zip archive and of the folder it is unpacked into.
unzipp = '/content/titanic.zip'
extract_dir = '/content/titanic'

# Create the target folder first; exist_ok=True makes a re-run harmless.
os.makedirs(extract_dir, exist_ok=True)

# Unpack every member of the archive into the target folder.
with zipfile.ZipFile(unzipp, mode='r') as archive:
    archive.extractall(path=extract_dir)

# Notebook leftover: the directory listing is computed but not displayed
# when this file runs as a plain script.
os.listdir(extract_dir)
import pandas as pd
# Read both CSV files from the extraction folder in one pass.
train, test = [
    pd.read_csv(csv_path)
    for csv_path in ('/content/titanic/train.csv', '/content/titanic/test.csv')
]

# Keep the test-set ids aside: clean() drops the PassengerId column, but the
# ids are needed again when the submission frame is built.
test_ids = test["PassengerId"]

# Notebook leftover: preview of the first rows (not displayed in a script).
train.head(5)
def clean(train):
train = train.drop(["Ticket","Cabin", "Name", "PassengerId"], axis=1)
cols = ["SibSp","Age", "Fare","Parch"]
for col in cols:
train[col].fillna(train[col].median(), inplace=True)
# Fixed typo here: "Embarked" instead of "Embared"
train.Embarked.fillna("U", inplace=True)
return train
# Run the same cleaning step over both frames (training frame first).
train, test = clean(train), clean(test)

# Notebook leftover: preview of the cleaned rows (not displayed in a script).
train.head(3)
from sklearn import preprocessing
# One encoder object is reused for every column; after the loop it only
# remembers the mapping of the last column processed.
le = preprocessing.LabelEncoder()
columns = ["Sex", "Embarked"]

for col in columns:
    # Learn the label -> integer mapping on the training column, then apply
    # that same mapping to the test column so both frames share one coding.
    encoded_train = le.fit_transform(train[col])
    encoded_test = le.transform(test[col])
    train[col], test[col] = encoded_train, encoded_test
    # Show which original labels the integer codes stand for.
    print(le.classes_)

# Notebook leftover: preview of the encoded rows (not displayed in a script).
train.head(5)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Features are every column except the target; the target is "Survived".
X = train.drop("Survived", axis=1)
y = train["Survived"]

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
from sklearn.datasets import load_iris
# Fit a logistic-regression model on the training split. max_iter is set to
# 1000 (presumably so the solver converges on the unscaled features).
clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)

# Predict the held-out rows so the model can be scored on unseen data.
predictions = clf.predict(X_val)

from sklearn.metrics import accuracy_score

# Keep and report the score: as a bare expression the value was only shown
# inside a notebook and was silently discarded when run as a script.
val_accuracy = accuracy_score(y_val, predictions)
print(f"Validation accuracy: {val_accuracy:.4f}")
# Predict the target for every row of the cleaned, encoded test frame.
submission_preds = clf.predict(test)

# Pair each saved passenger id with its prediction.
submission_columns = {"PassengerId": test_ids.values, "Survived": submission_preds}
df = pd.DataFrame(submission_columns)

# index=False keeps the row index out of the file, leaving only the two columns.
df.to_csv("submission.csv", index=False)