-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsolution.py
69 lines (54 loc) · 2.51 KB
/
solution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import random
# Simulated feature extraction from images (you can replace this with actual image processing)
def extract_image_features(image_link):
# In a real scenario, you would download the image and extract features using a CNN
return [random.random() for _ in range(10)] # Dummy feature vector
# Function to prepare the training dataset
def prepare_data(train_df):
X = []
y = []
for _, row in train_df.iterrows():
image_features = extract_image_features(row['image_link'])
X.append(image_features + [row['group_id'], row['entity_name']])
y.append(row['entity_value'].split()[0]) # Extract the numerical value from the entity_value
return X, y
# Model training
def train_model(X, y):
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)
return model
# Prediction function
def predictor(model, image_link, group_id, entity_name):
image_features = extract_image_features(image_link)
input_data = image_features + [group_id, entity_name]
# Make a prediction using the trained model
prediction = model.predict([input_data])[0]
# Format the prediction (this part can be improved by handling unit prediction better)
unit = "inch" # Assuming we predict a unit like "inch" for now (improve this for real-world cases)
return f"{prediction:.2f} {unit}"
# Main function to generate predictions for test data
def generate_predictions(model, test_df):
test_df['prediction'] = test_df.apply(
lambda row: predictor(model, row['image_link'], row['group_id'], row['entity_name']), axis=1)
return test_df
if __name__ == "__main__":
# Load the dataset
DATASET_FOLDER = '../dataset/'
train_df = pd.read_csv(os.path.join(DATASET_FOLDER, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))
# Prepare training data
X, y = prepare_data(train_df)
# Train the model
model = train_model(X, y)
# Generate predictions for the test set
test_df = generate_predictions(model, test_df)
# Save the predictions to a CSV file
output_filename = os.path.join(DATASET_FOLDER, 'test_out.csv')
test_df[['index', 'prediction']].to_csv(output_filename, index=False)
print(f"Predictions saved to {output_filename}")