-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_parser.py
59 lines (54 loc) · 2.41 KB
/
data_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import pandas as pd
import string
import ast
import re
import unidecode
from nltk.stem import WordNetLemmatizer
import config
def data_parser(ingreds):
    """
    Reduce a recipe's raw ingredient lines to their core ingredient words.

    Each ingredient line is split on spaces and hyphens, stripped of
    punctuation, filtered to purely alphabetic tokens, lowercased,
    de-accented and lemmatized. Tokens listed in ``config.MEASURES`` or
    ``config.REMOVABLE_WORDS`` are then discarded.

    Args:
        ingreds: Either a list of ingredient strings, or a string holding the
            Python literal form of such a list (parsed with
            ``ast.literal_eval``).

    Returns:
        A single string: the surviving words of every ingredient line,
        joined by single spaces. Lines that end up empty contribute nothing.

    Raises:
        ValueError, SyntaxError: If ``ingreds`` is not a list and is not a
            valid Python literal.

    Example (illustrative; the exact words removed depend on the contents of
    ``config.MEASURES`` and ``config.REMOVABLE_WORDS``):
        input = '['1 x 1.6kg whole duck', '2 heaped teaspoons Chinese five-spice powder', '1 clementine',
                  '6 fresh bay leaves', 'GRAVY', '', '1 bulb of garlic', '2 carrots', '2 red onions',
                  '3 tablespoons plain flour', '100 ml Marsala', '1 litre organic chicken stock']'
        output = 'duck chinese five spice powder clementine fresh bay leaf gravy garlic carrot red onion plain flour marsala organic chicken stock'
    """
    if isinstance(ingreds, list):
        ingredients = ingreds
    else:
        # literal_eval only accepts Python literals, so the string is parsed
        # without executing arbitrary code.
        ingredients = ast.literal_eval(ingreds)

    # Translation table that deletes every ASCII punctuation character.
    translator = str.maketrans('', '', string.punctuation)
    lemmatizer = WordNetLemmatizer()

    ingred_list = []
    for ingredient in ingredients:
        # Split on hyphens as well as spaces. This must happen BEFORE the
        # punctuation is stripped, because '-' is itself punctuation and
        # removing it first would glue "five-spice" into "fivespice".
        items = re.split(r' |-', ingredient)
        # Strip punctuation from each token. str.translate returns a new
        # string, so the result has to be kept (e.g. "onions," -> "onions").
        items = [word.translate(translator) for word in items]
        # Get rid of words containing non-alphabet characters (quantities
        # such as "100" or "16kg", and the empty strings left by the split).
        items = [word for word in items if word.isalpha()]
        # Turn everything to lowercase.
        items = [word.lower() for word in items]
        # Remove accents.
        items = [unidecode.unidecode(word) for word in items]
        # Lemmatize words so we can compare words to measuring words.
        items = [lemmatizer.lemmatize(word) for word in items]
        # Get rid of measuring words/phrases, e.g. heaped teaspoon.
        items = [word for word in items if word not in config.MEASURES]
        # Get rid of common easy words.
        items = [word for word in items if word not in config.REMOVABLE_WORDS]
        if items:
            ingred_list.append(' '.join(items))

    # Callers receive one flat, space-separated string for the whole recipe.
    return " ".join(ingred_list)
if __name__ == "__main__":
    # Script entry point: read the raw recipes CSV, add a parsed-ingredients
    # column, tidy the recipe names and write the result to config.PARSED_PATH.
    recipe_df = pd.read_csv(config.RECIPES_PATH)
    recipe_df['ingredients_parsed'] = recipe_df['ingredients'].apply(data_parser)

    # Keep only the columns of interest, THEN drop incomplete rows. Calling
    # dropna() on recipe_df instead would silently discard the column
    # selection. The explicit copy makes df an independent frame so the
    # assignment below is unambiguous.
    df = recipe_df[['recipe_name', 'ingredients_parsed', 'ingredients', 'recipe_urls']]
    df = df.dropna().copy()

    # Trim the site suffix from scraped page titles.
    suffix = 'Recipe - Allrecipes.com'
    m = df['recipe_name'].str.endswith(suffix)
    # Single .loc[row_mask, column] assignment: the chained form
    # df['recipe_name'].loc[m] = ... may write to a temporary copy and leave
    # df unchanged.
    df.loc[m, 'recipe_name'] = df.loc[m, 'recipe_name'].str[:-len(suffix)]

    df.to_csv(config.PARSED_PATH, index=False)