forked from pclubiitk/model-zoo
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdataloader.py
43 lines (30 loc) · 1.15 KB
/
dataloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# -*- coding: utf-8 -*-
"""dataloader.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/10JjHEWNTDClqlbw5d1vzctN1Zsewo-dX
"""
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
# Fetch the NLTK 'stopwords' corpus. This call is a module-level side effect:
# it runs every time this module is imported.
nltk.download('stopwords')
# Dataset setup (not performed by this module):
#   1. Download the dataset with the Kaggle CLI. The leading "!" is notebook
#      shell syntax; drop it when running from a terminal:
#        !kaggle datasets download -d harmanpreet93/hotelreviews
#   2. Unzip the archive into a folder named "hotelreviews". The loader below
#      reads ~/hotelreviews/hotel-reviews.csv.
# Compiled once and shared by every call. A token is a maximal run of word
# characters (letters, digits, underscore) -- the same split that nltk's
# RegexpTokenizer(r'\w+') produces, without building a tokenizer per line.
_WORD_PATTERN = re.compile(r"\w+")


def tokenizeData(indv_lines):
    """Split each line of text into a list of lower-cased word tokens.

    Punctuation and whitespace act as separators and are discarded. Stop
    words are not removed.

    Args:
        indv_lines: Iterable of strings, one document (e.g. one review) per
            item.

    Returns:
        A list with one entry per input line, in input order. Each entry is
        the list of that line's word tokens, lower-cased. A line containing
        no word characters yields an empty list.

    Raises:
        TypeError: If an item is not a string (for example a float NaN that
            pandas produces for an empty CSV cell).
    """
    return [
        [token.lower() for token in _WORD_PATTERN.findall(line)]
        for line in indv_lines
    ]
def tokenized_dataLoader(
    csv_path='~/hotelreviews/hotel-reviews.csv',
    max_reviews=100,
):
    """Load hotel review descriptions from a CSV file and tokenize them.

    Args:
        csv_path: Location of the reviews CSV. The file must contain a
            ``Description`` column. Defaults to
            ``~/hotelreviews/hotel-reviews.csv``.
        max_reviews: Number of leading rows to load, or ``None`` to load the
            whole file. Increase it as far as available RAM allows.

    Returns:
        The result of ``tokenizeData`` applied to the selected descriptions:
        one list of tokens per loaded row, in file order.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
        KeyError: If the CSV has no ``Description`` column.
    """
    # nrows makes pandas stop parsing after the rows that will actually be
    # used, instead of loading the entire file and discarding the rest.
    reviews = pd.read_csv(csv_path, nrows=max_reviews)
    return tokenizeData(reviews['Description'].tolist())