-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest.py
66 lines (60 loc) · 2.51 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# from deeppavlov import build_model
from build import *
import sys
from joblib import dump, load
# import dask.dataframe as dd
# import pymongo
# from sklearn.feature_extraction.text import TfidfVectorizer
# # from blaze import TableSymbol, compute
# # documents = TableSymbol('documents', '{title: string, description: string}')
# Open the `document` collection of the `document` database.
#
# The connection string is taken from the MONGODB_URI environment variable
# when it is set, so credentials do not have to live in the source tree.
# NOTE(review): the fallback literal below still embeds a username/password;
# rotate that password and drop the fallback once MONGODB_URI is configured.
import os

# The file-level `import pymongo` is commented out, so import it explicitly
# here instead of relying on `from build import *` to provide the name.
import pymongo

collection = pymongo.MongoClient(
    os.environ.get(
        'MONGODB_URI',
        'mongodb+srv://ir:[email protected]/document?retryWrites=true&w=majority',
    )
).document.document
# TfidfVectorizer().fit(x['title'] or "" + " " + x['description'] or "" + " " + " ".join(filter(None,x['content']['text'] or [])) for x in collection.find({}, projection={'title': True, 'description': True, "content.text": True}).limit(300))
# # model = build_model('models/squad_torch_bert.json', download=True)
# model("When was James born?","James was born in 1977.")
# model(["James was born in 1977."],["When was James born?"])
# print(list((x.get("title","") + " " + x.get('description',"")+ " " + " ".join(filter(None,x['content']['text'] or [])) for x in collection.find({}, projection={'title': True, 'description': True, "content.text": True}).limit(3))))
# get_data()
# print('hi')
# df = dd.read_parquet("models/documents")
# print(df)
# print("downloaded")
# # l = list(x for x in collection.find().skip(1).limit(1))
# # with open('text.txt', 'w') as f:
# # for item in l[0]['content']['text']:
# # f.write("%s\n" % item.encode('latin1').decode('utf-8'))
# model = load_tfidf_model()
# build_faiss(model, "tfidf")
# Requires the PyMongo package.
# https://api.mongodb.com/python/current
# client = MongoClient('mongodb+srv://ir:[email protected]/test?authSource=admin&replicaSet=atlas-5jw1an-shard-0&readPreference=primary&appname=MongoDB%20Compass&ssl=true')
# Build one "text" string per document on the server:
#   text = title + ' ' + description + ' ' + (' ' + each content.text entry)
#
# `$concat` yields null as soon as ANY operand is null or missing, and
# `$reduce` yields null when its input is null, so every nullable operand is
# wrapped in `$ifNull`. Without that, a document lacking a title, a
# description, or a content.text array (or holding a null entry in it) would
# come back as `text: null`.
result = collection.aggregate([
    # Paging goes first and in skip -> limit order. The $project stages are
    # 1:1, so the result is the same as paging afterwards, but the string
    # building only runs on the selected documents and a non-zero skip
    # behaves as expected.
    {'$skip': 0},
    {'$limit': 1},
    {
        '$project': {
            # Join all content.text entries into one space-separated string.
            'content': {
                '$reduce': {
                    'input': {'$ifNull': ['$content.text', []]},
                    'initialValue': '',
                    'in': {
                        '$concat': [
                            '$$value', ' ', {'$ifNull': ['$$this', '']},
                        ],
                    },
                },
            },
            'title': 1,
            'description': 1,
        },
    },
    {
        '$project': {
            'text': {
                '$concat': [
                    {'$ifNull': ['$title', '']}, ' ',
                    {'$ifNull': ['$description', '']}, ' ',
                    # Always a string here: the $reduce above never yields null.
                    '$content',
                ],
            },
        },
    },
])
# ids = load('models/ids.joblib')
# Materialize the cursor so the documents themselves are printed.
print(list(result))