-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_data.py
82 lines (63 loc) · 2.03 KB
/
get_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from typing import List
import pymongo
from pymongo.database import Database
from pymongo import MongoClient
from pymongo.cursor import Cursor
from dotenv import load_dotenv
from nltk.tokenize import sent_tokenize # type: ignore
import os
import re
# Load variables from a .env file (python-dotenv) at import time so that
# later os.getenv(...) lookups in this module can see them.
load_dotenv()
def connect_to_db() -> Database:
    """Create a MongoDB client from ``MONGO_URI`` and return its default database.

    The connection string is read from the ``MONGO_URI`` environment variable
    after ``load_dotenv()`` has been called.

    Returns:
        The database returned by ``MongoClient.get_default_database()``.
    """
    load_dotenv()
    mongo_uri = os.getenv('MONGO_URI')
    client: MongoClient = pymongo.MongoClient(mongo_uri)
    return client.get_default_database()
def clean_readme(readme: str) -> str:
    """Blank out Markdown ATX header lines (``#`` to ``######``) in *readme*.

    A header line is optional leading spaces/tabs, one to six ``#`` characters,
    and then either nothing or spaces/tabs followed by the header text. Each
    matched line is replaced with an empty string; its line break is kept, so
    the surrounding lines are never merged or removed.

    Args:
        readme: Raw README text.

    Returns:
        The text with every header line emptied.
    """
    # Only spaces/tabs are allowed around the hashes. Using ``\s`` here would
    # also match newlines, which lets a bare "##" line swallow the following
    # line of real content and eats blank lines preceding a header.
    header_pattern = r'^[ \t]*#{1,6}(?:[ \t]+.*)?$'
    return re.sub(header_pattern, '', readme, flags=re.MULTILINE)
def split_readme_into_sentences(readme: str) -> List[str]:
    """Split a README into sentences, tokenizing each line separately.

    The text is first passed through ``clean_readme``, then split on ``'\\n'``;
    every resulting line is fed to ``sent_tokenize`` and the sentences are
    collected in order.

    Args:
        readme: Raw README text.

    Returns:
        A flat list of sentences in document order.
    """
    return [
        sentence
        for line in clean_readme(readme).split('\n')
        for sentence in sent_tokenize(line)
    ]
def get_mentors() -> Cursor:
    """Query the ``users`` collection for mentors that have a ``readme`` field.

    Returns:
        The cursor produced by ``find``; the projection requests only the
        ``_id``, ``readme`` and ``currentRole`` fields.
    """
    mentor_filter = {
        "roles": "mentor",
        "readme": {"$exists": True},
    }
    wanted_fields = {
        "_id": 1,
        "readme": 1,
        "currentRole": 1,
    }
    return connect_to_db().users.find(mentor_filter, wanted_fields)
def get_mentor_sentences() -> List[dict]:
    """Build one record per README sentence and per current role of each mentor.

    Every record is a dict with:

    * ``sentence`` - a README sentence, or the mentor's ``currentRole`` value;
    * ``id`` - ``"<_id>_<index>"`` for README sentences (index within that
      mentor's README) or ``"<_id>_currentRole"`` for the role record;
    * ``metadata`` - the mentor's ``_id`` (as stored) and the source ``field``
      (``'readme'`` or ``'currentRole'``).

    Returns:
        A flat list of records for all mentors yielded by ``get_mentors()``.
    """
    mentor_sentences: List[dict] = []
    for mentor in get_mentors():
        mentor_id = mentor['_id']
        # Stringify before building ids: ``_id`` may be a non-string value
        # (e.g. a bson ObjectId), and ``+`` with a str would raise TypeError.
        # For ids that are already strings the result is unchanged.
        id_prefix = str(mentor_id)

        # ``$exists: True`` also matches documents whose ``readme`` is null,
        # so fall back to '' instead of handing None to the text cleaner.
        readme = mentor.get('readme') or ''

        for index, sentence in enumerate(split_readme_into_sentences(readme)):
            mentor_sentences.append({
                'sentence': sentence,
                'id': f'{id_prefix}_{index}',
                'metadata': {
                    '_id': mentor_id,
                    'field': 'readme',
                },
            })

        current_role = mentor.get('currentRole')
        if current_role is not None:
            mentor_sentences.append({
                'sentence': current_role,
                'id': f'{id_prefix}_currentRole',
                'metadata': {
                    '_id': mentor_id,
                    'field': 'currentRole',
                },
            })
    return mentor_sentences
if __name__ == '__main__':
    # Script entry point: collect every mentor sentence and report the count.
    print('Found', len(get_mentor_sentences()), 'sentences')