
Commit 99687f6

Topic recommendation script
1 parent a2fc658 commit 99687f6

File tree

1 file changed: +107 -0 lines changed


Rec.py

Lines changed: 107 additions & 0 deletions
@@ -0,0 +1,107 @@
#!/usr/bin/env python
# coding: utf-8

"""
@authors: jaydeep thik, Vasudev Purandare
"""

# In[189]:


from youtube_transcript_api import YouTubeTranscriptApi
from gensim.models import Word2Vec
import wikipedia
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import bs4
import requests


def call_rec(sub, vid_id, seek_time):
    print("SEEK_TIME:" + str(seek_time))
    seek_time = int(seek_time)
    topic = sub.split()[0].lower()
    # nltk.download('punkt')
    # nltk.download('averaged_perceptron_tagger')
    # nltk.download('stopwords')

    # collect the transcript text up to the current seek position
    entries = YouTubeTranscriptApi.get_transcript(vid_id, languages=['en'])
    transcript = ''
    for entry in entries:
        if entry['start'] < seek_time:
            transcript = transcript + ' ' + entry['text']
        else:
            break
    print(transcript)

    # pull the Wikipedia article for the subject and combine it with the transcript
    p = wikipedia.page(sub)
    # print(p.url)
    # print(p.title)
    content = p.content

    stop_words = set(stopwords.words('english'))
    text = content + transcript
    text = ' '.join([word.lower() for word in text.split()
                     if word.lower() not in stop_words and len(word) > 2])
    # print('the' in text.split())

    # iterate through each sentence and build the tokenized training corpus
    data = []
    f = text.replace("\n", " ").replace(",", "").replace("(", "").replace(")", "").replace(";", "")

    for sentence in sent_tokenize(f):
        temp = []
        # tokenize the sentence into words, keeping only alphabetic non-stopword tokens
        for token in word_tokenize(sentence):
            if token.isalpha() and token.lower() not in stop_words:
                temp.append(token.lower())
        data.append(temp)

    # print('the' in data)
    # Create CBOW model (gensim < 4.0 API; in gensim >= 4.0 `size` is `vector_size`)
    model1 = Word2Vec(data, min_count=1, size=100, window=10)
    model1.train(data, total_examples=len(data), epochs=50)

    # print("the" in model1.wv.vocab)
    # words most similar to the topic word according to the trained embeddings
    topic_relevant = []
    for t in model1.wv.most_similar(topic):
        topic_relevant.append(t[0])

    # print(topic_relevant)
    # fetch a short description of each related word from its Wikipedia page
    about_topics = ''
    for topics in topic_relevant:
        # print("***" + topics)
        response = requests.get("https://en.wikipedia.org/wiki/" + topics)

        about_topics += topics + ' :'

        if response.ok:
            html = bs4.BeautifulSoup(response.text, 'html.parser')
            paragraphs = html.select("p")
            # print(wikipedia.page(topics).content)
            # keep the first reasonably long paragraph as the description
            for para in paragraphs:
                # print("##########################")
                # print(para.text)
                if len(para.text.split()) > 20:
                    about_topics = about_topics + para.text
                    break
        about_topics = about_topics + '\n'
        response.close()

    print(topic_relevant)
    return about_topics


"""
for i in range(len(entries)):
    for w in topic_relevant:
        if w in entries[i]['text'].lower():
            print(entries[i]['text'])
"""
