#!/usr/bin/env python
# coding: utf-8

"""
Recommends related reading for a YouTube video: builds a Word2Vec model over
the video transcript (up to the current seek time) plus the Wikipedia article
for the video's subject, then returns a short Wikipedia blurb for each of the
terms most similar to the topic word.

@authors: jaydeep thik, Vasudev Purandare
"""

# In[189]:


import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi

import urllib.request
from gensim.models import Word2Vec
import wikipedia
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import bs4
import requests


def call_rec(sub, vid_id, seek_time):
    """Return Wikipedia blurbs for terms related to what has been watched so far."""
    print("SEEK_TIME: " + str(seek_time))
    seek_time = int(seek_time)
    topic = sub.split()[0].lower()
    #nltk.download('punkt')
    #nltk.download('averaged_perceptron_tagger')
    #nltk.download('stopwords')

    # collect the caption text of the video up to the seek time
    entries = YouTubeTranscriptApi.get_transcript(vid_id, languages=['en'])
    transcript = ''
    for entry in entries:
        if entry['start'] < seek_time:
            transcript = transcript + ' ' + entry['text']
        else:
            break
    print(transcript)

    # Wikipedia article for the video's subject
    p = wikipedia.page(sub)
    #print(p.url)
    #print(p.title)
    content = p.content

    # combine the article and transcript, dropping stop words and very short tokens
    stop_words = set(stopwords.words('english'))
    text = content + transcript
    text = ' '.join([word.lower() for word in text.split() if word.lower() not in stop_words and len(word) > 2])
    #print('the' in text.split())

    # split the cleaned text into sentences and tokenize each one into words
    data = []
    f = text.replace("\n", " ").replace(",", "").replace("(", "").replace(")", "").replace(";", "")

    for i in sent_tokenize(f):
        temp = []
        # tokenize the sentence into words
        for j in word_tokenize(i):
            if j.isalpha() and j.lower() not in stop_words:
                temp.append(j.lower())

        data.append(temp)

    #print('the' in data)
    # Create CBOW Word2Vec model over the combined text
    # (note: gensim >= 4.0 renamed `size` to `vector_size`)
    model1 = Word2Vec(data, min_count=1, size=100, window=10)

    model1.train(data, total_examples=model1.corpus_count, epochs=50)

    #print("the" in model1.wv.vocab)
    # words most similar to the topic word in the learned embedding space
    topic_relevant = []
    for t in model1.wv.most_similar(topic):
        topic_relevant.append(t[0])

    #print(topic_relevant)
    # for each related term, grab the first substantial paragraph of its Wikipedia page
    about_topics = ''
    for topics in topic_relevant:
        #print("***"+topics)
        response = requests.get("https://en.wikipedia.org/wiki/" + topics)

        about_topics += topics + ' :'

        if response.ok:
            html = bs4.BeautifulSoup(response.text, 'html.parser')
            paragraphs = html.select("p")
            #print(wikipedia.page(topics).content)
            for para in paragraphs:
                #print("##########################")
                #print(para.text)
                if len(para.text.split()) > 20:
                    about_topics = about_topics + para.text
                    break
            about_topics = about_topics + '\n'
            response.close()

    print(topic_relevant)
    return about_topics
    # (commented-out experiment: print transcript lines that mention a related term)
    """
    for i in range(len(entries)):
        for w in topic_relevant:
            if w in entries[i]['text'].lower():
                print(entries[i]['text'])
    """