# TechCrunch scraper: collects article titles, summaries, and links from the listing pages
from bs4 import BeautifulSoup
import urllib.request as req
import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException
import numpy as np
import csv
from six.moves import cPickle as pickle

def displayPickleData(path_pickle):
    # Load the pickled DataFrame back and print it as a quick sanity check
    with open(path_pickle, 'rb') as f:
        x = pickle.load(f, encoding='iso-8859-15')
    print(x)

def grabTechcrunch():
    options = Options()
    options.add_argument("--headless")
    # Selenium 3-style keyword argument; Selenium 4 renamed firefox_options= to options=
    driver = webdriver.Firefox(firefox_options=options)
    print("Firefox Headless Browser Invoked")
    tc_df = pd.DataFrame(columns=['Title', 'Summary', 'Link'])
    article_index = 0
    # Iterate through each page
    # Each page fetches approx 20 article links
    try:
        for page_num in range(1, 5):
            url = 'https://techcrunch.com/page/' + str(page_num)
            driver.get(url)
            news_items = driver.find_elements_by_class_name('river-block')
            for news_item in news_items:
                post_title = news_item.find_element_by_class_name('post-title')
                post_url = news_item.get_attribute("data-permalink")
                try:
                    data_summary = news_item.find_element_by_class_name("excerpt")
                    data_entry = [post_title.text, data_summary.text, post_url]
                except NoSuchElementException:
                    # Some posts have no excerpt; store None for the summary
                    data_entry = [post_title.text, None, post_url]
                tc_df.loc[article_index] = data_entry
                print(data_entry)
                article_index += 1
        driver.quit()
        print("Driver Closed")
        tc_df.to_pickle('techcrunch.pkl')
        displayPickleData('techcrunch.pkl')
    except Exception as e:
        driver.quit()
        print('Error! ' + str(e))

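# Hypothetical helper for the commented-out convert() call in main() and the
# "convert to csv properly" todo: a minimal sketch that loads the pickled DataFrame
# with pandas and writes it out as CSV. Dropping the index is an assumption.
def convert(path_pickle, path_csv):
    df = pd.read_pickle(path_pickle)   # load the DataFrame written by to_pickle()
    df.to_csv(path_csv, index=False)   # write the Title/Summary/Link columns as CSV
    print('Wrote ' + path_csv)
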
def main():
    grabTechcrunch()
    #convert('techcrunch.pkl', 'techcrunch.csv')

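# Hypothetical sketch for the "user input for number of pages" todo below: read the page
# count from the command line with argparse. The --pages flag, its default of 4, and any
# wiring into grabTechcrunch() are assumptions, not part of the original script.
def parsePageCount():
    import argparse  # local import keeps the sketch self-contained
    parser = argparse.ArgumentParser(description='Scrape TechCrunch article listings')
    parser.add_argument('--pages', type=int, default=4,
                        help='number of listing pages to fetch (roughly 20 articles per page)')
    return parser.parse_args().pages
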
if __name__ == '__main__':
    main()


'''TODO
1. convert to csv properly (see the convert() sketch above)
2. user input for number of pages (see the parsePageCount() sketch above)
'''