Skip to content

Commit 0a1020a

Browse files
Added techcrunch scraper
1 parent 7bde6c8 commit 0a1020a

File tree

11 files changed

+66
-0
lines changed

11 files changed

+66
-0
lines changed
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

techcrunch-scraper/geckodriver.exe

5.77 MB
Binary file not shown.

techcrunch-scraper/techcrunch.pkl

40 KB
Binary file not shown.
+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#Techcrunch Scraper
2+
from bs4 import BeautifulSoup
3+
import urllib.request as req
4+
import pandas as pd
5+
from selenium import webdriver
6+
from selenium.webdriver.firefox.options import Options
7+
import pandas as pd
8+
import numpy as np
9+
import csv
10+
from six.moves import cPickle as pickle
11+
import numpy as np
12+
13+
14+
def displayPickleData(path_pickle):
    """Load the pickled object stored at *path_pickle* and print it.

    The file is opened in binary mode and unpickled with the
    ``iso-8859-15`` encoding. Nothing is returned; the loaded object is
    only written to standard output.

    Note: unpickling can execute arbitrary code, so only point this at
    pickle files produced by this script.
    """
    with open(path_pickle, 'rb') as pickle_file:
        contents = pickle.load(pickle_file, encoding='iso-8859-15')
    print(contents)
19+
20+
def _extract_article(news_item):
    """Return ``[title, summary, link]`` for one ``river-block`` element.

    The summary is ``None`` when the element has no child carrying the
    ``excerpt`` class. A missing ``post-title`` child is not handled here
    and propagates to the caller.
    """
    # Imported locally so this block does not need a new file-level import.
    from selenium.common.exceptions import NoSuchElementException

    title = news_item.find_element_by_class_name('post-title').text
    link = news_item.get_attribute("data-permalink")
    try:
        summary = news_item.find_element_by_class_name("excerpt").text
    except NoSuchElementException:
        # Only "no excerpt on this article" is an expected condition; any
        # other WebDriver failure should surface instead of being recorded
        # as an empty summary.
        summary = None
    return [title, summary, link]


def grabTechcrunch(num_pages=4):
    """Scrape TechCrunch listing pages and pickle the results.

    A headless Firefox session visits ``https://techcrunch.com/page/<n>``
    for ``n`` in ``1..num_pages``, collects the title, summary and
    permalink of every ``river-block`` element, prints each entry, stores
    them in a DataFrame with columns ``Title``, ``Summary`` and ``Link``,
    writes that frame to ``techcrunch.pkl`` and finally prints the pickle
    contents via ``displayPickleData``.

    Args:
        num_pages: Number of listing pages to visit, starting at page 1.
            Defaults to 4, which matches the previously hard-coded range.
            The original author's note says each page yields roughly 20
            article links.

    Returns:
        None. Any exception raised while scraping or saving is caught and
        reported on standard output as ``Error!<message>``; in that case
        no pickle file is written.
    """
    options = Options()
    options.add_argument("--headless")
    # NOTE(review): the `firefox_options` keyword is kept exactly as in the
    # original call; newer Selenium releases spell it `options` -- confirm
    # against the Selenium version this project pins.
    driver = webdriver.Firefox(firefox_options=options)
    print("Firefox Headless Browser Invoked")

    # Rows are accumulated in a plain list and turned into a DataFrame once,
    # instead of growing the frame one `.loc` assignment at a time.
    rows = []
    try:
        try:
            for page_num in range(1, num_pages + 1):
                driver.get('https://techcrunch.com/page/' + str(page_num))
                for news_item in driver.find_elements_by_class_name('river-block'):
                    data_entry = _extract_article(news_item)
                    rows.append(data_entry)
                    print(data_entry)
        finally:
            # Always end the browser session, on success and on failure, so
            # no headless Firefox / geckodriver process is left behind.
            driver.quit()
            print("Driver Closed")

        tc_df = pd.DataFrame(rows, columns=['Title', 'Summary', 'Link'])
        tc_df.to_pickle('techcrunch.pkl')
        displayPickleData('techcrunch.pkl')
    except Exception as e:
        # Best-effort script: report the failure rather than crash.
        print('Error!' + str(e))
54+
55+
def main():
    """Script entry point: run the TechCrunch scrape once."""
    # TODO: CSV export is not wired up yet; the disabled placeholder call was
    #   convert('techcrunch.pkl','techcrunch.csv')
    grabTechcrunch()
58+
59+
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
61+
62+
63+
'''TODO
1. Convert the pickled results to CSV properly (the convert() call in main() is currently disabled).
2. Let the user choose how many pages to scrape.
'''

0 commit comments

Comments
 (0)