
Commit a0dc0e2

Added indeed scraper
1 parent 26cb98b commit a0dc0e2

File tree

2 files changed: +4319 −0 lines changed


indeed-scraper/indeed-scraper.py

+92
@@ -0,0 +1,92 @@
''' Written By : Parvez Alam '''

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

max_results_per_city = 100

city_set = ['mumbai', 'bangalore', 'hyderabad', 'pune']

job_title_set = ['full+stack+developer', 'front+end+developer', 'back+end+developer',
                 'software+engineer', 'data+scientist', 'machine+learning+engineer',
                 'android+developer', 'ios+developer']

columns = ["job_title", "company_name", "summary", "city", "location", "salary", "date"]

sample_df = pd.DataFrame(columns=columns)

for job_title in job_title_set:
    for city in city_set:
        for start in range(0, max_results_per_city, 10):
            url = "http://www.indeed.co.in/jobs?q=%s&l=%s&start=%s" % (job_title, city, start)
            page = requests.get(url)
            print(url)
            time.sleep(1)  # pause between requests to avoid hammering the server
            soup = BeautifulSoup(page.text, "lxml")

            for div in soup.find_all(name="div", attrs={"class": "row"}):
                # row number used as the index of this job posting in the dataframe
                print(job_title + ' ' + city + ' ' + str(start))
                print(len(sample_df))
                num = len(sample_df) + 1

                # an empty list to hold the data for each posting
                job_post = []

                # grabbing job title
                try:
                    for a in div.find_all(name="a", attrs={"data-tn-element": "jobTitle"}):
                        job_post.append(a["title"])
                except:
                    job_post.append("Not Available")

                # grabbing company name
                try:
                    company = div.find_all(name="span", attrs={"class": "company"})
                    if len(company) > 0:
                        for b in company:
                            job_post.append(b.text.strip())
                    else:
                        sec_try = div.find_all(name="span", attrs={"class": "result-link-source"})
                        for span in sec_try:
                            job_post.append(span.text)
                except:
                    job_post.append("Not Available")

                # grabbing summary text
                try:
                    d = div.find_all('span', attrs={'class': 'summary'})
                    for span in d:
                        job_post.append(span.text.strip())
                except:
                    job_post.append("Not Available")

                # append city name
                job_post.append(city)

                # grabbing location name
                try:
                    c = div.find_all('span', attrs={'class': 'location'})
                    for span in c:
                        job_post.append(span.text)
                except:
                    job_post.append("Not Available")

                # grabbing salary
                try:
                    job_post.append(div.find(name="span", attrs={"class": "no-wrap"}).text)
                except:
                    job_post.append("Not Available")

                # grabbing posting date
                try:
                    job_post.append(div.find(name="span", attrs={"class": "date"}).text)
                except:
                    job_post.append("Not Available")

                # appending the job post info to the dataframe at index num;
                # only keep rows that yielded exactly one value per column
                if len(job_post) == len(columns):
                    sample_df.loc[num] = job_post

sample_df.to_csv("job_listing.csv", encoding="utf-8")
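
A quick sanity check after a run, as a minimal sketch assuming job_listing.csv was written to the current working directory by the script above:

import pandas as pd

# load the CSV produced by the scraper and inspect it
jobs = pd.read_csv("job_listing.csv", index_col=0, encoding="utf-8")
print(jobs.shape)                    # (number of postings, 7 columns)
print(jobs["city"].value_counts())   # postings collected per city
print(jobs.head())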
