|
''' Written By : Parvez Alam '''

import time

import bs4  # noqa: F401 -- kept from the original file; BeautifulSoup below is what is used
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Indeed paginates results 10 at a time; fetch at most this many per (title, city).
max_results_per_city = 100

city_set = ['mumbai', 'bangalore', 'hyderabad', 'pune']

# Query strings are pre-encoded for the URL (spaces as '+').
job_title_set = [
    'full+stack+developer',
    'front+end+developer',
    'back+end+developer',
    'software+engineer',
    'data+scientist',
    'machine+learning+engineer',
    'android+developer',
    'ios+developer',
]

# Column order must match the row layout produced by _parse_posting().
columns = ["job_title", "company_name", "summary", "city", "location", "salary", "date"]

NOT_AVAILABLE = "Not Available"


def _first_text(div, tag_name, attrs, default=NOT_AVAILABLE):
    """Return the stripped text of the first matching descendant tag, or `default`.

    Using find() (first match only) instead of find_all() guarantees exactly
    one value per field, so every row has exactly len(columns) entries.
    The original append-per-match loops could emit 0 or 2+ values for a
    field, producing a short/long row that crashed the DataFrame assignment.
    """
    tag = div.find(tag_name, attrs=attrs)
    if tag is None:
        return default
    return tag.get_text(strip=True)


def _parse_posting(div, city):
    """Extract one job posting from a result <div> as a list aligned with `columns`."""
    title_tag = div.find("a", attrs={"data-tn-element": "jobTitle"})
    title = title_tag.get("title", NOT_AVAILABLE) if title_tag is not None else NOT_AVAILABLE

    # Company name usually lives in span.company; sponsored results fall back
    # to span.result-link-source (same fallback chain as the original code).
    company = _first_text(div, "span", {"class": "company"}, default=None)
    if company is None:
        company = _first_text(div, "span", {"class": "result-link-source"})

    return [
        title,
        company,
        _first_text(div, "span", {"class": "summary"}),
        city,
        _first_text(div, "span", {"class": "location"}),
        _first_text(div, "span", {"class": "no-wrap"}),  # salary span on Indeed
        _first_text(div, "span", {"class": "date"}),     # posting date (was mislabeled "salary")
    ]


# Accumulate rows in a plain list and build the DataFrame once at the end:
# appending via sample_df.loc[...] inside the loop re-copies the frame (O(n^2)).
rows = []
for job_title in job_title_set:
    for city in city_set:
        for start in range(0, max_results_per_city, 10):
            url = "http://www.indeed.co.in/jobs?q=%s&l=%s&start=%s" % (job_title, city, start)
            print(url)
            try:
                page = requests.get(url, timeout=30)
                page.raise_for_status()
            except requests.RequestException as exc:
                # Skip this page on a network/HTTP failure instead of crashing the run.
                print("skipping %s: %s" % (url, exc))
                continue
            time.sleep(1)  # be polite: throttle requests to the server
            # page.text is already unicode, so from_encoding is unnecessary
            # (passing it alongside a str input makes bs4 warn and ignore it).
            soup = BeautifulSoup(page.text, "lxml")
            for div in soup.find_all("div", attrs={"class": "row"}):
                print(job_title + ' ' + city + ' ' + str(start))
                print(len(rows))
                rows.append(_parse_posting(div, city))

sample_df = pd.DataFrame(rows, columns=columns)
sample_df.to_csv("job_listing.csv", encoding="utf-8")
0 commit comments