Indeed Job Scrape
This is a Scrapy spider which I wrote for the website Indeed to pull out the full job description, job title, job location, company, salary and the number of days the job advertisement has been posted. Using this code and the URLs, you can simply run it from the terminal.
import scrapy
from scrapy import Request
class JobsSpider(scrapy.Spider):
    """Crawl Indeed search-result pages and scrape every job posting found.

    The crawl starts from the search URLs in ``start_urls``. ``parse``
    handles a search-result page: it schedules one request per job link
    and one request for the last link of the pagination bar.
    ``parse_page`` handles a single job posting and yields one dict of
    scraped fields.
    """

    name = 'jobs'
    allowed_domains = ['indeed.com']
    start_urls = [
        'https://au.indeed.com/jobs?q=data+analyst&l=sydney&radius=15',
        'https://au.indeed.com/jobs?q=data+engineer&l=sydney&radius=15',
        'https://au.indeed.com/jobs?q=data+scientist&l=sydney',
    ]
    # Kept for backward compatibility with external users of the attribute.
    # Links are now resolved with ``response.urljoin`` because concatenating
    # this value (which ends in "/") with a root-relative href (which starts
    # with "/") produced URLs containing "//".
    BASE_URL = 'https://au.indeed.com/'

    def parse(self, response):
        """Schedule requests for each job on a result page and the next page.

        Args:
            response: The Scrapy response for a search-result page.

        Yields:
            scrapy.Request: One request per job link (handled by
            ``parse_page``), followed by one request for the last
            pagination link (handled by ``parse``) when the page has a
            pagination bar.
        """
        job_links = response.xpath(
            '//a[@data-tn-element="jobTitle"]/@href'
        ).extract()
        for link in job_links:
            # urljoin resolves relative hrefs against the page URL and
            # leaves already-absolute hrefs untouched.
            yield scrapy.Request(
                response.urljoin(link), callback=self.parse_page
            )

        pagination_links = response.xpath(
            '//*[@class="pagination"]/a/@href'
        ).extract()
        # A result page without a pagination bar yields an empty list;
        # indexing it with [-1] would raise IndexError, so stop here instead.
        if pagination_links:
            # NOTE(review): the last pagination link is presumably "Next";
            # on the final page it may point backwards, in which case the
            # request relies on Scrapy's duplicate filter - confirm.
            yield scrapy.Request(
                response.urljoin(pagination_links[-1]), callback=self.parse
            )

    def parse_page(self, response):
        """Extract the job fields from a single job-posting page.

        Args:
            response: The Scrapy response for a job-posting page.

        Yields:
            dict: A single item with the keys ``Title``, ``Company``,
            ``Location``, ``Salary_heading``, ``Job_sum_complete`` and
            ``Days_posted``. Every value except ``Job_sum_complete`` is
            ``None`` when its XPath matches nothing; ``Job_sum_complete``
            is then an empty string.
        """
        title = response.xpath(
            '//b[@class="jobtitle"]/font/text()[normalize-space()]'
        ).extract_first()
        company = response.xpath(
            '//*[@class="company"][1]/text()[normalize-space()]'
        ).extract_first()
        location = response.xpath(
            '//*[@class="location"][1]/text()[normalize-space()]'
        ).extract_first()
        salary_heading = response.xpath(
            '//*[@class="no-wrap"]/text()[normalize-space()]'
        ).extract_first()
        # The summary is spread over nested elements, so collect every
        # non-blank text node beneath it and join them with spaces.
        job_sum = " ".join(
            response.xpath(
                '//*[@class="summary"]/descendant-or-self::*'
                '/text()[normalize-space()]'
            ).extract()
        )
        days_posted = response.xpath(
            '//div[@class="result-link-bar"]/span[@class="date"]'
            '/text()[normalize-space()]'
        ).extract_first()

        yield {
            'Title': title,
            'Company': company,
            'Location': location,
            'Salary_heading': salary_heading,
            'Job_sum_complete': job_sum,
            'Days_posted': days_posted,
        }
Written on August 8, 2018