Module event_processor.scrapers.history_spider

Source code
# -*- coding: utf-8 -*-
from scrapy.spiders import Rule
from event_processor.base.custom_spiders import ScraperCrawlSpider
from scrapy.linkextractors import LinkExtractor

from event_processor.util.data_utils import DataUtils

class HistorySpider(ScraperCrawlSpider):
    name = 'history'
    allowed_domains = ['www.chicagohistory.org']
    enabled = False

    rules = (
        Rule(LinkExtractor(restrict_css='.title'), process_request='link_request', callback='parse_item'),
    )

    def __init__(self, name=None, **kwargs):
        # Page dates look like '3 March 2019' ('%d %B %Y'); request
        # parameters use the compact '%Y%m%d' form, e.g. '20190303'.
        super().__init__('Chicago History Museum', 'https://www.chicagohistory.org/',
                         date_format='%d %B %Y', request_date_format='%Y%m%d', **kwargs)

    def start_requests(self):
        yield self.get_request('events', {
                'start_date': self.start_date,
                'end_date': self.end_date
            })


    def parse_start_url(self, response):
        def get_full_date(xpath_result):
            result = []
            current_month = ''
            for text in xpath_result:
                text = DataUtils.remove_html(text)
                # Month names are all greater than 2 characters
                # Days of the month are all 2 characters or fewer
                if len(text) > 2:
                    current_month = text
                else:
                    result.append(f'{text} {current_month}')
            return result

        return {
            'title': response.css('a.title::text').extract(),
            'url': response.css('a.title::attr(href)').extract(),
            'event_time': self.create_time_data(
                time_range=self.empty_check_extract(response.css('.details'), self.css_func, '.time::text'),
                date=get_full_date(response.css('.xcalendar-row .number,.month').extract())
            ),
            'description': response.css('.info').extract()
        }

    def link_request(self, request, response):
        # Store the original url in case it gets redirected later
        request.meta['clicked_url'] = request.url
        return request

    def parse_item(self, response):
        prices = response.css('.price').extract()
        addresses = response.xpath('//h3[contains(text(), "Event Location")]/following-sibling::div/p').extract()
        address = ''
        if len(addresses) == 0:
            self.logger.warning(f'no address found for {response.url}')
        else:
            address = addresses[0]
        return {
            'url': [response.meta['clicked_url']],
            'address': [address],
            'price': [prices[0] if len(prices) > 0 else '0']
        }
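
The two date formats passed to the base class control how dates are read from the page and how they are written into request parameters. A minimal sketch of what the format strings mean, using plain datetime (the ScraperCrawlSpider code that consumes them is not shown here):

from datetime import datetime

# '%d %B %Y' matches page-style dates such as '3 March 2019', the shape
# that get_full_date's '3 March' output takes once a year is attached.
parsed = datetime.strptime('3 March 2019', '%d %B %Y')

# '%Y%m%d' is the compact request-parameter form used for
# start_date/end_date in start_requests.
assert parsed.strftime('%Y%m%d') == '20190303'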

Classes

class HistorySpider(name=None, **kwargs)

Base spider for reading websites that may require further crawling and multiple page loads.

Ancestors

event_processor.base.custom_spiders.ScraperCrawlSpider

Class variables

var allowed_domains

Domains the spider is allowed to crawl; restricted to www.chicagohistory.org.

var rules

Crawl rules: extract links from elements matching the .title CSS selector, route each request through link_request, and parse each response with parse_item.
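
A minimal sketch of what this rule's extractor matches, on a fabricated fragment shaped like the listing markup (the anchor and URL are illustrative assumptions, not taken from the live site):

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

# Fabricated listing fragment: an anchor carrying the .title class.
html = b'<a class="title" href="/event/sample-exhibit">Sample Exhibit</a>'
response = HtmlResponse('https://www.chicagohistory.org/events',
                        body=html, encoding='utf-8')

# restrict_css='.title' limits extraction to links inside matching elements.
links = LinkExtractor(restrict_css='.title').extract_links(response)
print([link.url for link in links])
# ['https://www.chicagohistory.org/event/sample-exhibit']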

Methods

def link_request(self, request, response)

Stores the link's original URL in request.meta['clicked_url'] before the request is sent, so the pre-redirect URL is still available in parse_item (Scrapy copies meta onto the requests it creates when following redirects).

def parse_item(self, response)

Parses an individual event page: extracts the price (defaulting to '0' when no .price element is present) and the address from the first paragraph following the "Event Location" heading, logging a warning when no address is found. The returned url is the pre-redirect clicked_url stored by link_request.
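
A standalone sketch of the address XPath on a fabricated fragment (the markup is an assumption shaped to match the selector; parsel is the selector library Scrapy itself uses):

from parsel import Selector

html = '''<h3>Event Location</h3>
<div><p>1601 N Clark St, Chicago, IL</p></div>'''
sel = Selector(text=html)

# following-sibling::div/p selects the paragraph inside the div that
# follows the "Event Location" heading.
print(sel.xpath('//h3[contains(text(), "Event Location")]'
                '/following-sibling::div/p/text()').get())
# 1601 N Clark St, Chicago, IL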

def parse_start_url(self, response)

Parses the events listing page into titles, URLs, descriptions, and time data. The nested get_full_date helper walks the calendar cells in document order: any text longer than two characters is treated as a month heading, and each subsequent day number is paired with the most recent month, yielding strings like '3 March' (the year that date_format expects is presumably attached by the base class's time handling).
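
A standalone sketch of get_full_date's pairing logic (DataUtils.remove_html is stubbed with a simple regex since its source isn't shown; the markup is an illustrative assumption):

import re

def remove_html(text):
    # Stand-in for DataUtils.remove_html: strip tags, trim whitespace.
    return re.sub(r'<[^>]+>', '', text).strip()

def get_full_date(xpath_result):
    result, current_month = [], ''
    for text in xpath_result:
        text = remove_html(text)
        if len(text) > 2:      # month heading, e.g. 'March'
            current_month = text
        else:                  # day-of-month cell, e.g. '3'
            result.append(f'{text} {current_month}')
    return result

cells = ['<div class="month">March</div>',
         '<div class="number">3</div>',
         '<div class="number">10</div>']
print(get_full_date(cells))    # ['3 March', '10 March']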

def start_requests(self)

Yields the spider's initial request for the 'events' listing, parameterized by start_date and end_date. get_request is inherited from ScraperCrawlSpider (its source is not shown here) and presumably serializes the dates with request_date_format ('%Y%m%d').

Inherited members
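
Since enabled is False, the project's own runner presumably skips this spider. A minimal sketch of invoking it directly through Scrapy's standard plumbing (assuming the event_processor package is importable and ScraperCrawlSpider supplies start_date/end_date defaults):

from scrapy.crawler import CrawlerProcess
from event_processor.scrapers.history_spider import HistorySpider

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(HistorySpider)
process.start()  # blocks until the crawl finishes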