Module event_processor.scrapers.chihacknight_crawl_spider

Expand source code
# -*- coding: utf-8 -*-
from event_processor.base.custom_spiders import ScraperCrawlSpider
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

class ChiHackNightCrawlSpider(ScraperCrawlSpider):
    """Crawl spider for the Chi Hack Night events listing.

    Crawling starts at ``https://chihacknight.org/events``. Links are taken
    from anchors inside ``td`` elements that are the third child of a table
    row, and every followed page is handed to :meth:`parse_page`.
    """

    name = 'chihacknightcrawl'
    allowed_domains = ['chihacknight.org']
    start_urls = ['https://chihacknight.org/events']
    enabled = True

    # Single crawl rule: extract the per-event links and keep following them.
    rules = (
        Rule(
            LinkExtractor(restrict_css='table tr td:nth-child(3) a'),
            callback="parse_page",
            follow=True,
        ),
    )

    def __init__(self, name=None, **kwargs):
        """Configure the base spider with the organization name, site URL
        and the date format used when reading event dates.

        ``name`` is accepted but not forwarded; every other keyword argument
        is passed through to the base initializer.
        """
        # NOTE(review): ``self`` is passed explicitly as the first positional
        # argument on top of the implicit binding done by ``super()``.
        # Confirm that the base initializer really expects this.
        super().__init__(
            self,
            'Chi Hack Night',
            'https://chihacknight.org',
            date_format='%B %d, %Y',
            **kwargs
        )

    def parse_page(self, response):
        """Build the event data dictionary for a single followed page.

        Every field is read from the ``#primary-content`` element through the
        ``empty_check_extract`` / ``css_func`` helpers (defined outside this
        class). The result has the keys ``title``, ``url``, ``event_time``,
        ``address`` and ``description``.
        """
        content = response.css('#primary-content')

        titles = self.empty_check_extract(content, self.css_func, ' [itemprop="name"]::text')
        # One copy of the page URL per extracted name, so both fields line up.
        urls = [
            response.url
            for _ in self.empty_check_extract(content, self.css_func, '[itemprop="name"]::text')
        ]
        start_dates = self.empty_check_extract(content, self.css_func, '[itemprop="startDate"]::text')
        event_time = self.create_time_data(date=start_dates)
        # The hard-coded street address is handed to the helper as its default value.
        address = self.empty_check_extract(
            content,
            self.css_func,
            '[itemprop="address"] *::text',
            default_value="222 Merchandise Mart Plaza, Chicago, IL 60654",
        )
        description = self.empty_check_extract(content, self.css_func, '[itemprop="description"] *::text')

        return {
            'title': titles,
            'url': urls,
            'event_time': event_time,
            'address': address,
            'description': description,
        }

Classes

class ChiHackNightCrawlSpider (name=None, **kwargs)

Base spider for reading websites that may require further crawling and multiple page loads.

Expand source code
class ChiHackNightCrawlSpider(ScraperCrawlSpider):
    """Crawl spider for the Chi Hack Night events listing.

    Crawling starts at ``https://chihacknight.org/events``. Links are taken
    from anchors inside ``td`` elements that are the third child of a table
    row, and every followed page is handed to :meth:`parse_page`.
    """

    name = 'chihacknightcrawl'
    allowed_domains = ['chihacknight.org']
    start_urls = ['https://chihacknight.org/events']
    enabled = True

    # Single crawl rule: extract the per-event links and keep following them.
    rules = (
        Rule(
            LinkExtractor(restrict_css='table tr td:nth-child(3) a'),
            callback="parse_page",
            follow=True,
        ),
    )

    def __init__(self, name=None, **kwargs):
        """Configure the base spider with the organization name, site URL
        and the date format used when reading event dates.

        ``name`` is accepted but not forwarded; every other keyword argument
        is passed through to the base initializer.
        """
        # NOTE(review): ``self`` is passed explicitly as the first positional
        # argument on top of the implicit binding done by ``super()``.
        # Confirm that the base initializer really expects this.
        super().__init__(
            self,
            'Chi Hack Night',
            'https://chihacknight.org',
            date_format='%B %d, %Y',
            **kwargs
        )

    def parse_page(self, response):
        """Build the event data dictionary for a single followed page.

        Every field is read from the ``#primary-content`` element through the
        ``empty_check_extract`` / ``css_func`` helpers (defined outside this
        class). The result has the keys ``title``, ``url``, ``event_time``,
        ``address`` and ``description``.
        """
        content = response.css('#primary-content')

        titles = self.empty_check_extract(content, self.css_func, ' [itemprop="name"]::text')
        # One copy of the page URL per extracted name, so both fields line up.
        urls = [
            response.url
            for _ in self.empty_check_extract(content, self.css_func, '[itemprop="name"]::text')
        ]
        start_dates = self.empty_check_extract(content, self.css_func, '[itemprop="startDate"]::text')
        event_time = self.create_time_data(date=start_dates)
        # The hard-coded street address is handed to the helper as its default value.
        address = self.empty_check_extract(
            content,
            self.css_func,
            '[itemprop="address"] *::text',
            default_value="222 Merchandise Mart Plaza, Chicago, IL 60654",
        )
        description = self.empty_check_extract(content, self.css_func, '[itemprop="description"] *::text')

        return {
            'title': titles,
            'url': urls,
            'event_time': event_time,
            'address': address,
            'description': description,
        }

Ancestors

Class variables

var allowed_domains

Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list. The argument must be an iterable if specified.

var rules

Built-in immutable sequence.

If no argument is given, the constructor returns an empty tuple. If iterable is specified the tuple is initialized from iterable's items.

If the argument is a tuple, the return value is the same object.

var start_urls

Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list. The argument must be an iterable if specified.

Methods

def parse_page(self, response)
Expand source code
def parse_page(self, response):
    """Build the event data dictionary for a single followed page.

    Every field is read from the ``#primary-content`` element of
    ``response`` through the spider's ``empty_check_extract`` / ``css_func``
    helpers. The result has the keys ``title``, ``url``, ``event_time``,
    ``address`` and ``description``.
    """
    content = response.css('#primary-content')

    titles = self.empty_check_extract(content, self.css_func, ' [itemprop="name"]::text')
    # One copy of the page URL per extracted name, so both fields line up.
    urls = [
        response.url
        for _ in self.empty_check_extract(content, self.css_func, '[itemprop="name"]::text')
    ]
    start_dates = self.empty_check_extract(content, self.css_func, '[itemprop="startDate"]::text')
    event_time = self.create_time_data(date=start_dates)
    # The hard-coded street address is handed to the helper as its default value.
    address = self.empty_check_extract(
        content,
        self.css_func,
        '[itemprop="address"] *::text',
        default_value="222 Merchandise Mart Plaza, Chicago, IL 60654",
    )
    description = self.empty_check_extract(content, self.css_func, '[itemprop="description"] *::text')

    return {
        'title': titles,
        'url': urls,
        'event_time': event_time,
        'address': address,
        'description': description,
    }

Inherited members