Module `event_processor.scrapers.chihacknight_simple_spider`

Expand source code

# -*- coding: utf-8 -*-
from event_processor.base.custom_spiders import ScraperSpider

class ChiHackNightSpider(ScraperSpider):
    """Spider that scrapes event data from the Chi Hack Night events table.

    Requests ``events/`` through ``get_request`` and, in ``parse``, pulls
    column-wise values out of every ``table tr`` row of the response.
    The spider is currently switched off via ``enabled = False``.
    """

    name = 'chihacknight'
    allowed_domains = ['chihacknight.org']
    enabled = False

    def __init__(self, name=None, **kwargs):
        """Initialise the base spider with this site's settings.

        Args:
            name: Unused by this method; it is not forwarded to the base class.
            **kwargs: Forwarded unchanged to the base class initialiser.
        """
        # NOTE(review): ``self`` is passed explicitly even though ``super()``
        # already binds it. Left untouched because the base initialiser is not
        # visible from this file -- confirm it really expects the extra argument.
        super().__init__(self, 'Chi Hack Night', 'https://chihacknight.org/', date_format='%b %d, %Y', **kwargs)

    def start_requests(self):
        """Yield the single request built by ``get_request('events/', {})``."""
        yield self.get_request('events/', {})

    def parse(self, response):
        """Extract event fields from the table rows of ``response``.

        Args:
            response: Object whose ``css()`` method is queried for ``table tr``.

        Returns:
            dict: Keys ``title``, ``url``, ``event_time``, ``address`` and
            ``description``. ``event_time`` is the result of
            ``create_time_data``; ``address`` is a list holding the fixed venue
            address once per extracted ``td::text`` item; the remaining values
            are whatever ``empty_check_extract`` returns for their selector.
        """
        venue_address = '222 Merchandise Mart Plaza, Chicago, IL 60654'
        # Query the rows once and reuse them for every column extraction.
        rows = response.css('table tr')
        return {
            'title': self.empty_check_extract(rows, self.css_func, 'td:nth-child(3) span::text'),
            'url': self.empty_check_extract(rows, self.css_func, 'td:nth-child(3) a::attr(href)'),
            'event_time': self.create_time_data(
                # 'Jan 01, 2012' is presumably the fallback used when a row has
                # no date text -- verify against empty_check_extract.
                date=self.empty_check_extract(rows, self.css_func, 'td:nth-child(1) p::text', 'Jan 01, 2012')
            ),
            # Every event uses the same venue, so the extracted values are only
            # used to decide how many times the address is repeated.
            'address': [venue_address for _ in self.empty_check_extract(rows, self.css_func, 'td::text')],
            'description': self.empty_check_extract(rows, self.css_func, 'td:nth-child(4)::text')
        }

Classes

class ChiHackNightSpider (name=None, **kwargs)

Base spider for reading websites that only need a single page load.

Expand source code

class ChiHackNightSpider(ScraperSpider):
    """Spider that scrapes event data from the Chi Hack Night events table.

    Requests ``events/`` through ``get_request`` and, in ``parse``, pulls
    column-wise values out of every ``table tr`` row of the response.
    The spider is currently switched off via ``enabled = False``.
    """

    name = 'chihacknight'
    allowed_domains = ['chihacknight.org']
    enabled = False

    def __init__(self, name=None, **kwargs):
        """Initialise the base spider with this site's settings.

        Args:
            name: Unused by this method; it is not forwarded to the base class.
            **kwargs: Forwarded unchanged to the base class initialiser.
        """
        # NOTE(review): ``self`` is passed explicitly even though ``super()``
        # already binds it. Left untouched because the base initialiser is not
        # visible from this file -- confirm it really expects the extra argument.
        super().__init__(self, 'Chi Hack Night', 'https://chihacknight.org/', date_format='%b %d, %Y', **kwargs)

    def start_requests(self):
        """Yield the single request built by ``get_request('events/', {})``."""
        yield self.get_request('events/', {})

    def parse(self, response):
        """Extract event fields from the table rows of ``response``.

        Args:
            response: Object whose ``css()`` method is queried for ``table tr``.

        Returns:
            dict: Keys ``title``, ``url``, ``event_time``, ``address`` and
            ``description``. ``event_time`` is the result of
            ``create_time_data``; ``address`` is a list holding the fixed venue
            address once per extracted ``td::text`` item; the remaining values
            are whatever ``empty_check_extract`` returns for their selector.
        """
        venue_address = '222 Merchandise Mart Plaza, Chicago, IL 60654'
        # Query the rows once and reuse them for every column extraction.
        rows = response.css('table tr')
        return {
            'title': self.empty_check_extract(rows, self.css_func, 'td:nth-child(3) span::text'),
            'url': self.empty_check_extract(rows, self.css_func, 'td:nth-child(3) a::attr(href)'),
            'event_time': self.create_time_data(
                # 'Jan 01, 2012' is presumably the fallback used when a row has
                # no date text -- verify against empty_check_extract.
                date=self.empty_check_extract(rows, self.css_func, 'td:nth-child(1) p::text', 'Jan 01, 2012')
            ),
            # Every event uses the same venue, so the extracted values are only
            # used to decide how many times the address is repeated.
            'address': [venue_address for _ in self.empty_check_extract(rows, self.css_func, 'td::text')],
            'description': self.empty_check_extract(rows, self.css_func, 'td:nth-child(4)::text')
        }

Ancestors

ScraperSpider
scrapy.spiders.Spider
scrapy.utils.trackref.object_ref
SpiderBase
AggregatorBase

Class variables

var allowed_domains: Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list. The argument must be an iterable if specified.

Methods

def parse(self, response)

Expand source code

def parse(self, response):
    """Extract event fields from the table rows of ``response``.

    Args:
        response: Object whose ``css()`` method is queried for ``table tr``.

    Returns:
        dict: Keys ``title``, ``url``, ``event_time``, ``address`` and
        ``description``. ``event_time`` is the result of
        ``create_time_data``; ``address`` is a list holding the fixed venue
        address once per extracted ``td::text`` item; the remaining values
        are whatever ``empty_check_extract`` returns for their selector.
    """
    venue_address = '222 Merchandise Mart Plaza, Chicago, IL 60654'
    # Query the rows once and reuse them for every column extraction.
    rows = response.css('table tr')
    return {
        'title': self.empty_check_extract(rows, self.css_func, 'td:nth-child(3) span::text'),
        'url': self.empty_check_extract(rows, self.css_func, 'td:nth-child(3) a::attr(href)'),
        'event_time': self.create_time_data(
            # 'Jan 01, 2012' is presumably the fallback used when a row has
            # no date text -- verify against empty_check_extract.
            date=self.empty_check_extract(rows, self.css_func, 'td:nth-child(1) p::text', 'Jan 01, 2012')
        ),
        # Every event uses the same venue, so the extracted values are only
        # used to decide how many times the address is repeated.
        'address': [venue_address for _ in self.empty_check_extract(rows, self.css_func, 'td::text')],
        'description': self.empty_check_extract(rows, self.css_func, 'td:nth-child(4)::text')
    }

def start_requests(self)

Expand source code

def start_requests(self):
    """Yield the one request this spider starts from.

    The request is built by ``get_request`` from the ``'events/'`` path and
    an empty dict (presumably request parameters -- confirm against
    ``get_request``).
    """
    events_request = self.get_request('events/', {})
    yield events_request

Inherited members

ScraperSpider:
- custom_settings
- empty_check_extract
- enabled
- get_request
- item_filter
- name