Module event_processor.scrapers.wpbcc_spider
# -*- coding: utf-8 -*-
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

from event_processor.base.custom_spiders import ScraperCrawlSpider


class WpbccSpider(ScraperCrawlSpider):
    name = 'wpbcc'
    allowed_domains = ['www.wickerparkbucktown.com']

    # Follow the previous/next pagination links and parse each results page.
    rules = (
        Rule(LinkExtractor(restrict_css='.prevnextLink'),
             callback='parse_start_url', follow=True),
    )

    def __init__(self, name=None, **kwargs):
        # super().__init__ must not be passed self explicitly; doing so
        # would shift every positional argument by one.
        super().__init__('Wicker Park and Bucktown Chamber of Commerce',
                         'http://www.wickerparkbucktown.com/',
                         date_format='%B %d, %Y', **kwargs)

    def start_requests(self):
        # Request the events listing, filtered to Chamber events.
        yield self.get_request('events/', {'mrkrs': 'Chamber'})

    def parse_start_url(self, response):
        base_selector = response.css('.listerContent')

        def sibling_extract(field):
            # Fields are rendered as '<span>Field: </span>value'; take the
            # text node following the labelled span.
            return self.empty_check_extract(
                base_selector, self.xpath_func,
                f'div/span[contains(text(), "{field}: ")]/following-sibling::text()')

        return {
            'title': response.css('.listerItem h2 a::text').extract(),
            'url': response.css('.listerItem h2 a::attr(href)').extract(),
            'event_time': self.create_time_data(time_range=sibling_extract('Time'),
                                                date=sibling_extract('Date')),
            'address': sibling_extract('Address'),
            'description': self.empty_check_extract(base_selector, self.css_func,
                                                    '.blurb::text')
        }
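For quick local testing, the spider can be run standalone with Scrapy's CrawlerProcess. This is a minimal sketch; it assumes the event_processor package is importable and that ScraperCrawlSpider needs no project-specific settings:

# Minimal sketch: run the spider outside a Scrapy project.
# Assumes event_processor is on the import path.
from scrapy.crawler import CrawlerProcess

from event_processor.scrapers.wpbcc_spider import WpbccSpider

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(WpbccSpider)
process.start()  # blocks until the crawl finishes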
Classes
class WpbccSpider (name=None, **kwargs)
-
Base spider for reading websites that may require further crawling and multiple page loads.
Ancestors
- ScraperCrawlSpider
- scrapy.spiders.crawl.CrawlSpider
- scrapy.spiders.Spider
- scrapy.utils.trackref.object_ref
- SpiderBase
- AggregatorBase
Class variables
var allowed_domains
-
Domains this spider may crawl; Scrapy's offsite middleware filters out requests to hosts other than www.wickerparkbucktown.com.
var rules
-
Crawl rules: follow the .prevnextLink previous/next pagination links and parse each resulting page with parse_start_url.
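As an illustration (not part of the module), the rule's LinkExtractor can be exercised directly; the markup below is a hypothetical stand-in for the site's pagination links:

# Illustrative only: drive the pagination LinkExtractor by hand.
# The HTML body is a hypothetical stand-in for the live markup.
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

body = b'<a class="prevnextLink" href="/events/?page=2">Next</a>'
response = HtmlResponse(url='http://www.wickerparkbucktown.com/events/',
                        body=body, encoding='utf-8')
links = LinkExtractor(restrict_css='.prevnextLink').extract_links(response)
print([link.url for link in links])
# ['http://www.wickerparkbucktown.com/events/?page=2']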
Methods
def parse_start_url(self, response)
-
Parse one page of the events listing: select each .listerContent block and extract the title, URL, event time (built from the Date and Time fields), address, and description.
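To show how the following-sibling XPath inside sibling_extract resolves, here is a standalone sketch using parsel (the selector library Scrapy is built on); the markup is a hypothetical stand-in for the live page:

# Illustrative only: the sibling_extract XPath against stand-in markup.
from parsel import Selector

html = '''
<div class="listerContent">
  <div><span>Date: </span>July 4, 2019</div>
  <div><span>Time: </span>10:00 AM - 2:00 PM</div>
</div>
'''
sel = Selector(text=html).css('.listerContent')
print(sel.xpath('div/span[contains(text(), "Date: ")]'
                '/following-sibling::text()').getall())
# ['July 4, 2019']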
def start_requests(self)
-
Issue the initial request for the events listing, passing mrkrs=Chamber so that only Chamber events are returned.
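get_request is defined on the base class; assuming it joins the path to the spider's base URL and URL-encodes the dict as query parameters, the resulting request would be equivalent to:

# Assumption: get_request joins the path to the base URL and encodes
# the dict as a query string. Sketch of the equivalent URL:
from urllib.parse import urlencode, urljoin

base_url = 'http://www.wickerparkbucktown.com/'
url = urljoin(base_url, 'events/') + '?' + urlencode({'mrkrs': 'Chamber'})
print(url)  # http://www.wickerparkbucktown.com/events/?mrkrs=Chamber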
Inherited members