Module event_processor.base.custom_spiders

from scrapy.spiders import Spider, CrawlSpider
from event_processor.base.spider_base import SpiderBase
from event_processor.base.api_base import ApiBase
from event_processor.base.splash_base import SplashBase

# Pipelines that transform, geocode, build, and save scraped events,
# run in ascending priority order.
api_settings = {
    'ITEM_PIPELINES': {
        'event_processor.scrapy_impl.pipelines.EventTransformPipeline': 300,
        'event_processor.scrapy_impl.pipelines.GeocodePipeline': 400,
        'event_processor.scrapy_impl.pipelines.EventBuildPipeline': 500,
        'event_processor.scrapy_impl.pipelines.EventSavePipeline': 600
    }
}

# The same pipelines, plus the scrapy_splash middlewares and the
# item-splitting spider middleware.
scraper_settings = {
    'ITEM_PIPELINES': {
        'event_processor.scrapy_impl.pipelines.EventTransformPipeline': 300,
        'event_processor.scrapy_impl.pipelines.GeocodePipeline': 400,
        'event_processor.scrapy_impl.pipelines.EventBuildPipeline': 500,
        'event_processor.scrapy_impl.pipelines.EventSavePipeline': 600
    },
    'SPIDER_MIDDLEWARES': {
        'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        'event_processor.scrapy_impl.middlewares.SplitItemsMiddleware': 400
    },
    'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
    'HTTPCACHE_STORAGE': 'scrapy_splash.SplashAwareFSCacheStorage',
    'DOWNLOADER_MIDDLEWARES': {
        'scrapy_splash.SplashCookiesMiddleware': 723,
        'scrapy_splash.SplashMiddleware': 725,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    }
}

# The pipelines only: no item-splitting middleware and no Splash support.
no_transpose_scraper_settings = {
    'ITEM_PIPELINES': {
        'event_processor.scrapy_impl.pipelines.EventTransformPipeline': 300,
        'event_processor.scrapy_impl.pipelines.GeocodePipeline': 400,
        'event_processor.scrapy_impl.pipelines.EventBuildPipeline': 500,
        'event_processor.scrapy_impl.pipelines.EventSavePipeline': 600
    }
}

class ApiSpider(Spider, ApiBase):
    """Base spider for reading APIs."""
    custom_settings = api_settings

    def __init__(self, *args, **kwargs):
        Spider.__init__(self)
        ApiBase.__init__(self, *args, **kwargs)

class ScraperSpider(Spider, SpiderBase):
    """Base spider for reading websites that need only a single page load."""
    custom_settings = scraper_settings

    def __init__(self, *args, **kwargs):
        Spider.__init__(self)
        SpiderBase.__init__(self, *args, **kwargs)

class ScraperCrawlSpider(CrawlSpider, SpiderBase):
    """Base spider for reading websites that may require further crawling across multiple page loads."""
    custom_settings = scraper_settings

    def __init__(self, *args, **kwargs):
        CrawlSpider.__init__(self)
        SpiderBase.__init__(self, *args, **kwargs)

class ScraperSplashSpider(Spider, SplashBase):
    """Base spider for crawling with Splash, which renders pages so that data can be extracted from JavaScript-generated dynamic content."""
    custom_settings = scraper_settings

    def __init__(self, *args, **kwargs):
        Spider.__init__(self)
        SplashBase.__init__(self, *args, **kwargs)

class ScraperNoTransposeSpider(Spider, SpiderBase):
    """Base spider for reading websites whose items should reach the pipelines as yielded, without the item-splitting middleware."""
    custom_settings = no_transpose_scraper_settings

    def __init__(self, *args, **kwargs):
        Spider.__init__(self)
        SpiderBase.__init__(self, *args, **kwargs)
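
A concrete spider subclasses one of these bases and defines the usual Scrapy attributes; the base class supplies the pipeline and middleware wiring through custom_settings. A minimal sketch of running such a spider, where EventsSpider is a hypothetical ScraperSpider subclass rather than part of this module:

from scrapy.crawler import CrawlerProcess

from myproject.spiders import EventsSpider  # hypothetical subclass

process = CrawlerProcess()
process.crawl(EventsSpider)
process.start()  # blocks until the crawl finishes

Because custom_settings is applied per spider class, each spider gets the pipelines of its own base without touching the project-wide settings.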

Classes

class ApiSpider (*args, **kwargs)

Base spider for reading APIs.

class ApiSpider(Spider, ApiBase):
    """Base spider for reading APIs."""
    custom_settings = api_settings

    def __init__(self, *args, **kwargs):
        Spider.__init__(self)
        ApiBase.__init__(self, *args, **kwargs)

Ancestors

scrapy.spiders.Spider
event_processor.base.api_base.ApiBase

Class variables

var custom_settings

The Scrapy settings applied to this spider: api_settings, which registers the event item pipelines (transform, geocode, build, save).

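A minimal sketch of a concrete subclass; the endpoint URL and response shape are illustrative, and ApiBase's expected constructor arguments are not documented here:

from event_processor.base.custom_spiders import ApiSpider

class ExampleApiSpider(ApiSpider):
    name = 'example_api'  # required by Scrapy for every spider
    start_urls = ['https://api.example.com/events']  # hypothetical endpoint

    def parse(self, response):
        # Assuming a JSON body, yield one dict per event; each passes through
        # the api_settings pipelines (transform, geocode, build, save).
        for event in response.json().get('events', []):
            yield event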

class ScraperCrawlSpider (*args, **kwargs)

Base spider for reading websites that may require further crawling across multiple page loads.

class ScraperCrawlSpider(CrawlSpider, SpiderBase):
    """Base spider for reading websites that may require further crawling across multiple page loads."""
    custom_settings = scraper_settings

    def __init__(self, *args, **kwargs):
        CrawlSpider.__init__(self)
        SpiderBase.__init__(self, *args, **kwargs)

Ancestors

scrapy.spiders.CrawlSpider
scrapy.spiders.Spider
event_processor.base.spider_base.SpiderBase

Class variables

var custom_settings

The Scrapy settings applied to this spider: scraper_settings, which registers the event item pipelines along with the Splash middlewares, dupe filter, and cache storage.

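A minimal sketch using Scrapy's standard Rule and LinkExtractor machinery; the URL pattern and selector are illustrative:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

from event_processor.base.custom_spiders import ScraperCrawlSpider

class ExampleCrawlSpider(ScraperCrawlSpider):
    name = 'example_crawl'
    start_urls = ['https://example.com/events']
    # Follow links to event detail pages found on the listing page.
    rules = (
        Rule(LinkExtractor(allow=r'/events/\d+'), callback='parse_event'),
    )

    def parse_event(self, response):
        yield {'title': response.css('h1::text').get()}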

class ScraperNoTransposeSpider (*args, **kwargs)

Base spider for reading websites whose items should reach the pipelines as yielded, without the item-splitting middleware.

class ScraperNoTransposeSpider(Spider, SpiderBase):
    """Base spider for reading websites whose items should reach the pipelines as yielded, without the item-splitting middleware."""
    custom_settings = no_transpose_scraper_settings

    def __init__(self, *args, **kwargs):
        Spider.__init__(self)
        SpiderBase.__init__(self, *args, **kwargs)

Ancestors

scrapy.spiders.Spider
event_processor.base.spider_base.SpiderBase

Class variables

var custom_settings

The Scrapy settings applied to this spider: no_transpose_scraper_settings, which registers only the event item pipelines, with no item-splitting spider middleware.

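A minimal sketch; it is structurally identical to a ScraperSpider subclass, but because no_transpose_scraper_settings configures no spider middlewares, each item reaches the pipelines exactly as yielded. The URL and selector are illustrative:

from event_processor.base.custom_spiders import ScraperNoTransposeSpider

class ExampleNoTransposeSpider(ScraperNoTransposeSpider):
    name = 'example_no_transpose'
    start_urls = ['https://example.com/events']

    def parse(self, response):
        # No SplitItemsMiddleware here: this single dict is the item
        # the pipelines receive.
        yield {'title': response.css('h1::text').get()}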

class ScraperSpider (*args, **kwargs)

Base spider for reading websites that need only a single page load.

class ScraperSpider(Spider, SpiderBase):
    """Base spider for reading websites that need only a single page load."""
    custom_settings = scraper_settings

    def __init__(self, *args, **kwargs):
        Spider.__init__(self)
        SpiderBase.__init__(self, *args, **kwargs)

Ancestors

scrapy.spiders.Spider
event_processor.base.spider_base.SpiderBase

Class variables

var custom_settings

The Scrapy settings applied to this spider: scraper_settings, which registers the event item pipelines along with the Splash middlewares, dupe filter, and cache storage.

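A minimal sketch of a single-page subclass; the URL and CSS selectors are illustrative:

from event_processor.base.custom_spiders import ScraperSpider

class ExampleScraperSpider(ScraperSpider):
    name = 'example_scraper'
    start_urls = ['https://example.com/events']

    def parse(self, response):
        # One page load; each yielded dict passes through SplitItemsMiddleware
        # and then the scraper_settings pipelines.
        for row in response.css('.event'):
            yield {'title': row.css('.title::text').get()}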

class ScraperSplashSpider (*args, **kwargs)

Base spider for crawling with Splash, which renders pages so that data can be extracted from JavaScript-generated dynamic content.

class ScraperSplashSpider(Spider, SplashBase):
    """Base spider for crawling with Splash, which renders pages so that data can be extracted from JavaScript-generated dynamic content."""
    custom_settings = scraper_settings

    def __init__(self, *args, **kwargs):
        Spider.__init__(self)
        SplashBase.__init__(self, *args, **kwargs)

Ancestors

scrapy.spiders.Spider
event_processor.base.splash_base.SplashBase

Class variables

var custom_settings

The Scrapy settings applied to this spider: scraper_settings, which registers the event item pipelines along with the Splash middlewares, dupe filter, and cache storage.

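A minimal sketch using scrapy_splash's SplashRequest so pages are rendered before parsing. The URL, wait time, and selectors are illustrative, and a running Splash instance (configured via SPLASH_URL in the project settings, which scraper_settings above does not include) is assumed:

from scrapy_splash import SplashRequest

from event_processor.base.custom_spiders import ScraperSplashSpider

class ExampleSplashSpider(ScraperSplashSpider):
    name = 'example_splash'

    def start_requests(self):
        # Render in Splash so JavaScript-generated content is present
        # in the response body before parse() runs.
        yield SplashRequest('https://example.com/events', self.parse,
                            args={'wait': 2})

    def parse(self, response):
        for row in response.css('.event'):
            yield {'title': row.css('.title::text').get()}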