Module event_processor.base.custom_spiders
Source code
from scrapy.spiders import Spider, CrawlSpider

from event_processor.base.spider_base import SpiderBase
from event_processor.base.api_base import ApiBase
from event_processor.base.splash_base import SplashBase

api_settings = {
    'ITEM_PIPELINES': {
        'event_processor.scrapy_impl.pipelines.EventTransformPipeline': 300,
        'event_processor.scrapy_impl.pipelines.GeocodePipeline': 400,
        'event_processor.scrapy_impl.pipelines.EventBuildPipeline': 500,
        'event_processor.scrapy_impl.pipelines.EventSavePipeline': 600
    }
}

scraper_settings = {
    'ITEM_PIPELINES': {
        'event_processor.scrapy_impl.pipelines.EventTransformPipeline': 300,
        'event_processor.scrapy_impl.pipelines.GeocodePipeline': 400,
        'event_processor.scrapy_impl.pipelines.EventBuildPipeline': 500,
        'event_processor.scrapy_impl.pipelines.EventSavePipeline': 600
    },
    'SPIDER_MIDDLEWARES': {
        'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        'event_processor.scrapy_impl.middlewares.SplitItemsMiddleware': 400
    },
    'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
    'HTTPCACHE_STORAGE': 'scrapy_splash.SplashAwareFSCacheStorage',
    'DOWNLOADER_MIDDLEWARES': {
        'scrapy_splash.SplashCookiesMiddleware': 723,
        'scrapy_splash.SplashMiddleware': 725,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    }
}

no_transpose_scraper_settings = {
    'ITEM_PIPELINES': {
        'event_processor.scrapy_impl.pipelines.EventTransformPipeline': 300,
        'event_processor.scrapy_impl.pipelines.GeocodePipeline': 400,
        'event_processor.scrapy_impl.pipelines.EventBuildPipeline': 500,
        'event_processor.scrapy_impl.pipelines.EventSavePipeline': 600
    }
}

class ApiSpider(Spider, ApiBase):
    """Base spider for reading APIs"""
    custom_settings = api_settings

    def __init__(self, *args, **kwargs):
        Spider.__init__(self)
        ApiBase.__init__(self, *args, **kwargs)

class ScraperSpider(Spider, SpiderBase):
    """Base spider for reading websites that only need a single page load"""
    custom_settings = scraper_settings

    def __init__(self, *args, **kwargs):
        Spider.__init__(self)
        SpiderBase.__init__(self, *args, **kwargs)

class ScraperCrawlSpider(CrawlSpider, SpiderBase):
    """Base spider for reading websites that may require further crawling and multiple page loads"""
    custom_settings = scraper_settings

    def __init__(self, *args, **kwargs):
        CrawlSpider.__init__(self)
        SpiderBase.__init__(self, *args, **kwargs)

class ScraperSplashSpider(Spider, SplashBase):
    """Base spider for web crawling with Splash, which can render and extract data from pages with JavaScript-generated dynamic content"""
    custom_settings = scraper_settings

    def __init__(self, *args, **kwargs):
        Spider.__init__(self)
        SplashBase.__init__(self, *args, **kwargs)

class ScraperNoTransposeSpider(Spider, SpiderBase):
    """Base spider for reading websites without the item-splitting (transpose) middleware"""
    custom_settings = no_transpose_scraper_settings

    def __init__(self, *args, **kwargs):
        Spider.__init__(self)
        SpiderBase.__init__(self, *args, **kwargs)
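Lower ITEM_PIPELINES priority numbers run first, so items produced by these spiders flow through EventTransformPipeline, GeocodePipeline, EventBuildPipeline, and EventSavePipeline in that order. As a usage sketch only: a concrete spider subclasses one of these bases and supplies the usual Scrapy attributes. The names, URL, and selectors below are hypothetical, and the sketch assumes SpiderBase accepts the forwarded constructor arguments without further required parameters.

from event_processor.base.custom_spiders import ScraperSpider

class ExampleEventSpider(ScraperSpider):
    """Hypothetical spider that scrapes a single event listings page."""
    name = 'example_events'                      # standard Scrapy spider name
    start_urls = ['https://example.com/events']  # placeholder URL

    def parse(self, response):
        # Yield one dict per event row; the ITEM_PIPELINES configured in
        # scraper_settings then process each item in priority order.
        for row in response.css('.event'):
            yield {
                'title': row.css('.title::text').get(),
                'date': row.css('.date::text').get(),
            }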
Classes
class ApiSpider (*args, **kwargs)
-
Base spider for reading APIs
Source code

class ApiSpider(Spider, ApiBase):
    """Base spider for reading APIs"""
    custom_settings = api_settings

    def __init__(self, *args, **kwargs):
        Spider.__init__(self)
        ApiBase.__init__(self, *args, **kwargs)
Ancestors
- scrapy.spiders.Spider
- scrapy.utils.trackref.object_ref
- ApiBase
- AggregatorBase
Class variables
var custom_settings
-
Scrapy settings for this spider; set to the api_settings dict defined above.
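The constructor signature of ApiBase is not shown in this module, so the following is only a minimal hypothetical sketch of an ApiSpider subclass reading a JSON endpoint; the endpoint and payload shape are placeholders.

import json

from event_processor.base.custom_spiders import ApiSpider

class ExampleApiSpider(ApiSpider):
    """Hypothetical spider that reads events from a JSON API."""
    name = 'example_api'
    start_urls = ['https://example.com/api/events']  # placeholder endpoint

    def parse(self, response):
        # Parse the JSON payload and yield one item per event record.
        for record in json.loads(response.text).get('events', []):
            yield record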
class ScraperCrawlSpider (*args, **kwargs)
-
Base spider for reading websites that may require further crawling and multiple page loads
Source code

class ScraperCrawlSpider(CrawlSpider, SpiderBase):
    """Base spider for reading websites that may require further crawling and multiple page loads"""
    custom_settings = scraper_settings

    def __init__(self, *args, **kwargs):
        CrawlSpider.__init__(self)
        SpiderBase.__init__(self, *args, **kwargs)
Ancestors
- scrapy.spiders.crawl.CrawlSpider
- scrapy.spiders.Spider
- scrapy.utils.trackref.object_ref
- SpiderBase
- AggregatorBase
Class variables
var custom_settings
-
Scrapy settings for this spider; set to the scraper_settings dict defined above.
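ScraperCrawlSpider inherits CrawlSpider's rule-driven link following, so subclasses typically declare rules with link extractors. A minimal hypothetical sketch, with placeholder URL patterns:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

from event_processor.base.custom_spiders import ScraperCrawlSpider

class ExampleCrawlSpider(ScraperCrawlSpider):
    """Hypothetical spider that follows pagination to event detail pages."""
    name = 'example_crawl'
    start_urls = ['https://example.com/events']  # placeholder URL
    rules = (
        # Follow pagination links without a callback (crawl only).
        Rule(LinkExtractor(allow=r'/events\?page=\d+')),
        # Send each event detail page to parse_event.
        Rule(LinkExtractor(allow=r'/events/\d+$'), callback='parse_event'),
    )

    def parse_event(self, response):
        yield {'title': response.css('h1::text').get()}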
class ScraperNoTransposeSpider (*args, **kwargs)
-
Base spider for reading websites without the item-splitting (transpose) middleware
Source code

class ScraperNoTransposeSpider(Spider, SpiderBase):
    """Base spider for reading websites without the item-splitting (transpose) middleware"""
    custom_settings = no_transpose_scraper_settings

    def __init__(self, *args, **kwargs):
        Spider.__init__(self)
        SpiderBase.__init__(self, *args, **kwargs)
Ancestors
- scrapy.spiders.Spider
- scrapy.utils.trackref.object_ref
- SpiderBase
- AggregatorBase
Class variables
var custom_settings
-
Scrapy settings for this spider; set to the no_transpose_scraper_settings dict defined above.
class ScraperSpider (*args, **kwargs)
-
Base spider for reading websites that only need a single page load
Source code

class ScraperSpider(Spider, SpiderBase):
    """Base spider for reading websites that only need a single page load"""
    custom_settings = scraper_settings

    def __init__(self, *args, **kwargs):
        Spider.__init__(self)
        SpiderBase.__init__(self, *args, **kwargs)
Ancestors
- scrapy.spiders.Spider
- scrapy.utils.trackref.object_ref
- SpiderBase
- AggregatorBase
Class variables
var custom_settings
-
Scrapy settings for this spider; set to the scraper_settings dict defined above.
class ScraperSplashSpider (*args, **kwargs)
-
Base spider for web crawling with Splash, which can render and extract data from pages with JavaScript-generated dynamic content
Source code

class ScraperSplashSpider(Spider, SplashBase):
    """Base spider for web crawling with Splash, which can render and extract data from pages with JavaScript-generated dynamic content"""
    custom_settings = scraper_settings

    def __init__(self, *args, **kwargs):
        Spider.__init__(self)
        SplashBase.__init__(self, *args, **kwargs)
Ancestors
- scrapy.spiders.Spider
- scrapy.utils.trackref.object_ref
- SplashBase
- SpiderBase
- AggregatorBase
Class variables
var custom_settings
-
Scrapy settings for this spider; set to the scraper_settings dict defined above.
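Requests from a Splash-backed spider are normally issued through scrapy_splash.SplashRequest so pages are rendered before parsing. Note that scraper_settings above configures the Splash middlewares but not SPLASH_URL, so the Splash endpoint must be set elsewhere in the project settings. A minimal hypothetical sketch, with a placeholder URL:

from scrapy_splash import SplashRequest

from event_processor.base.custom_spiders import ScraperSplashSpider

class ExampleSplashSpider(ScraperSplashSpider):
    """Hypothetical spider for a JavaScript-rendered listings page."""
    name = 'example_splash'

    def start_requests(self):
        # Render the page in Splash, waiting briefly for scripts to finish.
        yield SplashRequest(
            'https://example.com/events',  # placeholder URL
            callback=self.parse,
            args={'wait': 0.5},
        )

    def parse(self, response):
        for row in response.css('.event'):
            yield {'title': row.css('.title::text').get()}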