Module event_processor.scrapers.fpcc_spider
import re

from event_processor.base.custom_spiders import ScraperSplashSpider


class FpccSpider(ScraperSplashSpider):
    allowed_domains = ['ec.samaritan.com', 'splash']
    start_urls = ['https://ec.samaritan.com/recruiter/index.php?class=RecruiterCalendar&recruiterID=1405']
    name = "fpcc"
    enabled = True

    def __init__(self, *args, **kwargs):
        # super().__init__ already receives self implicitly; passing it again
        # would shift every positional argument by one.
        super().__init__('Forest Preserves of Cook County', 'http://splash:8050/', date_format='%d %b %Y', **kwargs)

    def start_requests(self):
        """Issue the start requests as Splash requests."""
        script_to_use = self.construct_lua_click_script(
            btn_selector='.rCalendar_tab_header',
            content_selector='.rCalendar_tab_content:not([style*="none"])',
            detail_selectors={
                'title': '{btn}',
                'description': '{cnt} table tr:nth-child(2)',
                'date_unparsed': '{cnt} table tr:nth-child(6) p:nth-child(1)',
                'address': '{cnt} table tr:nth-child(8) div'
            },
            after_click_wait=3  # wait 3 seconds after clicking the button before extracting content
        )
        for url in self.start_urls:
            for res in self.get_splash_requests(url, script_to_use, 8):
                yield res

    def splash_parse_response(self, response):
        """Pull the start and end times out of date_unparsed, treat everything
        before the first time as the date, and build the event item."""
        if 'title' in response:
            find_start_time = ''
            find_end_time = ''
            time_find_index = -1
            extract_date = response['date_unparsed']  # fall back to the full string if no time is found
            extract_times = re.findall(r"([0-9]?[0-9]:[0-9][0-9] [ap]m)", response['date_unparsed'])
            if len(extract_times) > 0:
                find_start_time = extract_times[0]
                time_find_index = response['date_unparsed'].index(extract_times[0])
                if len(extract_times) > 1:
                    find_end_time = extract_times[1]
            if time_find_index >= 0:
                # keep only the text before the first time
                extract_date = response['date_unparsed'][:time_find_index].strip()
            return {
                'title': [response['title']],
                'url': [response['url']],
                'description': [response['description']],
                'event_time': self.create_time_data(
                    date=[extract_date],
                    start_time=[find_start_time],
                    end_time=[find_end_time]
                ),
                'address': [response['address'].replace("View map", "")],
                'category': ['Environment']
            }
        return None  # 'title' missing from the response means extraction failed
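For illustration, the time parsing can be exercised on its own; the date string below is a made-up example of what the sixth table row might contain:

import re

raw = "14 Jun 2021 10:00 am - 12:30 pm"  # hypothetical date_unparsed value

times = re.findall(r"([0-9]?[0-9]:[0-9][0-9] [ap]m)", raw)
# times == ['10:00 am', '12:30 pm']

date_part = raw[:raw.index(times[0])].strip()
# date_part == '14 Jun 2021', matching the spider's '%d %b %Y' date_format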
Classes
class FpccSpider (*args, **kwargs)
Base spider for web crawling with Splash, which can render and extract data from pages that have JavaScript-generated dynamic content.
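As a minimal sketch of what such a request looks like with the scrapy_splash API (the script attribute here is a placeholder, not the spider's actual value):

from scrapy_splash import SplashRequest

def start_requests(self):
    # Sketch only: ask the local Splash instance to execute a Lua script
    # against each page; 'execute' is scrapy_splash's script-running endpoint.
    for url in self.start_urls:
        yield SplashRequest(
            url,
            callback=self.splash_parse_response,
            endpoint='execute',
            args={'lua_source': self.lua_script},  # hypothetical attribute
        )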
Ancestors
- ScraperSplashSpider
- scrapy.spiders.Spider
- scrapy.utils.trackref.object_ref
- SplashBase
- SpiderBase
- AggregatorBase
Class variables
var allowed_domains
Domains the spider may contact: the ec.samaritan.com calendar host and the local splash service.
var start_urls
The Forest Preserves of Cook County volunteer calendar URL that seeds the crawl.
Methods
def splash_parse_response(self, response)
Pull the start and end times out of date_unparsed, treat everything before the first time as the date, and build the event item; returns None when extraction failed.
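A hypothetical payload, showing the shape the Lua script is expected to hand back (all values invented for illustration):

response = {
    'title': 'Guided Bird Walk',
    'url': 'https://ec.samaritan.com/recruiter/index.php?class=RecruiterCalendar&recruiterID=1405',
    'description': 'Join a naturalist for a morning walk.',
    'date_unparsed': '14 Jun 2021 10:00 am - 12:30 pm',
    'address': 'Sand Ridge Nature Center View map',
}
# splash_parse_response strips 'View map' from the address and splits
# date_unparsed into '14 Jun 2021', '10:00 am', and '12:30 pm'.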
def start_requests(self)
Issue the start requests as Splash requests.
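The {btn} and {cnt} placeholders in detail_selectors presumably stand for the clicked tab header and the visible tab content. A hypothetical expansion, assuming construct_lua_click_script performs a simple substitution:

detail_selectors = {
    'title': '{btn}',
    'description': '{cnt} table tr:nth-child(2)',
}
# Hypothetical: the real substitution happens inside construct_lua_click_script.
expanded = {
    field: sel.format(btn='.rCalendar_tab_header',
                      cnt='.rCalendar_tab_content:not([style*="none"])')
    for field, sel in detail_selectors.items()
}
# expanded['description'] ==
#   '.rCalendar_tab_content:not([style*="none"]) table tr:nth-child(2)'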
Inherited members