Module event_processor.models.event
Expand source code
import scrapy
import re
import usaddress
from event_processor.util.time_utils import TimeUtils
from event_processor.util.data_utils import DataUtils
from scrapy.loader.processors import MapCompose, Compose, Join, TakeFirst
from scrapy.loader import ItemLoader
def custom_field():
    """Create a field for free-text values.

    Every incoming value is passed through ``DataUtils.remove_html`` and the
    processed values are combined into one string by ``Join()``.

    Returns:
        scrapy.Field: Field configured with the input and output processors.
    """
    clean_each_value = MapCompose(DataUtils.remove_html)
    combine_values = Join()
    return scrapy.Field(
        input_processor=clean_each_value,
        output_processor=combine_values,
    )
def numeric_field():
    """Create a field that keeps a single value per item.

    Every incoming value is passed through ``DataUtils.remove_html``;
    ``TakeFirst()`` then selects one value as the field's output.

    Note that no conversion to ``int``/``float`` happens here: the value is
    stored in whatever type ``DataUtils.remove_html`` returns.

    Returns:
        scrapy.Field: Field configured with the input and output processors.
    """
    clean_each_value = MapCompose(DataUtils.remove_html)
    pick_single_value = TakeFirst()
    return scrapy.Field(
        input_processor=clean_each_value,
        output_processor=pick_single_value,
    )
def price_field():
    """Create a field that stores a price as a ``float``.

    Each incoming value goes through three steps, in order:

    1. ``$`` characters are removed from string values,
    2. the value is passed through ``DataUtils.remove_html``,
    3. the result is converted with ``float``.

    ``TakeFirst()`` then selects one value as the field's output.

    Returns:
        scrapy.Field: Field configured with the input and output processors.
    """
    def strip_dollar_sign(value):
        # Only strings can carry a currency symbol; anything else (for
        # example an already-numeric price) is handed on unchanged.
        # isinstance() is used so that str subclasses are handled as well.
        if isinstance(value, str):
            return value.replace('$', '')
        return value

    return scrapy.Field(
        input_processor=MapCompose(
            strip_dollar_sign,
            DataUtils.remove_html,
            float,
        ),
        output_processor=TakeFirst(),
    )
def url_field():
    """Create a field that stores a normalised URL.

    Each incoming value is passed through ``DataUtils.remove_html`` and then
    normalised: runs of consecutive slashes are collapsed to a single ``/``
    and trailing slashes are removed. The ``//`` that directly follows a
    colon (as in ``https://``) is preserved so the scheme separator stays
    intact. The processed values are combined by ``Join()``.

    Returns:
        scrapy.Field: Field configured with the input and output processors.
    """
    def normalize_url(value):
        # A plain ``value.replace('//', '/')`` would also rewrite
        # ``https://host`` to ``https:/host``. The negative lookbehind skips
        # slashes that immediately follow a colon.
        collapsed = re.sub(r'(?<!:)/{2,}', '/', value)
        return collapsed.rstrip('/')

    return scrapy.Field(
        input_processor=MapCompose(DataUtils.remove_html, normalize_url),
        output_processor=Join(),
    )
def category_field():
    """Create a field for category values.

    No input processor is configured, so values are stored as supplied; the
    collected values are combined into one string by ``Join()``.

    Returns:
        scrapy.Field: Field configured with the output processor only.
    """
    combine_values = Join()
    return scrapy.Field(output_processor=combine_values)
def address_field():
    """Create a field that stores an address, defaulting to Chicago, IL.

    Each incoming value is passed through ``DataUtils.remove_html`` and then
    parsed with ``usaddress``. If the parse finds no city (``PlaceName``)
    and/or no state (``StateName``), the defaults ``Chicago`` / ``IL`` are
    appended. The processed values are combined by ``Join()``.

    Returns:
        scrapy.Field: Field configured with the input and output processors.
    """
    def parse_address(value):
        # usaddress.parse yields (token, label) pairs; only the labels are
        # needed to know which address components are already present.
        labels = {label for _token, label in usaddress.parse(value)}
        has_city = 'PlaceName' in labels
        has_state = 'StateName' in labels

        if has_city and not has_state:
            # The default city text normally supplies the separator before
            # the state. Without it, plain concatenation would produce
            # "...ChicagoIL", so add the separator explicitly (dropping any
            # trailing space/comma already on the value).
            stripped = value.rstrip(' ,')
            return f'{stripped}, IL'

        city_append = '' if has_city else ' Chicago, '
        state_append = '' if has_state else 'IL'
        # NOTE(review): when a state is found but no city, the default city
        # is still appended after the state (e.g. "... IL Chicago, "). That
        # output is unchanged here; confirm whether downstream geocoding
        # needs it reordered.
        return f'{value}{city_append}{state_append}'

    return scrapy.Field(
        input_processor=MapCompose(DataUtils.remove_html, parse_address),
        output_processor=Join(),
    )
def date_field():
    """Create a field that turns raw date/time data into timestamps.

    Each incoming value is passed through ``DataUtils.remove_html`` and then
    through ``parse_date``; ``TakeFirst()`` selects one result as the
    field's output.

    Returns:
        scrapy.Field: Field configured with the input and output processors.
    """
    def parse_date(value):
        # ``value`` must be a mapping containing 'date_format'; a missing
        # key raises KeyError before anything else is evaluated.
        time_utils = TimeUtils(date_format=value['date_format'])

        # Start from the all-None template so every expected key exists,
        # then overlay whatever the caller supplied.
        date_obj = create_time_data()
        date_obj.update(value)

        timestamps = time_utils.get_timestamps(date_obj)
        start_timestamp, end_timestamp = timestamps
        return dict(start_timestamp=start_timestamp, end_timestamp=end_timestamp)

    date_processor = MapCompose(DataUtils.remove_html, parse_date)
    return scrapy.Field(input_processor=date_processor, output_processor=TakeFirst())
def create_time_data():
    """Return a fresh template of date/time keys, all set to ``None``.

    When creating an event, fill in only the keys that match how the data is
    formatted on the site being scraped.

    Returns:
        dict: New dictionary on every call, with each key mapped to ``None``.
    """
    time_keys = (
        # Use time if only one time is supplied for the event (not time range)
        'time',
        # Use start_time and end_time if the site supplies distinct data for these two values
        'start_time',
        'end_time',
        # Use time_range if the start and end time is supplied in a single string ex: 6:00-8:00 PM
        'time_range',
        # Use date if the event could be one or multiple days but it is contained in a single string
        # This is done this way because some sites have data that could be single days or multiple days
        'date',
        # Use start_date and end_date if the site supplies distinct data for these two values
        'start_date',
        'end_date',
        # Use start_timestamp and end_timestamp if the data is formatted like a Unix timestamp
        'start_timestamp',
        'end_timestamp',
    )
    return dict.fromkeys(time_keys)
class Event(scrapy.Item):
    """Scrapy item describing a single scraped event.

    Each attribute is a ``scrapy.Field`` produced by one of the field
    factory functions in this module; the factory determines which input
    and output processors are attached to the field.
    """
    # Text fields: values go through DataUtils.remove_html and are joined.
    organization = custom_field()
    title = custom_field()
    description = custom_field()
    # Address text; parse_address appends default city/state text when the
    # usaddress parse does not find them.
    address = address_field()
    # Coordinates: one value is kept via TakeFirst(); numeric_field performs
    # no numeric conversion itself.
    lat = numeric_field()
    lon = numeric_field()
    # URL text, normalised by url_field's input processor.
    url = url_field()
    # Price converted with float() after '$' removal.
    price = price_field()
    # Category values joined into one string; no input processor.
    category = category_field()
    # Dict with 'start_timestamp' and 'end_timestamp' built by parse_date.
    event_time = date_field()
    # Plain field with no processors configured.
    geocode_id = scrapy.Field()
class EventLoader:
    """Build an ``Event`` item from keyword arguments.

    Each keyword argument is added to an ``ItemLoader`` wrapping a new
    ``Event``, so the field's configured processors are applied. The loaded
    item is exposed as ``self.item``.

    Args:
        *args: Accepted for call compatibility; not used.
        **kwargs: Field name / raw value pairs to load into the event.

    Raises:
        KeyError: If adding a value raises ``KeyError`` (reported as the
            keyword not being a valid event field). The original error is
            attached as ``__cause__``.
    """

    def __init__(self, *args, **kwargs):
        item_loader = ItemLoader(item=Event())
        for key, value in kwargs.items():
            try:
                item_loader.add_value(key, value)
            except KeyError as err:
                # Chain explicitly so the traceback shows the underlying
                # lookup failure as the direct cause of this error.
                raise KeyError(f'{key} is not a valid event field') from err
        self.item = item_loader.load_item()
class EventManager:
    """Collect events under caller-chosen keys and merge repeated keys.

    Attributes are kept in ``self.events``, a dict mapping each key to the
    event object stored for it.
    """

    def __init__(self):
        self.events = dict()

    def update(self, key, event):
        """Store ``event`` under ``key``, merging into any existing entry.

        The first event seen for a key is stored as-is (the same object, not
        a copy). Later events for that key are merged into the stored object
        through its ``update`` method.
        """
        if key not in self.events:
            # First time this key is seen: create the entry.
            self.events[key] = event
            return
        # Key already present: add the new properties to the stored event.
        self.events[key].update(event)

    def to_dicts(self):
        """Return a list with a new ``dict`` built from each stored event."""
        return [dict(stored) for stored in self.events.values()]
Functions
def address_field()
-
Expand source code
def address_field(): def parse_address(value): parsed = usaddress.parse(value) def default_or_empty(field, default): if any(i[0] for i in parsed if i[1] == field): return '' return default city_append = default_or_empty("PlaceName", " Chicago, ") state_append = default_or_empty("StateName", "IL") return f'{value}{city_append}{state_append}' return scrapy.Field(input_processor=MapCompose( DataUtils.remove_html, parse_address), output_processor=Join())
def category_field()
-
Expand source code
def category_field(): return scrapy.Field(output_processor=Join())
def create_time_data()
-
Expand source code
def create_time_data(): # When creating an event, you'll want to pass in the data that matches # how the data is formatted on the site you're pulling from return { # Use time if only one time is supplied for the event (not time range) 'time': None, # Use start_time and end_time if the site supplies distinct data for these two values 'start_time': None, 'end_time': None, # Use time_range if the start and end time is supplied in a single string ex: 6:00-8:00 PM 'time_range': None, # Use date if the event could be one or multiple days but it is contained in a single string # This is done this way because some sites have data that could be single days or multiple days 'date': None, # Use start_date and end_date if the site supplies distinct data for these two values 'start_date': None, 'end_date': None, # Use start_timestamp and end_timestamp if the data is formatted like a Unix timestamp 'start_timestamp': None, 'end_timestamp': None }
def custom_field()
-
Expand source code
def custom_field(): return scrapy.Field(input_processor=MapCompose(DataUtils.remove_html), output_processor=Join())
def date_field()
-
Expand source code
def date_field(): def parse_date(value): date_format = value['date_format'] time_utils = TimeUtils(date_format=date_format) date_obj = {**create_time_data(), **value} start_timestamp, end_timestamp = time_utils.get_timestamps(date_obj) return { 'start_timestamp': start_timestamp, 'end_timestamp': end_timestamp } return scrapy.Field(input_processor=MapCompose(DataUtils.remove_html, parse_date), output_processor=TakeFirst())
def numeric_field()
-
Expand source code
def numeric_field(): return scrapy.Field(input_processor=MapCompose(DataUtils.remove_html), output_processor=TakeFirst())
def price_field()
-
Expand source code
def price_field(): return scrapy.Field(input_processor=MapCompose( lambda value: value.replace('$', '') if type(value) == str else value, DataUtils.remove_html, float), output_processor=TakeFirst())
def url_field()
-
Expand source code
def url_field(): return scrapy.Field(input_processor=MapCompose(DataUtils.remove_html, lambda value: value.replace('//', '/').rstrip('//')), output_processor=Join())
Classes
class Event (*args, **kwargs)
-
Base class for all scraped items.
In Scrapy, an object is considered an item if it is an instance of either :class:`BaseItem` or :class:`dict`. For example, when the output of a spider callback is evaluated, only instances of :class:`BaseItem` or :class:`dict` are passed to :ref:`item pipelines <topics-item-pipeline>`.
If you need instances of a custom class to be considered items by Scrapy, you must inherit from either :class:`BaseItem` or :class:`dict`.
Unlike instances of :class:`dict`, instances of :class:`BaseItem` may be :ref:`tracked <topics-leaks-trackrefs>` to debug memory leaks.
Expand source code
class Event(scrapy.Item): organization = custom_field() title = custom_field() description = custom_field() address = address_field() lat = numeric_field() lon = numeric_field() url = url_field() price = price_field() category = category_field() event_time = date_field() geocode_id = scrapy.Field()
Ancestors
- scrapy.item.Item
- scrapy.item.DictItem
- collections.abc.MutableMapping
- collections.abc.Mapping
- collections.abc.Collection
- collections.abc.Sized
- collections.abc.Iterable
- collections.abc.Container
- scrapy.item.BaseItem
- scrapy.utils.trackref.object_ref
Class variables
var fields
-
dict() -> new empty dictionary
dict(mapping) -> new dictionary initialized from a mapping object's (key, value) pairs
dict(iterable) -> new dictionary initialized as if via:
    d = {}
    for k, v in iterable:
        d[k] = v
dict(**kwargs) -> new dictionary initialized with the name=value pairs in the keyword argument list. For example: dict(one=1, two=2)
class EventLoader (*args, **kwargs)
-
Expand source code
class EventLoader(): def __init__(self, *args, **kwargs): item_loader = ItemLoader(item=Event()) for key, value in kwargs.items(): try: item_loader.add_value(key, value) except KeyError: raise KeyError(f'{key} is not a valid event field') self.item = item_loader.load_item()
class EventManager
-
Expand source code
class EventManager: def __init__(self): self.events = {} def update(self, key, event): # Add properties to the event if it has been created already, else create a new event if key in self.events: self.events[key].update(event) else: self.events[key] = event def to_dicts(self): return [dict(event) for event in list(self.events.values())]
Methods
def to_dicts(self)
-
Expand source code
def to_dicts(self): return [dict(event) for event in list(self.events.values())]
def update(self, key, event)
-
Expand source code
def update(self, key, event): # Add properties to the event if it has been created already, else create a new event if key in self.events: self.events[key].update(event) else: self.events[key] = event