Module event_processor.models.event

Expand source code
import scrapy
import re
import usaddress
from event_processor.util.time_utils import TimeUtils
from event_processor.util.data_utils import DataUtils
from scrapy.loader.processors import MapCompose, Compose, Join, TakeFirst
from scrapy.loader import ItemLoader

def custom_field():
    """Build a general-purpose text field.

    Every input value is passed through ``DataUtils.remove_html`` and the
    collected values are combined by the ``Join`` output processor.
    """
    clean_input = MapCompose(DataUtils.remove_html)
    combine_output = Join()
    return scrapy.Field(
        input_processor=clean_input,
        output_processor=combine_output,
    )

def numeric_field():
    """Build a single-valued field (used for ``lat`` / ``lon``).

    Every input value is passed through ``DataUtils.remove_html``; the
    ``TakeFirst`` output processor then selects one value. No numeric
    conversion is applied by this field itself.
    """
    clean_input = MapCompose(DataUtils.remove_html)
    pick_one = TakeFirst()
    return scrapy.Field(
        input_processor=clean_input,
        output_processor=pick_one,
    )

def price_field():
    """Build the price field.

    Input pipeline, applied to each value in order:
      1. remove any ``$`` characters from string values,
      2. pass the value through ``DataUtils.remove_html``,
      3. convert the result with ``float``.

    The ``TakeFirst`` output processor then selects a single value.
    """
    def strip_currency_symbol(value):
        # isinstance (not an exact type comparison) so that str subclasses
        # also have the '$' removed before float() is attempted.
        if isinstance(value, str):
            return value.replace('$', '')
        return value

    return scrapy.Field(
        input_processor=MapCompose(
            strip_currency_symbol,
            DataUtils.remove_html,
            float),
        output_processor=TakeFirst())

def url_field():
    """Build the URL field.

    Each input value is passed through ``DataUtils.remove_html`` and then
    normalized: accidental duplicate slashes are collapsed to one and any
    trailing slashes are removed. The ``//`` that directly follows a colon
    (as in ``https://``) is left intact so the scheme separator survives.
    Values are combined by the ``Join`` output processor.
    """
    def normalize_slashes(value):
        # A plain replace('//', '/') would also turn 'https://host' into
        # 'https:/host'; the negative lookbehind skips slashes after ':'.
        return re.sub(r'(?<!:)/{2,}', '/', value).rstrip('/')

    return scrapy.Field(
        input_processor=MapCompose(DataUtils.remove_html, normalize_slashes),
        output_processor=Join())

def category_field():
    """Build the category field: no input processing, values combined by ``Join``."""
    combine_output = Join()
    return scrapy.Field(output_processor=combine_output)

def address_field():
    """Build the address field.

    Each input value is passed through ``DataUtils.remove_html`` and then
    through ``parse_address``, which appends a default city and/or state
    when ``usaddress`` does not label any token as ``PlaceName`` /
    ``StateName``. Values are combined by the ``Join`` output processor.
    """
    def parse_address(value):
        """Return ``value`` with ' Chicago' / 'IL' defaults added where missing."""
        labels = {label for token, label in usaddress.parse(value) if token}
        has_city = "PlaceName" in labels
        has_state = "StateName" in labels

        if has_city and has_state:
            return value
        if not has_city and not has_state:
            return f'{value} Chicago, IL'
        if has_city:
            # Only the state is missing. Separate it from the city explicitly;
            # appending a bare "IL" would glue it to the preceding token
            # (e.g. "... ChicagoIL").
            return f'{value.rstrip(" ,")}, IL'
        # Only the city is missing: the default city is appended after the
        # existing text, exactly as before.
        # NOTE(review): this places the city after the state and leaves a
        # trailing ", " - confirm whether downstream geocoding relies on it.
        return f'{value} Chicago, '

    return scrapy.Field(
        input_processor=MapCompose(
            DataUtils.remove_html,
            parse_address),
        output_processor=Join())

def date_field():
    """Build the event-time field.

    Each input value is passed through ``DataUtils.remove_html`` and then
    ``parse_date``; the ``TakeFirst`` output processor selects one result.
    """
    def parse_date(value):
        """Turn a mapping of date/time data into start/end timestamps.

        ``value`` must contain a ``'date_format'`` key. Its entries are laid
        over the defaults from ``create_time_data()`` before being handed to
        ``TimeUtils.get_timestamps``.
        """
        converter = TimeUtils(date_format=value['date_format'])
        # Start from the all-None template so every expected key is present
        merged = create_time_data()
        merged.update(value)
        begin, finish = converter.get_timestamps(merged)
        return {'start_timestamp': begin, 'end_timestamp': finish}

    date_pipeline = MapCompose(DataUtils.remove_html, parse_date)
    return scrapy.Field(input_processor=date_pipeline, output_processor=TakeFirst())

def create_time_data():
    """Return a fresh template of every supported date/time key, all set to None.

    When creating an event, fill in only the keys that match how the data is
    formatted on the site being scraped.
    """
    supported_keys = (
        # A single time for the event (not a time range)
        'time',
        # Distinct start and end times supplied separately by the site
        'start_time',
        'end_time',
        # Start and end time in one string, e.g. 6:00-8:00 PM
        'time_range',
        # One string that may describe a single day or several days
        # (some sites mix both forms in the same field)
        'date',
        # Distinct start and end dates supplied separately by the site
        'start_date',
        'end_date',
        # Data already formatted like Unix timestamps
        'start_timestamp',
        'end_timestamp',
    )
    return dict.fromkeys(supported_keys)

class Event(scrapy.Item):
    """Scrapy item describing one event.

    Each attribute is a ``scrapy.Field`` whose input/output processors are
    configured by the field factory functions defined in this module.
    """
    # Text fields built by custom_field()
    organization = custom_field()
    title = custom_field()
    description = custom_field()
    # Address text; parse_address may append a default city/state
    address = address_field()
    # Coordinates; single-valued, no numeric conversion in the field itself
    lat = numeric_field()
    lon = numeric_field()
    # URL with slash normalization applied by url_field()
    url = url_field()
    # Price converted with float() after '$' is removed
    price = price_field()
    # Category values, joined without input processing
    category = category_field()
    # Dict with 'start_timestamp' and 'end_timestamp' produced by date_field()
    event_time = date_field()
    # Plain field with no processors configured
    geocode_id = scrapy.Field()

class EventLoader:
    """Build an ``Event`` item from keyword arguments.

    Every keyword argument is added to an ``ItemLoader`` bound to a new
    ``Event``; the loaded item is stored on ``self.item``. Positional
    arguments are accepted but not used.

    Raises:
        KeyError: if adding a value raises ``KeyError``; the message names
            the offending key as not being a valid event field.
    """
    def __init__(self, *args, **kwargs):
        item_loader = ItemLoader(item=Event())
        for key, value in kwargs.items():
            try:
                item_loader.add_value(key, value)
            except KeyError as err:
                # Chain explicitly so the original lookup failure stays
                # attached to the friendlier error as its cause.
                raise KeyError(f'{key} is not a valid event field') from err
        self.item = item_loader.load_item()

class EventManager:
    """Collect events under caller-supplied keys, merging repeated updates."""

    def __init__(self):
        # Maps key -> event object (any mapping that supports .update())
        self.events = {}

    def update(self, key, event):
        """Store ``event`` under ``key``, or merge it into the event already there."""
        if key not in self.events:
            # First sighting of this key: keep the event object itself
            self.events[key] = event
            return
        self.events[key].update(event)

    def to_dicts(self):
        """Return a list with a plain ``dict`` copy of every stored event."""
        return list(map(dict, self.events.values()))

Functions

def address_field()
Expand source code
def address_field():
    """Build the address field.

    Each input value is passed through ``DataUtils.remove_html`` and then
    through ``parse_address``, which appends a default city and/or state
    when ``usaddress`` does not label any token as ``PlaceName`` /
    ``StateName``. Values are combined by the ``Join`` output processor.
    """
    def parse_address(value):
        """Return ``value`` with ' Chicago' / 'IL' defaults added where missing."""
        labels = {label for token, label in usaddress.parse(value) if token}
        has_city = "PlaceName" in labels
        has_state = "StateName" in labels

        if has_city and has_state:
            return value
        if not has_city and not has_state:
            return f'{value} Chicago, IL'
        if has_city:
            # Only the state is missing. Separate it from the city explicitly;
            # appending a bare "IL" would glue it to the preceding token
            # (e.g. "... ChicagoIL").
            return f'{value.rstrip(" ,")}, IL'
        # Only the city is missing: the default city is appended after the
        # existing text, exactly as before.
        # NOTE(review): this places the city after the state and leaves a
        # trailing ", " - confirm whether downstream geocoding relies on it.
        return f'{value} Chicago, '

    return scrapy.Field(
        input_processor=MapCompose(
            DataUtils.remove_html,
            parse_address),
        output_processor=Join())
def category_field()
Expand source code
def category_field():
    """Build the category field: no input processing, values combined by ``Join``."""
    combine_output = Join()
    return scrapy.Field(output_processor=combine_output)
def create_time_data()
Expand source code
def create_time_data():
    """Return a fresh template of every supported date/time key, all set to None.

    When creating an event, fill in only the keys that match how the data is
    formatted on the site being scraped.
    """
    supported_keys = (
        # A single time for the event (not a time range)
        'time',
        # Distinct start and end times supplied separately by the site
        'start_time',
        'end_time',
        # Start and end time in one string, e.g. 6:00-8:00 PM
        'time_range',
        # One string that may describe a single day or several days
        # (some sites mix both forms in the same field)
        'date',
        # Distinct start and end dates supplied separately by the site
        'start_date',
        'end_date',
        # Data already formatted like Unix timestamps
        'start_timestamp',
        'end_timestamp',
    )
    return dict.fromkeys(supported_keys)
def custom_field()
Expand source code
def custom_field():
    """Build a general-purpose text field.

    Every input value is passed through ``DataUtils.remove_html`` and the
    collected values are combined by the ``Join`` output processor.
    """
    clean_input = MapCompose(DataUtils.remove_html)
    combine_output = Join()
    return scrapy.Field(
        input_processor=clean_input,
        output_processor=combine_output,
    )
def date_field()
Expand source code
def date_field():
    """Build the event-time field.

    Each input value is passed through ``DataUtils.remove_html`` and then
    ``parse_date``; the ``TakeFirst`` output processor selects one result.
    """
    def parse_date(value):
        """Turn a mapping of date/time data into start/end timestamps.

        ``value`` must contain a ``'date_format'`` key. Its entries are laid
        over the defaults from ``create_time_data()`` before being handed to
        ``TimeUtils.get_timestamps``.
        """
        converter = TimeUtils(date_format=value['date_format'])
        # Start from the all-None template so every expected key is present
        merged = create_time_data()
        merged.update(value)
        begin, finish = converter.get_timestamps(merged)
        return {'start_timestamp': begin, 'end_timestamp': finish}

    date_pipeline = MapCompose(DataUtils.remove_html, parse_date)
    return scrapy.Field(input_processor=date_pipeline, output_processor=TakeFirst())
def numeric_field()
Expand source code
def numeric_field():
    """Build a single-valued field (used for ``lat`` / ``lon``).

    Every input value is passed through ``DataUtils.remove_html``; the
    ``TakeFirst`` output processor then selects one value. No numeric
    conversion is applied by this field itself.
    """
    clean_input = MapCompose(DataUtils.remove_html)
    pick_one = TakeFirst()
    return scrapy.Field(
        input_processor=clean_input,
        output_processor=pick_one,
    )
def price_field()
Expand source code
def price_field():
    """Build the price field.

    Input pipeline, applied to each value in order:
      1. remove any ``$`` characters from string values,
      2. pass the value through ``DataUtils.remove_html``,
      3. convert the result with ``float``.

    The ``TakeFirst`` output processor then selects a single value.
    """
    def strip_currency_symbol(value):
        # isinstance (not an exact type comparison) so that str subclasses
        # also have the '$' removed before float() is attempted.
        if isinstance(value, str):
            return value.replace('$', '')
        return value

    return scrapy.Field(
        input_processor=MapCompose(
            strip_currency_symbol,
            DataUtils.remove_html,
            float),
        output_processor=TakeFirst())
def url_field()
Expand source code
def url_field():
    """Build the URL field.

    Each input value is passed through ``DataUtils.remove_html`` and then
    normalized: accidental duplicate slashes are collapsed to one and any
    trailing slashes are removed. The ``//`` that directly follows a colon
    (as in ``https://``) is left intact so the scheme separator survives.
    Values are combined by the ``Join`` output processor.
    """
    def normalize_slashes(value):
        # A plain replace('//', '/') would also turn 'https://host' into
        # 'https:/host'; the negative lookbehind skips slashes after ':'.
        return re.sub(r'(?<!:)/{2,}', '/', value).rstrip('/')

    return scrapy.Field(
        input_processor=MapCompose(DataUtils.remove_html, normalize_slashes),
        output_processor=Join())

Classes

class Event (*args, **kwargs)

Base class for all scraped items.

In Scrapy, an object is considered an item if it is an instance of either `BaseItem` or `dict`. For example, when the output of a spider callback is evaluated, only instances of `BaseItem` or `dict` are passed to item pipelines.

If you need instances of a custom class to be considered items by Scrapy, you must inherit from either `BaseItem` or `dict`.

Unlike instances of `dict`, instances of `BaseItem` may be tracked (see Scrapy's trackref utilities) to debug memory leaks.

Expand source code
class Event(scrapy.Item):
    """Scrapy item describing one event.

    Each attribute is a ``scrapy.Field`` whose input/output processors are
    configured by the field factory functions defined in this module.
    """
    # Text fields built by custom_field()
    organization = custom_field()
    title = custom_field()
    description = custom_field()
    # Address text; parse_address may append a default city/state
    address = address_field()
    # Coordinates; single-valued, no numeric conversion in the field itself
    lat = numeric_field()
    lon = numeric_field()
    # URL with slash normalization applied by url_field()
    url = url_field()
    # Price converted with float() after '$' is removed
    price = price_field()
    # Category values, joined without input processing
    category = category_field()
    # Dict with 'start_timestamp' and 'end_timestamp' produced by date_field()
    event_time = date_field()
    # Plain field with no processors configured
    geocode_id = scrapy.Field()

Ancestors

  • scrapy.item.Item
  • scrapy.item.DictItem
  • collections.abc.MutableMapping
  • collections.abc.Mapping
  • collections.abc.Collection
  • collections.abc.Sized
  • collections.abc.Iterable
  • collections.abc.Container
  • scrapy.item.BaseItem
  • scrapy.utils.trackref.object_ref

Class variables

var fields

dict() -> new empty dictionary; dict(mapping) -> new dictionary initialized from a mapping object's (key, value) pairs; dict(iterable) -> new dictionary initialized as if via `d = {}` followed by `for k, v in iterable: d[k] = v`; dict(**kwargs) -> new dictionary initialized with the name=value pairs in the keyword argument list. For example: dict(one=1, two=2)

class EventLoader (*args, **kwargs)
Expand source code
class EventLoader:
    """Build an ``Event`` item from keyword arguments.

    Every keyword argument is added to an ``ItemLoader`` bound to a new
    ``Event``; the loaded item is stored on ``self.item``. Positional
    arguments are accepted but not used.

    Raises:
        KeyError: if adding a value raises ``KeyError``; the message names
            the offending key as not being a valid event field.
    """
    def __init__(self, *args, **kwargs):
        item_loader = ItemLoader(item=Event())
        for key, value in kwargs.items():
            try:
                item_loader.add_value(key, value)
            except KeyError as err:
                # Chain explicitly so the original lookup failure stays
                # attached to the friendlier error as its cause.
                raise KeyError(f'{key} is not a valid event field') from err
        self.item = item_loader.load_item()
class EventManager
Expand source code
class EventManager:
    """Collect events under caller-supplied keys, merging repeated updates."""

    def __init__(self):
        # Maps key -> event object (any mapping that supports .update())
        self.events = {}

    def update(self, key, event):
        """Store ``event`` under ``key``, or merge it into the event already there."""
        if key not in self.events:
            # First sighting of this key: keep the event object itself
            self.events[key] = event
            return
        self.events[key].update(event)

    def to_dicts(self):
        """Return a list with a plain ``dict`` copy of every stored event."""
        return list(map(dict, self.events.values()))

Methods

def to_dicts(self)
Expand source code
def to_dicts(self):
    """Return a list with a plain ``dict`` copy of every event in ``self.events``."""
    stored_events = self.events.values()
    return [dict(item) for item in stored_events]
def update(self, key, event)
Expand source code
def update(self, key, event):
    """Store ``event`` under ``key``, or merge it into the event already stored there."""
    if key not in self.events:
        # First sighting of this key: keep the event object itself
        self.events[key] = event
        return
    self.events[key].update(event)