eventcally/project/scrape/scrape_hi.py
Daniel Grams d63f340384 Internal/modules (#1)
* Restructured app
* Added Travis CI
* Added 'Deploy to heroku' button
2020-11-13 12:24:26 +01:00

193 lines
7.3 KiB
Python

from project import app, db
from pprint import pprint
import datetime
from dateutil import parser, tz
import pytz
from urllib import request, parse
from urllib.request import urlopen, URLError
from bs4 import BeautifulSoup
import requests
from os import path
import json
from flask import jsonify
import re
import unicodedata
import decimal
from project.models import EventReviewStatus, EventTargetGroupOrigin, Location, Event, EventStatus, EventCategory, EventPlace, EventOrganizer, AdminUnit
from sqlalchemy import and_, or_, not_
from project.dateutils import berlin_tz
from project.services.admin_unit import get_admin_unit
from project.services.event import upsert_event_category, update_event_dates_with_recurrence_rule
admin_unit = get_admin_unit('Harzinfo')
category = upsert_event_category('Other')
base_url = "https://www.harzinfo.de"
url = base_url + "/?ndssearch=fullsearch&no_cache=1&L=0"
with open('scrape_hi_req.json') as json_file:
request_object = json.load(json_file)
with open('scrape_hi_cities.json') as json_file:
cities = json.load(json_file)
def response_from_url(city):
body = request_object
body["searchFilter"]["ndsdestinationdataevent"]["city"] = { str(city['id']): city['short_name'] or city['title'] }
req = request.Request(url, data=bytes(json.dumps(body), encoding='utf-8'))
req.add_header('Content-Type', 'application/json')
return request.urlopen(req)
def load_json(debug, city):
if debug:
filename = "tmp/hi_%d.html" % (city['id'])
if not path.exists(filename):
response = response_from_url(city)
with open(filename, "wb") as text_file:
text_file.write(response.read())
with open(filename) as json_file:
return json.load(json_file)
else:
response = response_from_url(city)
return json.load(response)
def parse_date_time_str(date_time_str):
if not date_time_str:
return None
return datetime.datetime.fromisoformat(date_time_str + ':00')
def scrape(debug, city):
# Organizer
organizer_name = city['short_name'] or city['title']
organizer = EventOrganizer.query.filter(and_(
EventOrganizer.admin_unit_id == admin_unit.id,
EventOrganizer.name == organizer_name)).first()
if organizer is None:
organizer = EventOrganizer(
admin_unit_id = admin_unit.id,
name = organizer_name)
db.session.add(organizer)
db.session.commit()
print(organizer_name)
response = load_json(debug, city)
result = response["result"]
event_ids = list()
for item in result:
try:
uid = str(item["uid"])
external_link = base_url + item["link"] + '#' + uid
event = Event.query.filter(and_(Event.organizer_id == organizer.id, Event.external_link == external_link)).first()
did_create = False
if event is None:
event = Event()
event.admin_unit = admin_unit
event.organizer = organizer
did_create = True
event_ids.append(event.id)
event.category = category
event.external_link = external_link
event.review_status = EventReviewStatus.verified
event.rating = int(item["rating"])
event.name = item["title"]
event.description = item["title"]
start = parse_date_time_str(item['date'])
update_event_dates_with_recurrence_rule(event, start, None)
# Place
place_name = item["location"]
place_description = ""
place_location = None
if 'latitude' in item and 'longitude' in item:
meeting_point_latitude = item['latitude']
meeting_point_longitude = item['longitude']
if meeting_point_latitude and meeting_point_longitude:
latitude = decimal.Decimal(meeting_point_latitude)
longitude = decimal.Decimal(meeting_point_longitude)
if latitude != 0 and longitude != 0:
place_location = Location()
place_location.latitude = latitude
place_location.longitude = longitude
place = EventPlace.query.filter(and_(
EventPlace.admin_unit_id == admin_unit.id,
EventPlace.organizer_id == organizer.id,
EventPlace.name == place_name)).first()
if place is None:
place = EventPlace(
admin_unit_id = admin_unit.id,
organizer_id = organizer.id,
name = place_name)
place.description = place_description
place.location = place_location
event.event_place = place
# Additional data
event.status = EventStatus.cancelled if item['canceled'] else EventStatus.scheduled
if 'categories' in item:
tag_list = list(item['categories'].values())
if 'Ausstellung/Kunst' in tag_list:
event.category = upsert_event_category('Art')
elif 'Comedy' in tag_list:
event.category = upsert_event_category('Comedy')
elif 'Konzert/Musik' in tag_list:
event.category = upsert_event_category('Music')
elif 'Theater' in tag_list:
event.category = upsert_event_category('Theater')
elif 'Genuss/Gourmet' in tag_list:
event.category = upsert_event_category('Dining')
elif 'Gesundheit/Wellness' in tag_list:
event.category = upsert_event_category('Fitness')
elif 'Kinder/Jugend' in tag_list:
event.category = upsert_event_category('Family')
elif 'Markt/Flohmarkt' in tag_list:
event.category = upsert_event_category('Shopping')
elif 'Sport' in tag_list:
event.category = upsert_event_category('Sports')
elif 'Vortrag/Lesung' in tag_list:
event.category = upsert_event_category('Book')
elif 'Kabarett' in tag_list:
event.category = upsert_event_category('Art')
elif 'Musical' in tag_list:
event.category = upsert_event_category('Theater')
elif 'Weihnachtsmärkte' in tag_list:
event.category = upsert_event_category('Festival')
elif 'Stadt- und Volksfeste' in tag_list:
event.category = upsert_event_category('Festival')
if 'Kinder/Jugend' in tag_list:
event.kid_friendly = True
tag_list.append('Harzinfo')
event.tags = ','.join(tag_list)
print("%s %s %d" % (event.dates[0].start, event.name, event.rating))
if did_create:
db.session.add(event)
db.session.commit()
except:
print("Exception")
pprint(item)
Event.query.filter(and_(Event.admin_unit_id == admin_unit.id, Event.organizer_id == organizer.id, not_(Event.id.in_(event_ids)))).delete(synchronize_session='fetch')
db.session.commit()
if __name__ == '__main__':
for city in cities.values():
scrape(False, city)