diff --git a/project/scrape/scrape_fp.py b/project/scrape/scrape_fp.py deleted file mode 100644 index 54dce4b..0000000 --- a/project/scrape/scrape_fp.py +++ /dev/null @@ -1,237 +0,0 @@ -from project import ( - db, - get_admin_unit_by_name, - update_event_dates_with_recurrence_rule, - upsert_event_category, -) -from pprint import pprint -import datetime -import pytz -from urllib.request import urlopen -from bs4 import BeautifulSoup -from os import path -import json -import re -import decimal -from project.models import ( - EventReviewStatus, - EventTargetGroupOrigin, - Location, - Event, - EventStatus, - EventPlace, - EventOrganizer, -) -from sqlalchemy import and_ - -berlin_tz = pytz.timezone("Europe/Berlin") - - -def scrape(debug): - url = "https://goslar.feripro.de/programm/42/anmeldung/veranstaltungen" - - if debug: - filename = "tmp/fp.html" - - if not path.exists(filename): - response = urlopen(url) - with open(filename, "wb") as text_file: - text_file.write(response.read()) - - doc = BeautifulSoup(open(filename), "html.parser") - else: - response = urlopen(url) - doc = BeautifulSoup(response, "html.parser") - - js_assigns_regex = r"(\w*)\s*:\s*JSON\.parse\('(.*)'\)" - js_assigns = dict() - javascripts = doc.find_all("script") - for javascript in javascripts: - javascript_contents = ( - javascript.contents[0] if len(javascript.contents) > 0 else "" - ) - - if "window.fp_initial" in javascript_contents: - matches = re.findall(js_assigns_regex, javascript_contents, re.MULTILINE) - for match in matches: - key = match[0] - - if key not in ["events"]: - continue - - json_str = match[1] - decoded_json_str = ( - json_str.encode("utf-8") - .decode("unicode_escape") - .encode("latin-1") - .decode("utf-8") - ) - value = json.loads(decoded_json_str, strict=False) - js_assigns[key] = value - break - - admin_unit = get_admin_unit_by_name("Ferienpass Goslar") - category = upsert_event_category("Other") - - for js_event in js_assigns["events"]: - if "event_id" not in js_event: - continue - - event_id = js_event["event_id"] - if not event_id: - continue - - try: - external_link = url + "#" + str(event_id) - event = Event.query.filter(Event.external_link == external_link).first() - did_create = False - - # Event - if event is None: - - if js_event["name"] in [ - "Entfällt", - "Diese Veranstaltung muss leider ausfallen ...", - ]: - continue - - event = Event() - event.admin_unit = admin_unit - did_create = True - - event.category = category - event.external_link = external_link - event.review_status = EventReviewStatus.verified - event.rating = 5 - event.target_group_origin = EventTargetGroupOrigin.resident - event.name = js_event["name"] - event.description = js_event["description"] - start = parse_date_time_str(js_event["start"]) - end = parse_date_time_str(js_event["end"]) - update_event_dates_with_recurrence_rule(event, start, end) - - # Organizer - js_organizer = js_event["organizer"] - organizer_name = ( - js_event["name_public"] - if js_event["name_public"] - else js_organizer["name"] - ) - organizer_phone = ( - js_event["phone_public"] - if js_event["phone_public"] - else js_organizer["phone"] - ) - organizer_email = ( - js_event["email_public"] - if js_event["email_public"] - else js_organizer["email"] - ) - organizer_url = ( - js_organizer["website"] - if js_organizer["website"] - else js_organizer["facebook"] - ) - - organizer = EventOrganizer.query.filter( - and_( - EventOrganizer.admin_unit_id == admin_unit.id, - EventOrganizer.name == organizer_name, - ) - ).first() - - if organizer is None: - organizer = EventOrganizer( - admin_unit_id=admin_unit.id, name=organizer_name - ) - - organizer.phone = organizer_phone - organizer.email = organizer_email - organizer.url = organizer_url - event.organizer = organizer - - # Place - place_name = "" - place_description = "" - place_location = None - - meeting_point = js_event["meeting_point"].replace("\r\n", ", ") - if len(meeting_point) > 80: - place_name = meeting_point[:80] + "..." - place_description = meeting_point - else: - place_name = meeting_point - - if ( - "meeting_point_latitude" in js_event - and "meeting_point_longitude" in js_event - ): - meeting_point_latitude = js_event["meeting_point_latitude"] - meeting_point_longitude = js_event["meeting_point_longitude"] - if meeting_point_latitude and meeting_point_longitude: - latitude = decimal.Decimal(meeting_point_latitude) - longitude = decimal.Decimal(meeting_point_longitude) - if latitude != 0 and longitude != 0: - place_location = Location() - place_location.latitude = latitude - place_location.longitude = longitude - - place = EventPlace.query.filter( - and_( - EventPlace.admin_unit_id == admin_unit.id, - EventPlace.organizer_id == organizer.id, - EventPlace.name == place_name, - ) - ).first() - - if place is None: - place = EventPlace( - admin_unit_id=admin_unit.id, - organizer_id=organizer.id, - name=place_name, - ) - - place.description = place_description - place.location = place_location - event.event_place = place - - # Additional data - event.status = ( - EventStatus.cancelled if js_event["canceled"] else EventStatus.scheduled - ) - event.kid_friendly = True - event.accessible_for_free = js_event["price"] == "0.00" - - tag_list = js_event["tags"] - tag_list.append("Ferienpass") - event.tags = ",".join(tag_list) - - if js_event["min_age"]: - event.age_from = int(js_event["min_age"]) - - if js_event["max_age"]: - event.age_to = int(js_event["max_age"]) - - print( - "%s %s %s %s" - % (event.dates[0].start, event.name, organizer.id, organizer.name) - ) - if did_create: - db.session.add(event) - - db.session.commit() - except Exception: - print("Exception") - pprint(js_event) - - -def parse_date_time_str(date_time_str): - if not date_time_str: - return None - - date_time = datetime.datetime.fromisoformat(date_time_str) - return berlin_tz.localize(date_time) - - -if __name__ == "__main__": - scrape(False) diff --git a/project/scrape/scrape_hi.py b/project/scrape/scrape_hi.py deleted file mode 100644 index dc95b61..0000000 --- a/project/scrape/scrape_hi.py +++ /dev/null @@ -1,218 +0,0 @@ -from project import db -from pprint import pprint -import datetime -from urllib import request -from os import path -import json -import decimal -from project.models import ( - EventReviewStatus, - Location, - Event, - EventStatus, - EventPlace, - EventOrganizer, -) -from sqlalchemy import and_, not_ -from project.services.admin_unit import get_admin_unit_by_name -from project.services.event import ( - upsert_event_category, - update_event_dates_with_recurrence_rule, -) - -admin_unit = get_admin_unit_by_name("Harzinfo") -category = upsert_event_category("Other") -base_url = "https://www.harzinfo.de" -url = base_url + "/?ndssearch=fullsearch&no_cache=1&L=0" - -with open("scrape_hi_req.json") as json_file: - request_object = json.load(json_file) - -with open("scrape_hi_cities.json") as json_file: - cities = json.load(json_file) - - -def response_from_url(city): - body = request_object - body["searchFilter"]["ndsdestinationdataevent"]["city"] = { - str(city["id"]): city["short_name"] or city["title"] - } - req = request.Request(url, data=bytes(json.dumps(body), encoding="utf-8")) - req.add_header("Content-Type", "application/json") - return request.urlopen(req) - - -def load_json(debug, city): - if debug: - filename = "tmp/hi_%d.html" % (city["id"]) - - if not path.exists(filename): - response = response_from_url(city) - with open(filename, "wb") as text_file: - text_file.write(response.read()) - - with open(filename) as json_file: - return json.load(json_file) - else: - response = response_from_url(city) - return json.load(response) - - -def parse_date_time_str(date_time_str): - if not date_time_str: - return None - - return datetime.datetime.fromisoformat(date_time_str + ":00") - - -def scrape(debug, city): - - # Organizer - organizer_name = city["short_name"] or city["title"] - organizer = EventOrganizer.query.filter( - and_( - EventOrganizer.admin_unit_id == admin_unit.id, - EventOrganizer.name == organizer_name, - ) - ).first() - - if organizer is None: - organizer = EventOrganizer(admin_unit_id=admin_unit.id, name=organizer_name) - db.session.add(organizer) - db.session.commit() - - print(organizer_name) - response = load_json(debug, city) - result = response["result"] - event_ids = list() - - for item in result: - try: - uid = str(item["uid"]) - external_link = base_url + item["link"] + "#" + uid - event = Event.query.filter( - and_( - Event.organizer_id == organizer.id, - Event.external_link == external_link, - ) - ).first() - did_create = False - - if event is None: - event = Event() - event.admin_unit = admin_unit - event.organizer = organizer - did_create = True - - event_ids.append(event.id) - - event.category = category - event.external_link = external_link - event.review_status = EventReviewStatus.verified - event.rating = int(item["rating"]) - event.name = item["title"] - event.description = item["title"] - - start = parse_date_time_str(item["date"]) - update_event_dates_with_recurrence_rule(event, start, None) - - # Place - place_name = item["location"] - place_description = "" - place_location = None - - if "latitude" in item and "longitude" in item: - meeting_point_latitude = item["latitude"] - meeting_point_longitude = item["longitude"] - if meeting_point_latitude and meeting_point_longitude: - latitude = decimal.Decimal(meeting_point_latitude) - longitude = decimal.Decimal(meeting_point_longitude) - if latitude != 0 and longitude != 0: - place_location = Location() - place_location.latitude = latitude - place_location.longitude = longitude - - place = EventPlace.query.filter( - and_( - EventPlace.admin_unit_id == admin_unit.id, - EventPlace.organizer_id == organizer.id, - EventPlace.name == place_name, - ) - ).first() - - if place is None: - place = EventPlace( - admin_unit_id=admin_unit.id, - organizer_id=organizer.id, - name=place_name, - ) - - place.description = place_description - place.location = place_location - event.event_place = place - - # Additional data - event.status = ( - EventStatus.cancelled if item["canceled"] else EventStatus.scheduled - ) - - if "categories" in item: - tag_list = list(item["categories"].values()) - - if "Ausstellung/Kunst" in tag_list: - event.category = upsert_event_category("Art") - elif "Comedy" in tag_list: - event.category = upsert_event_category("Comedy") - elif "Konzert/Musik" in tag_list: - event.category = upsert_event_category("Music") - elif "Theater" in tag_list: - event.category = upsert_event_category("Theater") - elif "Genuss/Gourmet" in tag_list: - event.category = upsert_event_category("Dining") - elif "Gesundheit/Wellness" in tag_list: - event.category = upsert_event_category("Fitness") - elif "Kinder/Jugend" in tag_list: - event.category = upsert_event_category("Family") - elif "Markt/Flohmarkt" in tag_list: - event.category = upsert_event_category("Shopping") - elif "Sport" in tag_list: - event.category = upsert_event_category("Sports") - elif "Vortrag/Lesung" in tag_list: - event.category = upsert_event_category("Book") - elif "Kabarett" in tag_list: - event.category = upsert_event_category("Art") - elif "Musical" in tag_list: - event.category = upsert_event_category("Theater") - elif "Weihnachtsmärkte" in tag_list: - event.category = upsert_event_category("Festival") - elif "Stadt- und Volksfeste" in tag_list: - event.category = upsert_event_category("Festival") - - if "Kinder/Jugend" in tag_list: - event.kid_friendly = True - - tag_list.append("Harzinfo") - event.tags = ",".join(tag_list) - - print("%s %s %d" % (event.dates[0].start, event.name, event.rating)) - if did_create: - db.session.add(event) - - db.session.commit() - except Exception: - print("Exception") - pprint(item) - - Event.query.filter( - and_( - Event.admin_unit_id == admin_unit.id, - Event.organizer_id == organizer.id, - not_(Event.id.in_(event_ids)), - ) - ).delete(synchronize_session="fetch") - db.session.commit() - - -if __name__ == "__main__": - for city in cities.values(): - scrape(False, city) diff --git a/project/scrape/scrape_hi_cities.json b/project/scrape/scrape_hi_cities.json deleted file mode 100644 index 0628ecb..0000000 --- a/project/scrape/scrape_hi_cities.json +++ /dev/null @@ -1,206 +0,0 @@ -{ - "10": { - "district": null, - "id": 10, - "short_name": "", - "title": "Blankenburg" - }, - "105": { - "district": null, - "id": 105, - "short_name": "Tilleda", - "title": "Kelbra OT Tilleda" - }, - "111": { - "district": null, - "id": 111, - "short_name": "", - "title": "Lutherstadt Eisleben" - }, - "12": { - "district": null, - "id": 12, - "short_name": "", - "title": "Braunlage" - }, - "14": { - "district": null, - "id": 14, - "short_name": "", - "title": "Clausthal-Zellerfeld" - }, - "148": { - "district": null, - "id": 148, - "short_name": "", - "title": "Leinefelde-Worbis" - }, - "150": { - "district": null, - "id": 150, - "short_name": "", - "title": "Abbenrode" - }, - "17": { - "district": null, - "id": 17, - "short_name": "", - "title": "Duderstadt" - }, - "18": { - "district": null, - "id": 18, - "short_name": "", - "title": "Goslar" - }, - "2": { - "district": null, - "id": 2, - "short_name": "Torfhaus", - "title": "Altenau OT Torfhaus" - }, - "20": { - "district": null, - "id": 20, - "short_name": "Vienenburg", - "title": "Goslar OT Vienenburg" - }, - "21": { - "district": null, - "id": 21, - "short_name": "", - "title": "Goslar- Hahnenklee" - }, - "22": { - "district": null, - "id": 22, - "short_name": "", - "title": "Halberstadt" - }, - "23": { - "district": null, - "id": 23, - "short_name": "", - "title": "Harzgerode" - }, - "27": { - "district": null, - "id": 27, - "short_name": "", - "title": "Herzberg am Harz" - }, - "28": { - "district": null, - "id": 28, - "short_name": "", - "title": "Ilsenburg" - }, - "29": { - "district": null, - "id": 29, - "short_name": "Drübeck", - "title": "Ilsenburg OT Drübeck" - }, - "35": { - "district": null, - "id": 35, - "short_name": "Wolfshagen", - "title": "Langelsheim OT Wolfshagen" - }, - "37": { - "district": null, - "id": 37, - "short_name": "", - "title": "Nordhausen" - }, - "4": { - "district": null, - "id": 4, - "short_name": "", - "title": "Aschersleben" - }, - "42": { - "district": null, - "id": 42, - "short_name": "Höhlenort Rübeland", - "title": "Oberharz am Brocken OT Höhlenort Rübeland" - }, - "44": { - "district": null, - "id": 44, - "short_name": "Stiege", - "title": "Oberharz am Brocken OT Stiege" - }, - "47": { - "district": null, - "id": 47, - "short_name": "", - "title": "Osterode am Harz" - }, - "5": { - "district": null, - "id": 5, - "short_name": "Bad Gandersheim", - "title": "Bad Gandersheim" - }, - "50": { - "district": null, - "id": 50, - "short_name": "", - "title": "Quedlinburg" - }, - "6": { - "district": null, - "id": 6, - "short_name": "", - "title": "Bad Grund (Harz)" - }, - "60": { - "district": null, - "id": 60, - "short_name": "", - "title": "Thale" - }, - "63": { - "district": null, - "id": 63, - "short_name": "", - "title": "Walkenried" - }, - "65": { - "district": null, - "id": 65, - "short_name": "", - "title": "Wernigerode" - }, - "67": { - "district": null, - "id": 67, - "short_name": "Schierke", - "title": "Wernigerode OT Schierke" - }, - "7": { - "district": null, - "id": 7, - "short_name": "Bad Harzburg", - "title": "Bad Harzburg" - }, - "8": { - "district": null, - "id": 8, - "short_name": "", - "title": "Bad Lauterberg im Harz" - }, - "82": { - "district": null, - "id": 82, - "short_name": "", - "title": "Ballenstedt" - }, - "9": { - "district": null, - "id": 9, - "short_name": "", - "title": "Bad Sachsa" - } -} \ No newline at end of file diff --git a/project/scrape/scrape_hi_req.json b/project/scrape/scrape_hi_req.json deleted file mode 100644 index a687717..0000000 --- a/project/scrape/scrape_hi_req.json +++ /dev/null @@ -1,142 +0,0 @@ -{ - "contentId": 5801, - "languageId": 0, - "languageIsocode": "de", - "pageId": 637, - "searchFilter": { - "additionalSearchOptions": { - "keepFiltersForNewSearch": "0", - "showMainCategoriesInFilter": "0" - }, - "ndsdestinationdataevent": { - "bookableFilter": false, - "category": {}, - "city": { - "7": "Bad Harzburg" - }, - "cityMode": "fix", - "district": [], - "endDate": "2021-07-22", - "formOptions": { - "categories": { - "24": { - "id": 24, - "title": "Ausstellung/Kunst" - }, - "26": { - "id": 26, - "title": "Bergbau/Industriekultur" - }, - "27": { - "id": 27, - "title": "Brauchtum" - }, - "28": { - "id": 28, - "title": "Führung/Besichtigung" - }, - "29": { - "id": 29, - "title": "Genuss/Gourmet" - }, - "30": { - "id": 30, - "title": "Gesundheit/Wellness" - }, - "31": { - "id": 31, - "title": "Harzer KlosterSommer" - }, - "33": { - "id": 33, - "title": "Kinder/Jugend" - }, - "34": { - "id": 34, - "title": "Konzert/Musik" - }, - "35": { - "id": 35, - "title": "Luther" - }, - "36": { - "id": 36, - "title": "Markt/Flohmarkt" - }, - "40": { - "id": 40, - "title": "Silvester" - }, - "41": { - "id": 41, - "title": "Sonstiges" - }, - "42": { - "id": 42, - "title": "Sport" - }, - "43": { - "id": 43, - "title": "Stadt- und Volksfeste" - }, - "45": { - "id": 45, - "title": "Theater" - }, - "46": { - "id": 46, - "title": "TheaterNatur-Festival" - }, - "47": { - "id": 47, - "title": "Vortrag/Lesung" - }, - "49": { - "id": 49, - "title": "Wanderung" - }, - "50": { - "id": 50, - "title": "Weihnachtsmärkte" - }, - "51": { - "id": 51, - "title": "Comedy" - }, - "52": { - "id": 52, - "title": "Kabarett" - }, - "53": { - "id": 53, - "title": "Musical" - }, - "54": { - "id": 54, - "title": "Jubiläum" - }, - "55": { - "id": 55, - "title": "Weihnachten" - }, - "56": { - "id": 56, - "title": "Advent" - } - } - }, - "geolocation": { - "distance": "20000", - "latitude": 0, - "longitude": 0 - }, - "searchWithoutDate": true, - "searchWithoutDateBackupEnd": "2020-09-20", - "searchWithoutDateBackupStart": "2020-07-22", - "startDate": "2020-07-22" - } - }, - "sources": [ - "ndsdestinationdataevent" - ] -} \ No newline at end of file diff --git a/project/scrape/form.py b/tests/form.py similarity index 100% rename from project/scrape/form.py rename to tests/form.py diff --git a/tests/utils.py b/tests/utils.py index a583ec6..274b11f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -76,7 +76,7 @@ class UtilActions(object): ) def create_form_data(self, response, values: dict) -> dict: - from project.scrape.form import Form + from tests.form import Form soup = BeautifulSoup(response.data, "html.parser") form = Form(soup.find("form"))