From 664c7c46a00a3e41ca99389514e318eac6edb9c2 Mon Sep 17 00:00:00 2001
From: jude
Date: Mon, 26 Feb 2024 13:02:43 +0000
Subject: [PATCH] Moved to single file

---
 .gitignore       |   1 +
 fetch_data.py    |  69 --------------------
 gpx.py           | 161 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   1 +
 to_gpx.py        |  31 ---------
 5 files changed, 163 insertions(+), 100 deletions(-)
 delete mode 100644 fetch_data.py
 create mode 100644 gpx.py
 delete mode 100644 to_gpx.py

diff --git a/.gitignore b/.gitignore
index 2518f65..f4b2f54 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,3 +163,4 @@ cython_debug/
 data.json
 *.osm
 *.gpx
+cache/
diff --git a/fetch_data.py b/fetch_data.py
deleted file mode 100644
index cb85128..0000000
--- a/fetch_data.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import re
-from io import StringIO
-import csv
-import json
-
-import requests
-
-
-def get_megalithic_data(country=1):
-    def megalithic_url(country):
-        return 'https://www.megalithic.co.uk/cache/csvmap_country{}.csv'.format(country)
-
-    # Megalithic doesn't really want people scraping
-    response = requests.get(
-        megalithic_url(country),
-        headers={
-            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0'
-        }
-    )
-    if response.status_code != 200:
-        raise Exception('Failed to fetch data from Megalithic.UK: {}'.format(response.text))
-
-    content = StringIO(response.text)
-    reader = csv.DictReader(content, delimiter='|')
-
-    data = []
-    for row in reader:
-        data.append({
-            'lat': row['lat'],
-            'lng': row['lng'],
-            'name': row['Name'],
-            'type': row['Type'],
-            'url': 'https://megalithic.co.uk/article.php?sid={}'.format(row['SID']),
-        })
-
-    return data
-
-
-def get_stone_circles_data():
-    response = requests.get('http://www.stone-circles.org.uk/stone/Cluster/Coords/coords.js')
-
-    if response.status_code != 200:
-        raise Exception('Failed to fetch data from stone-circles.org.uk: {}'.format(response.text))
-
-    content = re.match(
-        r'.+ = \[(\[.+]),?];',
-        response.text.replace('\n', '')
-    )
-    content = re.sub(r'\\(?!")', '', content.groups()[0])
-    arr = json.loads('[{}]'.format(content))
-
-    data = []
-    for item in arr:
-        data.append({
-            'lat': item[0],
-            'lng': item[1],
-            'name': re.sub(r'<.+?>', '', re.match(r'<b>(.+)</b>', item[2]).groups()[0]),
-            'type': re.sub(r'.+>', '', item[2].replace('<br>', ' ')),
-            'url': 'http://www.stone-circles.org.uk/stone/{}'.format(re.search(r'href=([a-zA-Z.]+)', item[2]).groups()[0]),
-        })
-
-    return data
-
-
-if __name__ == '__main__':
-    all_data = get_stone_circles_data()  # + get_megalithic_data()
-
-    with open('data.json', 'w') as f:
-        json.dump(all_data, f)
diff --git a/gpx.py b/gpx.py
new file mode 100644
index 0000000..c90b988
--- /dev/null
+++ b/gpx.py
@@ -0,0 +1,161 @@
+import csv
+import json
+import argparse
+from html import escape
+from pathlib import Path
+import re
+
+import requests
+
+parser = argparse.ArgumentParser(
+    prog="megalithosm",
+    description="Fetch data from megalith sources and produce a GPX file",
+)
+
+parser.add_argument(
+    "-q",
+    "--quality",
+    default=5,
+    type=int,
+    help="Include sites of this quality or higher",
+)
+parser.add_argument("-r", "--refetch", action="store_true", default=False)
+
+args = parser.parse_args()
+
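+# Upper bound (exclusive) for megalithic.co.uk's numeric country IDs: the
+# cache and post-processing loops below fetch one CSV per country, IDs 1-5.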
+country_range = 6
+
+
+def cache_stone_circles():
+    response = requests.get(
+        "http://www.stone-circles.org.uk/stone/Cluster/Coords/coords.js"
+    )
+    if response.status_code != 200:
+        raise Exception(
+            "Failed to fetch data from stone-circles.org.uk: {}".format(response.text)
+        )
+
+    Path("cache").mkdir(exist_ok=True)
+    with open("cache/stone-circles.js", "w") as f:
+        f.write(response.text)
+
+
+def cache_megalithic():
+    def megalithic_url(country):
+        return "https://www.megalithic.co.uk/cache/csvmap_country{}.csv".format(country)
+
+    for country in range(1, country_range):
+        # Megalithic doesn't really want people scraping
+        response = requests.get(
+            megalithic_url(country),
+            headers={
+                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
+            },
+        )
+        if response.status_code != 200:
+            raise Exception(
+                "Failed to fetch data from Megalithic.UK: {}".format(response.text)
+            )
+
+        Path("cache").mkdir(exist_ok=True)
+        with open("cache/megalithic-{}.csv".format(country), "w") as f:
+            f.write(response.text)
+
+
+if args.refetch or not Path("cache/stone-circles.js").exists():
+    print("Refreshing stone-circles.org.uk cache")
+    cache_stone_circles()
+
+if args.refetch or not all(
+    Path("cache/megalithic-{}.csv".format(c)).exists() for c in range(1, country_range)
+):
+    print("Refreshing megalithic.co.uk cache")
+    cache_megalithic()
+
+
+print("Post-processing data")
+data = []
+for country in range(1, country_range):
+    with open("cache/megalithic-{}.csv".format(country)) as f:
+        reader = csv.DictReader(f, delimiter="|")
+        types = set()
+        for row in reader:
+            types.add(row["Type"])
+            data.append(
+                {
+                    "lat": row["lat"],
+                    "lng": row["lng"],
+                    "name": row["Name"],
+                    "type": row["Type"],
+                    "url": "https://megalithic.co.uk/article.php?sid={}".format(
+                        row["SID"]
+                    ),
+                    "quality": int(row["Condition"]),
+                }
+            )
+
+with open("cache/stone-circles.js") as f:
+    content = f.read()
+    content = re.match(r".+ = \[(\[.+]),?];", content.replace("\n", ""))
+    content = re.sub(r'\\(?!")', "", content.groups()[0])
+    arr = json.loads("[{}]".format(content))
+    for item in arr:
+        data.append(
+            {
+                "lat": item[0],
+                "lng": item[1],
+                "name": re.sub(
+                    r"<.+?>", "", re.match(r"<b>(.+)</b>", item[2]).groups()[0]
+                ),
+                "type": re.sub(r".+>", "", item[2].replace("<br>", " ")),
+                "url": "http://www.stone-circles.org.uk/stone/{}".format(
+                    re.search(r"href=([a-zA-Z.]+)", item[2]).groups()[0]
+                ),
+                "quality": 5,
+            }
+        )
+
+
+print("Generating GPX")
+with open("Megaliths.gpx", "w") as gpx_file:
+    gpx_file.write(
+        """<?xml version="1.0" encoding="UTF-8"?>
+<gpx xmlns="http://www.topografix.com/GPX/1/1" version="1.1" creator="megalithosm">
+<metadata>
+    <name>Megalith sites</name>
+    <author>
+        <name>Jude Southworth</name>
+    </author>
+</metadata>
+"""
+    )
+
+    seen_sites = set()
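+    # Both sources list many of the same sites, so key each waypoint on its
+    # coordinates plus a whitespace-stripped, lowercased name and emit each
+    # key at most once.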
", " ")), + "url": "http://www.stone-circles.org.uk/stone/{}".format( + re.search(r"href=([a-zA-Z.]+)", item[2]).groups()[0] + ), + "quality": 5, + } + ) + + +print("Generating GPX") +with open("Megaliths.gpx", "w") as gpx_file: + gpx_file.write( + """ + + + Megalith sites + + Jude Southworth + + """ + ) + + seen_sites = set() + for poi in data: + norm_name = re.sub(r"\s", "", poi["name"]).lower() + site_key = "{},{},{}".format(poi["lat"], poi["lng"], norm_name) + if site_key in seen_sites: + print("Omitting duplicate site: {}".format(poi["name"])) + continue + if poi["quality"] < args.quality: + continue + if poi["type"] in ["Museum", "Modern Stone Circle etc"]: + print("Omitting uninteresting feature: {}".format(poi["name"])) + continue + + # Deduplicate entries + seen_sites.add(site_key) + name = "{} ({})".format(poi["name"], poi["type"].strip()) + gpx_file.write( + """ + + {} + {} + """.format( + poi["lat"], poi["lng"], escape(name), escape(poi["url"]) + ) + ) + gpx_file.write("""\n""") diff --git a/requirements.txt b/requirements.txt index cc8680d..5bc81e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ requests~=2.31 pandas~=2.2 +black~=24.2 diff --git a/to_gpx.py b/to_gpx.py deleted file mode 100644 index 82ae066..0000000 --- a/to_gpx.py +++ /dev/null @@ -1,31 +0,0 @@ -import json -from html import escape - -with open('data.json') as data_file: - data = json.load(data_file) - -with open('Megaliths.gpx', 'w') as gpx_file: - gpx_file.write(''' - - - Megalith sites - - Jude Southworth - - ''') - - for poi in data: - name = '{} ({})'.format(poi['name'], poi['type'].strip()) - - gpx_file.write( - ''' - - {} - {} - '''.format(poi['lat'], poi['lng'], escape(name), poi['url']) - ) - gpx_file.write('')