Finding Broken Websites in OSM Data

Posted by GrumpyGorilla on 5/18/2024

Recently, while updating business information in my area, I noticed that outdated POIs would often include a defunct website. When I attempted to follow the links in OSM, I was just presented with an error. That gave me the idea that I could proactively identify POIs that may need to be validated in person.

After some tinkering, I came up with a python script that will query OSM data for nodes with websites, then iterate over the websites to see which return errors when connecting. I started with logging these to a text file, but then realized that I could export them all to a kmz file and import them into Organic Maps. Now when I’m out and about I can launch Organic Maps to find areas that I should validate.

Screenshot of Organic Maps with pins through Amsterdam

When you select a pin, it’ll give you more details on the failure.

Screenshot of Organic Maps with one pin selected. It displays the website, HTTP error, and OSM link

While testing, I noticed that there are also many sites returning errors indicating that the specific page isn’t found or that the page can be found at a new location. I believe that these can be validated without surveying, so I set up a separate mode called “armchair” mode that highlights these errors.

Setup

NOTE: This will initiate connections from your machine to whatever websites exist in OSM

To use the script, copy the below file and save it as “broken-websites.py”. Before running I believe you’ll need to install the required packages by running the below commands in your terminal:

pip3 install simplekml
pip3 install overpy
pip3 install geopy
pip3 install requests

From there you can change the latitude, longitude, and radius at the top of the script. Keep in mind that a larger radius takes much longer, since the search area grows with the square of the radius. You can then run commands like:

# List all websites that return a 5XX error to a text file
python3 ./broken-websites.py survey txt
# Generate a kmz of all websites that redirect or return 404
python3 ./broken-websites.py armchair kmz
# Generate both list types to both file types
python3 ./broken-websites.py both both

Script:

"""Find OSM nodes whose ``website`` tag points at a broken site.

The script queries Overpass for nodes carrying a ``website`` tag around a
fixed point, sends an HTTP HEAD request to each site, and writes the failures
to a text file and/or a KMZ file.

Usage:
    python3 ./broken-websites.py <survey|armchair|both> <txt|kmz|both>
"""

import datetime
import html
import math
import sys

import overpy
import requests
import simplekml
from geopy.distance import geodesic

latitude = 52.377956
longitude = 4.897070
radius = .5  # Default radius in kilometers

# Degrees of latitude per kilometre, rounded up a little so the bounding box
# always over-covers the search circle; the exact geodesic check trims it.
_DEGREES_PER_KM = 1 / 110.0

# Absolute base so the links still work from inside a KMZ or a text file.
_OSM_BASE_URL = "https://www.openstreetmap.org"


def check_website(url):
    """Send a HEAD request to *url* and report the outcome.

    Redirects are deliberately not followed, so a 3XX answer is returned
    as-is and can be classified by the caller.

    Returns:
        ``(status_code, reason)`` when the server answered, or
        ``(None, error_text)`` when the request could not be completed.
    """
    try:
        response = requests.head(url, timeout=10, allow_redirects=False)
    except Exception as e:
        # Deliberately broad: URLs come from free-form OSM tags, and one bad
        # value must be recorded as a failure rather than abort the scan.
        return None, str(e)
    return response.status_code, response.reason


def get_amenities():
    """Return nodes with a ``website`` tag within ``radius`` km of the centre.

    A bounding box is used as a coarse pre-filter for the Overpass query; the
    geodesic distance then keeps only nodes inside the circle.

    Returns:
        A list of dicts with the keys ``name``, ``amenity_type``,
        ``latitude``, ``longitude``, ``website``, ``distance`` (km) and
        ``osm_link``.
    """
    api = overpy.Overpass()

    # Define bounding box. A degree of longitude shrinks with cos(latitude),
    # so the east-west half-width must be widened accordingly; otherwise
    # nodes inside the radius fall outside the box and are never fetched.
    lat_delta = radius * _DEGREES_PER_KM
    lon_scale = max(math.cos(math.radians(latitude)), 0.01)  # avoid /0 at poles
    lon_delta = lat_delta / lon_scale

    # Clamp to valid coordinates (wrapping across the antimeridian is not
    # handled).
    lat_min = max(latitude - lat_delta, -90.0)
    lat_max = min(latitude + lat_delta, 90.0)
    lon_min = max(longitude - lon_delta, -180.0)
    lon_max = min(longitude + lon_delta, 180.0)

    query_str = f"""
    node["website"]
    ({lat_min},{lon_min},{lat_max},{lon_max});
    out;
    """
    result = api.query(query_str)

    amenities = []
    for node in result.nodes:
        lat = float(node.lat)
        lon = float(node.lon)
        distance = geodesic((latitude, longitude), (lat, lon)).kilometers
        if distance > radius:
            continue
        amenities.append({
            "name": node.tags.get("name", "Unknown"),
            "amenity_type": node.tags.get("amenity", "Unknown"),
            "latitude": lat,
            "longitude": lon,
            "website": node.tags.get("website", "N/A"),
            "distance": distance,
            "osm_link": f"{_OSM_BASE_URL}/node/{node.id}",
        })
    return amenities


def _output_file_name(mode, timestamp, extension):
    """Build the output file name shared by the KMZ and text writers."""
    return (f"Broken_Websites_{mode}_Radius_{radius}km_Lat_{latitude}"
            f"_Long_{longitude}_{timestamp}.{extension}")


def save_to_kmz(amenities_with_broken_websites, mode, timestamp):
    """Write one placemark per broken website to a KMZ file.

    Args:
        amenities_with_broken_websites: Amenity dicts that also carry an
            ``error_message`` key.
        mode: Label used in the file name (``survey`` or ``armchair``).
        timestamp: Timestamp string used in the file name.
    """
    kml = simplekml.Kml()
    icon_url = 'https://upload.wikimedia.org/wikipedia/commons/e/ec/Red_dot.svg'  # Privacy-friendly icon URL

    for amenity in amenities_with_broken_websites:
        # Tag values and error text are untrusted; escape them so a quote or
        # angle bracket cannot break the HTML description.
        website = html.escape(amenity['website'], quote=True)
        error_message = html.escape(amenity['error_message'])
        osm_link = html.escape(amenity['osm_link'], quote=True)
        description = (
            f"<p>Website: <a href='{website}' target='_blank'>{website}</a></p>"
            f"<p>{error_message}</p>"
            f"<p><a href='{osm_link}' target='_blank'>Link to OSM</a></p>"
        )
        placemark = kml.newpoint(
            name=amenity['name'],
            description=description,
            coords=[(amenity['longitude'], amenity['latitude'])],
        )
        placemark.style.iconstyle.icon.href = icon_url

    file_name = _output_file_name(mode, timestamp, "kmz")
    kml.savekmz(file_name)
    print(f"KMZ file saved as: {file_name}")


def save_to_txt(amenities_with_broken_websites, mode, timestamp):
    """Write one record per broken website to a plain-text file.

    Args:
        amenities_with_broken_websites: Amenity dicts that also carry an
            ``error_message`` key.
        mode: Label used in the file name (``survey`` or ``armchair``).
        timestamp: Timestamp string used in the file name.
    """
    file_name = _output_file_name(mode, timestamp, "txt")
    # Explicit UTF-8: names often contain non-ASCII characters, and the
    # platform default encoding may not be able to represent them.
    with open(file_name, 'w', encoding='utf-8') as f:
        for amenity in amenities_with_broken_websites:
            f.write(f"Name: {amenity['name']}\n")
            f.write(f"Amenity Type: {amenity['amenity_type']}\n")
            f.write(f"Latitude: {amenity['latitude']}\n")
            f.write(f"Longitude: {amenity['longitude']}\n")
            f.write(f"Website: {amenity['website']}\n")
            f.write(f"Distance: {amenity['distance']:.2f} km\n")
            f.write(f"Error: {amenity['error_message']}\n")
            f.write(f"OpenStreetMap: {amenity['osm_link']}\n")
            f.write("\n")
    print(f"Text file saved as: {file_name}")


def filter_amenities(amenities):
    """Check every amenity's website and sort the failures into two lists.

    ``survey`` collects sites that could not be reached or answered 5XX;
    ``armchair`` collects sites that answered 3XX or 404. Each amenity that
    lands in a list gets an ``error_message`` key added.

    Returns:
        A dict with the keys ``survey`` and ``armchair``, each a list of
        amenity dicts.
    """
    filtered_amenities = {"survey": [], "armchair": []}
    for amenity in amenities:
        status_code, error_message = check_website(amenity['website'])

        if not status_code or 500 <= status_code < 600:
            bucket = "survey"
        elif 300 <= status_code < 400 or status_code == 404:
            bucket = "armchair"
        else:
            continue  # Site answered normally; nothing to report.

        amenity['error_message'] = f"Status Code: {status_code}, Error: {error_message}"
        filtered_amenities[bucket].append(amenity)
    return filtered_amenities


def main():
    """Parse the command line, run the scan and write the requested files."""
    if len(sys.argv) != 3:
        print("Usage: python script.py <mode> <output>")
        print("Mode should be one of: survey, armchair, both")
        print("Output should be one of: txt, kmz, both")
        return

    mode = sys.argv[1]
    output = sys.argv[2]

    if mode not in ["survey", "armchair", "both"]:
        print("Invalid mode. Mode should be one of: survey, armchair, both")
        return
    if output not in ["txt", "kmz", "both"]:
        print("Invalid output. Output should be one of: txt, kmz, both")
        return

    print(f"Searching for amenities with websites within {radius} km...")
    amenities = get_amenities()
    if not amenities:
        print("No amenities found within the specified radius or with specified website.")
        return

    print("\nChecking websites...")
    filtered_amenities = filter_amenities(amenities)
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    selected_modes = ("survey", "armchair") if mode == "both" else (mode,)
    for selected in selected_modes:
        broken = filtered_amenities[selected]
        if not broken:
            print(f"No amenities found with broken websites in {selected} mode.")
            continue

        print(f"Found {len(broken)} broken websites")
        if output in ["kmz", "both"]:
            save_to_kmz(broken, selected, timestamp)
        if output in ["txt", "both"]:
            save_to_txt(broken, selected, timestamp)


if __name__ == "__main__":
    main()