Notes/Offline analysis
Here are some snippets to work with the notes dump described at Notes#Download_notes.
Python
Either:
1. run python3 and copy-paste these snippets, or
2. make an executable script by pasting the line #!/usr/bin/env python3 followed by these snippets.
From .OSN to Python generator
from __future__ import annotations
import dataclasses
from collections.abc import Iterator
from dataclasses import dataclass
from typing import Literal
from xml.etree import ElementTree
@dataclass(frozen=True)
class Note:
    """A note is a sequence of comments at given coordinates.

    To determine whether a note is open or closed, check that `closed_at` is None.
    You may be tempted to sort comments by timestamp and
    look at the `action` of the comment with highest timestamp,
    but note that
    - some notes have a comment with action = 'commented' and one with
      action = 'closed' at the same timestamp (instead of having a single
      comment that both closes the note and provides the closing comment)
    - some notes have a comment with action = 'closed' and a comment
      with action = 'reopened' at the same timestamp (resolved and reactivated
      within a second)
    """

    # All scalar fields are kept as strings, exactly as read from the XML
    # attributes of the <note> element, so no precision or formatting is lost.
    id_: str
    comments: tuple[Comment, ...]
    lat: str
    lon: str
    created_at: str
    closed_at: str | None
    """None means that it was missing in the source data."""
@dataclass(frozen=True)
class Comment:
    """A comment is something that happens on a note.

    Opening, closing, and reopening a note are also represented as comments,
    distinguished by `action`.
    """

    # Id of the <note> element this comment belongs to.
    note_id: str
    # What this comment did to the note.
    action: Literal['opened', 'closed', 'reopened', 'commented']
    text: str | None
    """None means that it was missing in the source data."""
    # Timestamp string exactly as found in the XML attribute.
    timestamp: str
    uid: str | None
    """None means that it was missing in the source data."""
    user: str | None
    """None means that it was missing in the source data."""
def notes_from_elementtree(f) -> Iterator[Note]:
    """Lazily yield one `Note` per <note> element of the notes XML dump.

    :param f:
        File-like object.
    """
    elem: ElementTree.Element
    for _, elem in ElementTree.iterparse(f, events=['end']):
        if elem.tag != 'note':
            continue
        note_id = elem.attrib['id']
        comments = tuple(
            Comment(
                note_id=note_id,
                action=c.attrib['action'],
                text=c.text,
                timestamp=c.attrib['timestamp'],
                uid=c.attrib.get('uid', None),
                user=c.attrib.get('user', None),
            )
            for c in elem.findall('comment')
        )
        yield Note(
            id_=note_id,
            lat=elem.attrib['lat'],
            lon=elem.attrib['lon'],
            created_at=elem.attrib['created_at'],
            closed_at=elem.get('closed_at', None),
            comments=comments,
        )
        # Per <stackoverflow.com/a/9814580>, clear each element after its
        # note has been yielded, so RAM is not filled with notes that are no
        # longer needed; the XML file is too big to keep fully in memory.
        # Only <note> elements are cleared (the guard above skips everything
        # else), so the nested <comment> elements are still intact when the
        # `end` event of their containing <note> fires.
        elem.clear()
From Python generator to JSON
1. Copy the code from #From .OSN to Python generator.
2. Adapt and paste this
JSON_PATH = '/path/to/output.json'
3. If you are working with an *.osn file, then
OSN_PATH = '/path/to/notes.osn' # <- Change this.
# `pathlib` must be imported here: this snippet (step 3) runs before the
# imports pasted in step 4, so `pathlib` is not yet in scope at this point
# (the *.osn.bz2 variant below likewise imports `bz2` inline).
import pathlib
osn_reader = pathlib.Path(OSN_PATH).open(encoding='utf-8')
Otherwise, if you are working with a compressed *.osn.bz2 file, then
OSN_BZ2_PATH = '/path/to/notes.osn.bz2' # <- Change this.
import bz2
osn_reader = bz2.open(OSN_BZ2_PATH, mode='rt', encoding='utf-8')
4. Paste this.
import json
import pathlib
def in_bbox(lat: float, lon: float, w: float, s: float, e: float, n: float) -> bool:
    """Return True when (lat, lon) lies strictly inside the bounding box.

    :param w: western bound (minimum longitude); must be less than `e`.
    :param s: southern bound (minimum latitude); must be less than `n`.
    :param e: eastern bound (maximum longitude).
    :param n: northern bound (maximum latitude).
    """
    return s < lat < n and w < lon < e


def in_italy(lat: float, lon: float) -> bool:
    """Return True when (lat, lon) falls inside Italy's approximate bounding boxes.

    Example taken from [[Italy#Bounding boxes]].
    How to determine the coordinates of a bounding box using QGis:
    - Layer - Create Layer - New Temporary Scratch Layer - set Geometry type to Polygon - click Ok
    - in the toolbar, make sure that `Toggle Editing` is enabled
    - in the toolbar, enable `Add Polygon Feature`
    - draw polygons
    - in the toolbar, enable `Select Features by Area or Single Click`
    - select a polygon, by clicking on it
    - paste this in the python console and press Enter
      iface.activeLayer().selectedFeatures()[0].geometry().boundingBox().toString()
    """
    return (
        # Italia.
        (
            in_bbox(lat, lon, w=7.48, s=38.35, e=13.90, n=46.66)
            or in_bbox(lat, lon, w=6.58, s=43.98, e=7.70, n=46.12)
            or in_bbox(lat, lon, w=12.79, s=41.85, e=14.78, n=43.20)
            or in_bbox(lat, lon, w=11.87, s=36.28, e=18.65, n=42.26)
            or in_bbox(lat, lon, w=10.26, s=46.45, e=12.86, n=47.20)
        )
        # Corsica (inside the boxes above, but French, hence excluded).
        # BUGFIX: the bounds were w=9.7, e=8 (west greater than east), which
        # made `w < lon < e` always False, so the exclusion was a no-op and
        # Corsican notes were wrongly reported as Italian.
        and not in_bbox(lat, lon, w=8, s=41.34, e=9.7, n=43)
    )
def print_completion_status(
    current_note_id: str,
    total_notes: int = 4700000,
    step: int = 20000,
) -> None:
    """Print a progress line once every `step` note ids.

    :param current_note_id: id of the note being processed (a numeric string).
    :param total_notes: estimate of the planet's note count, used only for
        the percentage shown; defaults to the historical hard-coded estimate.
    :param step: print only when the id is a multiple of this, so the output
        stays readable while scanning millions of notes.
    """
    # Parse once instead of converting the same string twice.
    note_number = int(current_note_id)
    if note_number % step == 0:
        print(
            f"({100 * note_number / total_notes:.0f}%)"
            f" Analyzed {current_note_id} of the planet's ~{total_notes} notes."
        )
# Stream a JSON array to disk by hand: `[`, comma-separated note objects, `]`.
# Writing note-by-note avoids holding every matching note in memory at once.
with pathlib.Path(JSON_PATH).open('w', encoding='utf-8') as json_output:
    # `_ =` deliberately discards the character count returned by `write`.
    _ = json_output.write('[')
    is_first_note = True
    with osn_reader as xml_input:
        for note in notes_from_elementtree(xml_input):
            print_completion_status(note.id_)
            # Keep only the notes inside the area of interest.
            if not in_italy(float(note.lat), float(note.lon)):
                continue
            if note.closed_at is not None:
                # `continue` means to skip the processing of this note
                # and continue with the next note.
                # If you only want to process the open notes, use `continue` here;
                # otherwise, replace `continue` with `pass`.
                continue
            # A comma goes before every object except the first, so the
            # array stays valid JSON.
            if is_first_note:
                is_first_note = False
            else:
                _ = json_output.write(',')
            _ = json_output.write(json.dumps(dataclasses.asdict(note)))
    _ = json_output.write(']')
From JSON to CSV
1. Follow #From Python generator to JSON.
2. Adapt and paste this
CSV_PATH = '/path/to/output.csv'
JSON_PATH = '/path/to/input.json'
3. Paste this.
import csv
import json
import pathlib
notes = json.loads(pathlib.Path(JSON_PATH).read_text(encoding='utf-8'))
# BUGFIX: per https://docs.python.org/3/library/csv.html#csv.writer the file
# given to csv.writer must be opened with newline='', otherwise embedded
# newlines are mistranslated and Windows output gets '\r\r\n' line endings.
with pathlib.Path(CSV_PATH).open('w', encoding='utf-8', newline='') as csv_output:
    # More options to customize the CSV are at
    # https://docs.python.org/3/library/csv.html#csv.writer
    # https://docs.python.org/3/library/csv.html#writer-objects
    writer = csv.writer(csv_output)
    for n in notes:
        # NOTE(review): assumes every note has at least one comment; a note
        # with an empty `comments` list would raise IndexError — confirm
        # against the JSON produced by the previous section.
        first_comment = n['comments'][0]['text']
        # `user` is None when it was missing in the source data.
        author = n['comments'][0]['user'] or 'anonymous'
        _ = writer.writerow([
            n['id_'],
            n['lat'],
            n['lon'],
            first_comment,
            n['created_at'],
            author,
        ])
Note that Microsoft Excel's default CSV import wizard has a well-known issue with newline characters and will not import correctly. To work around it, do any of these.
- Per the Microsoft help, double-click the file in the file manager window (also known as Explorer) before launching Excel. Do not use the file-open wizard from the File menu.
- Import with the Power Query tool (details are in the linked guides). Power Query is under the Data menu and is documented in the linked references.
- Use LibreOffice, visidata, or another standards-compliant spreadsheet.
- Replace first_comment with first_comment.replace('\r\n', ' · ').replace('\r', ' · ').replace('\n', ' · ')
Search the JSON
1. Follow #From Python generator to JSON.
2. Adapt and paste this
JSON_PATH = '/path/to/input.json'
3. Paste this.
import json
import pathlib
notes = json.loads(pathlib.Path(JSON_PATH).read_text(encoding='utf-8'))
for n in notes:
    first_comment = n['comments'][0]['text']
    # BUGFIX: `text` can be missing in the source data, in which case it is
    # stored as null in the JSON and loaded as None here; skip such notes
    # instead of crashing on `None.lower()`.
    if first_comment is None:
        continue
    # `in` does a case-sensitive search.
    # To do a case-insensitive search, use a lowercase query
    # and add `.lower()` to the variable to be searched.
    if (
        'bank' in first_comment.lower()
        and 'streetcomplete' not in first_comment.lower()
    ):
        print(f"osm.org/note/{n['id_']}")
        print(first_comment)
        print('\n\n')
Find notes that are close to each other
Produce a list of notes that are very close to each other; they are potential duplicates.
Python
1. Follow #From Python generator to JSON.
2. Adapt and paste this
JSON_PATH = '/path/to/input.json'
3. Paste this.
import json
import pathlib
from collections import defaultdict
from collections.abc import Iterator
# In this section a note is simply the plain dict loaded back from the JSON.
Note = dict[str, str]
notes: list[Note] = json.loads(pathlib.Path(JSON_PATH).read_text(encoding='utf-8'))
# Keys come from int(), hence the dict is keyed by int.
tiles: dict[int, list[Note]] = defaultdict(list)
"""
Optional optimization to split the area into tiles.
Tune the formula so that each tile contains at most a few hundred notes.
Without this optimization, every note is compared with every other (possibly distant) note.
30000 notes mean ~1 billion comparisons, which takes too much time on the average laptop.
"""
for n in notes:
    # Bucket each note by int(20 * (lat + lon)).
    # NOTE(review): two nearby notes whose lat+lon sums fall on opposite
    # sides of an integer boundary land in different tiles and are never
    # compared below — an accepted trade-off of this optimization; confirm
    # it is acceptable for your use case.
    tiles[int(float(n['lat']) * 20 + float(n['lon']) * 20)].append(n)
def distant_less_than(distance: float, n1: Note, n2: Note, /) -> bool:
    """Return True when `n1` and `n2` differ by less than `distance` on both axes."""
    delta_lat = abs(float(n1['lat']) - float(n2['lat']))
    delta_lon = abs(float(n1['lon']) - float(n2['lon']))
    return delta_lat < distance and delta_lon < distance
def notes_close_to_each_other() -> Iterator[tuple[Note, Note]]:
    """Yield each pair of notes that lie very close to each other.

    Only notes sharing a tile are compared. The id comparison (lexicographic,
    since ids are strings) is a strict order, so every unordered pair is
    reported exactly once and self-pairs are skipped.
    """
    for tile_notes in tiles.values():
        for first in tile_notes:
            for second in tile_notes:
                ids_ordered = first['id_'] < second['id_']
                if ids_ordered and distant_less_than(0.00002, first, second):
                    yield first, second
# Render the close pairs as a table; lat/lon are taken from the first note
# of each pair (the second is within 0.00002 of it anyway).
# NOTE(review): the border rows use box-drawing characters (│) while the
# data rows use the ASCII pipe (|) with unpadded fields, so columns only
# line up for ids and coordinates of matching widths — cosmetic only.
print("┌─────────┬─────────┬────────────┬────────────┐")
print("│ id │ id │ lat │ lon │")
print("├─────────┼─────────┼────────────┼────────────┤")
for n1, n2 in notes_close_to_each_other():
    print(f"| {n1['id_']} | {n2['id_']} | {n1['lat']} | {n1['lon']} |")
print("└─────────┴─────────┴────────────┴────────────┘")
SQLite
If you have imported the notes in SQLite instead of Python, you can replicate the tiling technique, to reduce the number of comparisons by a few orders of magnitude and keep the query efficient on the average laptop, like this.
CREATE TEMP TABLE Tiles AS
SELECT note_id, lat, lon,
       cast(lat * 20 AS INTEGER) + cast(lon * 20 AS INTEGER) AS tile
FROM ...;

-- Index for the following query. As always, this was suggested by `.expert`.
-- Note that the order of the columns is important here, as SQLite's WHERE
-- implementation only uses indices which have the WHERE's columns as their
-- first columns.
CREATE INDEX idx_tiles ON Tiles(tile, lat, lon, note_id);

SELECT *
FROM Tiles note1
JOIN Tiles note2
WHERE note1.tile = note2.tile
  AND ABS(note1.lat - note2.lat) < 0.00002
  AND ABS(note1.lon - note2.lon) < 0.00002
  AND note1.note_id < note2.note_id;