Notes/Offline analysis
Here are some snippets to work with the notes dump described at Notes#Download_notes.
Python
Either:
1. run python3 and copy-paste these snippets, or
2. make an executable script by pasting the line #!/usr/bin/env python3 followed by these snippets.
From .OSN to Python generator
from __future__ import annotations
import dataclasses
from collections.abc import Iterator
from dataclasses import dataclass
from typing import Literal
from xml.etree import ElementTree
@dataclass(frozen=True)
class Note:
    """A note is a sequence of comments at given coordinates.

    To determine whether a note is open or closed, check that `closed_at` is None.
    You may be tempted to sort comments by timestamp and
    look at the `action` of the comment with highest timestamp,
    but note that
    - some notes have a comment with action = 'commented' and one with
      action = 'closed' at the same timestamp (instead of having a single
      comment that both closes the note and provides the closing comment)
    - some notes have a comment with action = 'closed' and a comment
      with action = 'reopened' at the same timestamp (resolved and reactivated
      within a second)
    """

    # All scalar fields are kept as strings, exactly as read from the XML
    # attributes of the <note> element, so no precision or formatting is lost.
    id_: str
    comments: tuple[Comment, ...]
    lat: str
    lon: str
    created_at: str
    closed_at: str | None
    """None means that it was missing in the source data."""
@dataclass(frozen=True)
class Comment:
    """A comment is something that happens on a note.

    Opening, closing, and reopening a note are also represented as comments,
    distinguished by `action`.
    """

    # Id of the <note> element this comment belongs to.
    note_id: str
    # What this comment did to the note.
    action: Literal['opened', 'closed', 'reopened', 'commented']
    text: str | None
    """None means that it was missing in the source data."""
    # Timestamp string exactly as found in the XML attribute.
    timestamp: str
    uid: str | None
    """None means that it was missing in the source data."""
    user: str | None
    """None means that it was missing in the source data."""
def notes_from_elementtree(f) -> Iterator[Note]:
    """Lazily yield one `Note` per <note> element of the notes XML dump.

    :param f:
        File-like object.
    """
    elem: ElementTree.Element
    for _, elem in ElementTree.iterparse(f, events=['end']):
        if elem.tag != 'note':
            continue
        note_id = elem.attrib['id']
        comments = tuple(
            Comment(
                note_id=note_id,
                action=c.attrib['action'],
                text=c.text,
                timestamp=c.attrib['timestamp'],
                uid=c.attrib.get('uid', None),
                user=c.attrib.get('user', None),
            )
            for c in elem.findall('comment')
        )
        yield Note(
            id_=note_id,
            lat=elem.attrib['lat'],
            lon=elem.attrib['lon'],
            created_at=elem.attrib['created_at'],
            closed_at=elem.get('closed_at', None),
            comments=comments,
        )
        # Per <stackoverflow.com/a/9814580>, clear each element after its
        # note has been yielded, so RAM is not filled with notes that are no
        # longer needed; the XML file is too big to keep fully in memory.
        # Only <note> elements are cleared (the guard above skips everything
        # else), so the nested <comment> elements are still intact when the
        # `end` event of their containing <note> fires.
        elem.clear()
From Python generator to JSON
1. Copy the code from #From .OSN to Python generator.
2. Adapt and paste this
JSON_PATH = '/path/to/output.json'
3. If you are working with an *.osn file, then
OSN_PATH = '/path/to/notes.osn' # <- Change this.
# `pathlib` must be imported here: this snippet (step 3) runs before the
# imports pasted in step 4, so `pathlib` is not yet in scope at this point
# (the *.osn.bz2 variant below likewise imports `bz2` inline).
import pathlib
osn_reader = pathlib.Path(OSN_PATH).open(encoding='utf-8')
Otherwise, if you are working with a compressed *.osn.bz2 file, then
OSN_BZ2_PATH = '/path/to/notes.osn.bz2' # <- Change this.
import bz2
osn_reader = bz2.open(OSN_BZ2_PATH, mode='rt', encoding='utf-8')
4. Paste this.
import json
import pathlib
def in_bbox(lat: float, lon: float, w: float, s: float, e: float, n: float) -> bool:
    """Return True when (lat, lon) lies strictly inside the bounding box.

    :param w: western bound (minimum longitude); must be less than `e`.
    :param s: southern bound (minimum latitude); must be less than `n`.
    :param e: eastern bound (maximum longitude).
    :param n: northern bound (maximum latitude).
    """
    return s < lat < n and w < lon < e


def in_italy(lat: float, lon: float) -> bool:
    """Return True when (lat, lon) falls inside Italy's approximate bounding boxes.

    Example taken from [[Italy#Bounding boxes]].
    How to determine the coordinates of a bounding box using QGis:
    - Layer - Create Layer - New Temporary Scratch Layer - set Geometry type to Polygon - click Ok
    - in the toolbar, make sure that `Toggle Editing` is enabled
    - in the toolbar, enable `Add Polygon Feature`
    - draw polygons
    - in the toolbar, enable `Select Features by Area or Single Click`
    - select a polygon, by clicking on it
    - paste this in the python console and press Enter
      iface.activeLayer().selectedFeatures()[0].geometry().boundingBox().toString()
    """
    return (
        # Italia.
        (
            in_bbox(lat, lon, w=7.48, s=38.35, e=13.90, n=46.66)
            or in_bbox(lat, lon, w=6.58, s=43.98, e=7.70, n=46.12)
            or in_bbox(lat, lon, w=12.79, s=41.85, e=14.78, n=43.20)
            or in_bbox(lat, lon, w=11.87, s=36.28, e=18.65, n=42.26)
            or in_bbox(lat, lon, w=10.26, s=46.45, e=12.86, n=47.20)
        )
        # Corsica (inside the boxes above, but French, hence excluded).
        # BUGFIX: the bounds were w=9.7, e=8 (west greater than east), which
        # made `w < lon < e` always False, so the exclusion was a no-op and
        # Corsican notes were wrongly reported as Italian.
        and not in_bbox(lat, lon, w=8, s=41.34, e=9.7, n=43)
    )
def print_completion_status(
    current_note_id: str,
    total_notes: int = 4700000,
    step: int = 20000,
) -> None:
    """Print a progress line once every `step` note ids.

    :param current_note_id: id of the note being processed (a numeric string).
    :param total_notes: estimate of the planet's note count, used only for
        the percentage shown; defaults to the historical hard-coded estimate.
    :param step: print only when the id is a multiple of this, so the output
        stays readable while scanning millions of notes.
    """
    # Parse once instead of converting the same string twice.
    note_number = int(current_note_id)
    if note_number % step == 0:
        print(
            f"({100 * note_number / total_notes:.0f}%)"
            f" Analyzed {current_note_id} of the planet's ~{total_notes} notes."
        )
# Stream a JSON array to disk by hand: `[`, comma-separated note objects, `]`.
# Writing note-by-note avoids holding every matching note in memory at once.
with pathlib.Path(JSON_PATH).open('w', encoding='utf-8') as json_output:
    # `_ =` deliberately discards the character count returned by `write`.
    _ = json_output.write('[')
    is_first_note = True
    with osn_reader as xml_input:
        for note in notes_from_elementtree(xml_input):
            print_completion_status(note.id_)
            # Keep only the notes inside the area of interest.
            if not in_italy(float(note.lat), float(note.lon)):
                continue
            if note.closed_at is not None:
                # `continue` means to skip the processing of this note
                # and continue with the next note.
                # If you only want to process the open notes, use `continue` here;
                # otherwise, replace `continue` with `pass`.
                continue
            # A comma goes before every object except the first, so the
            # array stays valid JSON.
            if is_first_note:
                is_first_note = False
            else:
                _ = json_output.write(',')
            _ = json_output.write(json.dumps(dataclasses.asdict(note)))
    _ = json_output.write(']')
From JSON to CSV
1. Follow #From Python generator to JSON.
2. Adapt and paste this
CSV_PATH = '/path/to/output.csv'
JSON_PATH = '/path/to/input.json'
3. Paste this.
import csv
import json
import pathlib
notes = json.loads(pathlib.Path(JSON_PATH).read_text(encoding='utf-8'))
# BUGFIX: per https://docs.python.org/3/library/csv.html#csv.writer the file
# given to csv.writer must be opened with newline='', otherwise embedded
# newlines are mistranslated and Windows output gets '\r\r\n' line endings.
with pathlib.Path(CSV_PATH).open('w', encoding='utf-8', newline='') as csv_output:
    # More options to customize the CSV are at
    # https://docs.python.org/3/library/csv.html#csv.writer
    # https://docs.python.org/3/library/csv.html#writer-objects
    writer = csv.writer(csv_output)
    for n in notes:
        # NOTE(review): assumes every note has at least one comment; a note
        # with an empty `comments` list would raise IndexError — confirm
        # against the JSON produced by the previous section.
        first_comment = n['comments'][0]['text']
        # `user` is None when it was missing in the source data.
        author = n['comments'][0]['user'] or 'anonymous'
        _ = writer.writerow([
            n['id_'],
            n['lat'],
            n['lon'],
            first_comment,
            n['created_at'],
            author,
        ])
Note that Microsoft Excel's default CSV import wizard has a well-known issue with newline characters and will not import correctly. To work around it, do any of these.
- Per the Microsoft help, double-click the file in the file manager window (also known as Explorer) before launching Excel. Do not use the file-open wizard from the File menu.
- Import with the Power Query tool (details are in the linked guides). Power Query is under the Data menu and is documented in the linked references.
- Use LibreOffice, visidata, or another standards-compliant spreadsheet.
- Replace first_comment with first_comment.replace('\r\n', ' · ').replace('\r', ' · ').replace('\n', ' · ')
Search the JSON
1. Follow #From Python generator to JSON.
2. Adapt and paste this
JSON_PATH = '/path/to/input.json'
3. Paste this.
import json
import pathlib
notes = json.loads(pathlib.Path(JSON_PATH).read_text(encoding='utf-8'))
for n in notes:
    first_comment = n['comments'][0]['text']
    # BUGFIX: `text` can be missing in the source data, in which case it is
    # stored as null in the JSON and loaded as None here; skip such notes
    # instead of crashing on `None.lower()`.
    if first_comment is None:
        continue
    # `in` does a case-sensitive search.
    # To do a case-insensitive search, use a lowercase query
    # and add `.lower()` to the variable to be searched.
    if (
        'bank' in first_comment.lower()
        and 'streetcomplete' not in first_comment.lower()
    ):
        print(f"osm.org/note/{n['id_']}")
        print(first_comment)
        print('\n\n')
Find notes that are close to each other
Produce a list of notes that are very close to each other; they are potential duplicates.
Python
1. Follow #From Python generator to JSON.
2. Adapt and paste this
JSON_PATH = '/path/to/input.json'
3. Paste this.
import json
import pathlib
from collections import defaultdict
from collections.abc import Iterator
# In this section a note is simply the plain dict loaded back from the JSON.
Note = dict[str, str]
notes: list[Note] = json.loads(pathlib.Path(JSON_PATH).read_text(encoding='utf-8'))
# Keys come from int(), hence the dict is keyed by int.
tiles: dict[int, list[Note]] = defaultdict(list)
"""
Optional optimization to split the area into tiles.
Tune the formula so that each tile contains at most a few hundred notes.
Without this optimization, every note is compared with every other (possibly distant) note.
30000 notes mean ~1 billion comparisons, which takes too much time on the average laptop.
"""
for n in notes:
    # Bucket each note by int(20 * (lat + lon)).
    # NOTE(review): two nearby notes whose lat+lon sums fall on opposite
    # sides of an integer boundary land in different tiles and are never
    # compared below — an accepted trade-off of this optimization; confirm
    # it is acceptable for your use case.
    tiles[int(float(n['lat']) * 20 + float(n['lon']) * 20)].append(n)
def distant_less_than(distance: float, n1: Note, n2: Note, /) -> bool:
    """Return True when `n1` and `n2` differ by less than `distance` on both axes."""
    delta_lat = abs(float(n1['lat']) - float(n2['lat']))
    delta_lon = abs(float(n1['lon']) - float(n2['lon']))
    return delta_lat < distance and delta_lon < distance
def notes_close_to_each_other() -> Iterator[tuple[Note, Note]]:
    """Yield each pair of notes that lie very close to each other.

    Only notes sharing a tile are compared. The id comparison (lexicographic,
    since ids are strings) is a strict order, so every unordered pair is
    reported exactly once and self-pairs are skipped.
    """
    for tile_notes in tiles.values():
        for first in tile_notes:
            for second in tile_notes:
                ids_ordered = first['id_'] < second['id_']
                if ids_ordered and distant_less_than(0.00002, first, second):
                    yield first, second
# Render the close pairs as a table; lat/lon are taken from the first note
# of each pair (the second is within 0.00002 of it anyway).
# NOTE(review): the border rows use box-drawing characters (│) while the
# data rows use the ASCII pipe (|) with unpadded fields, so columns only
# line up for ids and coordinates of matching widths — cosmetic only.
print("┌─────────┬─────────┬────────────┬────────────┐")
print("│ id │ id │ lat │ lon │")
print("├─────────┼─────────┼────────────┼────────────┤")
for n1, n2 in notes_close_to_each_other():
    print(f"| {n1['id_']} | {n2['id_']} | {n1['lat']} | {n1['lon']} |")
print("└─────────┴─────────┴────────────┴────────────┘")
SQLite
If you have imported the notes in SQLite instead of Python, you can replicate the tiling technique, to reduce the number of comparisons by a few orders of magnitude and keep the query efficient on the average laptop, like this.
CREATE TEMP TABLE Tiles AS
SELECT note_id, lat, lon,
       cast(lat * 20 AS INTEGER) + cast(lon * 20 AS INTEGER) AS tile
FROM ...;

-- Index for the following query. As always, this was suggested by `.expert`.
-- Note that the order of the columns is important here, as SQLite's WHERE
-- implementation only uses indices which have the WHERE's columns as their
-- first columns.
CREATE INDEX idx_tiles ON Tiles(tile, lat, lon, note_id);

SELECT *
FROM Tiles note1
JOIN Tiles note2
WHERE note1.tile = note2.tile
  AND ABS(note1.lat - note2.lat) < 0.00002
  AND ABS(note1.lon - note2.lon) < 0.00002
  AND note1.note_id < note2.note_id;