Notes/Offline analysis

From OpenStreetMap Wiki
Jump to navigation Jump to search

Here are some snippets to work with the notes dump described at Notes#Download_notes.

Python

Either 1. run python3 and copy-paste these snippets, or 2. make an executable script by pasting the line #!/usr/bin/env python3 followed by these snippets.

From .OSN to Python generator

from __future__ import annotations


import dataclasses
from collections.abc import Iterator
from dataclasses import dataclass
from typing import Literal
from xml.etree import ElementTree


@dataclass(frozen=True)
class Note:
    """A sequence of comments anchored at a pair of coordinates.

    A note is open exactly when `closed_at` is None.  It may be tempting
    to decide open/closed by sorting the comments by timestamp and
    inspecting the `action` of the most recent one, but the dump contains

        - notes with a comment carrying action = 'commented' and another
          carrying action = 'closed' at the very same timestamp (instead
          of a single comment that both closes the note and provides the
          closing text), and

        - notes with a comment carrying action = 'closed' and another
          carrying action = 'reopened' at the very same timestamp
          (resolved and reactivated within a second).
    """

    id_: str
    comments: tuple[Comment, ...]
    lat: str
    lon: str
    created_at: str
    # None means that the attribute was missing in the source data.
    closed_at: str | None


@dataclass(frozen=True)
class Comment:
    """A single event (opening, closing, reopening, commenting) on a note."""

    note_id: str
    action: Literal['opened', 'closed', 'reopened', 'commented']
    # For `text`, `uid` and `user`, None means that the attribute
    # was missing in the source data.
    text: str | None
    timestamp: str
    uid: str | None
    user: str | None


def notes_from_elementtree(f) -> Iterator[Note]:
    """Return notes from ElementTree parsing.

    :param f:
        File-like object with the *.osn XML.
    """
    # 'start' events are requested only to capture the document root;
    # a <note> is complete (children included) only at its 'end' event.
    parser = ElementTree.iterparse(f, events=['start', 'end'])

    root = None
    node: ElementTree.Element
    for event, node in parser:
        if root is None:
            # The very first event is the 'start' of the document root.
            root = node
            continue
        if event != 'end' or node.tag != 'note':
            continue
        yield Note(
            id_=node.attrib['id'],
            lat=node.attrib['lat'],
            lon=node.attrib['lon'],
            created_at=node.attrib['created_at'],
            closed_at=node.get('closed_at', None),
            comments=tuple(
                Comment(
                    note_id=node.attrib['id'],
                    action=comment_node.attrib['action'],
                    text=comment_node.text,
                    timestamp=comment_node.attrib['timestamp'],
                    uid=comment_node.attrib.get('uid', None),
                    user=comment_node.attrib.get('user', None),
                )
                for comment_node in node.findall('comment')
            ),
        )
        # Per <stackoverflow.com/a/9814580>, free the parsed contents,
        # so the RAM is not filled with unused notes.  This is necessary,
        # because the XML file is too big to keep in memory all at once.
        # Clearing only `node` would still leave one empty <note> element
        # per note attached to the root, which adds up over millions of
        # notes; clearing the root drops them too.  This is safe here
        # because at the 'end' event of a <note>, the root's children are
        # only fully processed notes.  The yielded Note holds plain
        # strings, so it is unaffected by the clear.
        root.clear()

From Python generator to JSON

1. Copy the code from #From .OSN to Python generator.
2. Adapt and paste this

JSON_PATH = '/path/to/output.json'

3. If you are working with an *.osn file, then

OSN_PATH = '/path/to/notes.osn'  # <- Change this.
osn_reader = pathlib.Path(OSN_PATH).open(encoding='utf-8')

Otherwise, if you are working with a compressed *.osn.bz2 file, then

OSN_BZ2_PATH = '/path/to/notes.osn.bz2'  # <- Change this.
import bz2
osn_reader = bz2.open(OSN_BZ2_PATH, mode='rt', encoding='utf-8')

4. Paste this.

import json
import pathlib


def in_bbox(lat: float, lon: float, w: float, s: float, e: float, n: float) -> bool:
    """Tell whether (lat, lon) lies strictly inside the w/s/e/n bounding box."""
    inside_vertically = s < lat < n
    inside_horizontally = w < lon < e
    return inside_vertically and inside_horizontally


def in_italy(lat: float, lon: float) -> bool:
    """Tell whether the coordinates fall inside Italy (Corsica excluded)."""
    # Example taken from [[Italy#Bounding boxes]].

    # How to determine the coordinates of a bounding box using QGis:
    #   - Layer - Create Layer - New Temporary Scratch Layer - set Geometry type to Polygon - click Ok
    #   - in the toolbar, make sure that `Toggle Editing` is enabled
    #   - in the toolbar, enable `Add Polygon Feature`
    #   - draw polygons
    #   - in the toolbar, enable `Select Features by Area or Single Click`
    #   - select a polygon, by clicking on it
    #   - paste this in the python console and press Enter
    #       iface.activeLayer().selectedFeatures()[0].geometry().boundingBox().toString()

    return (
        # Italia.
        (
            in_bbox(lat, lon, w=7.48, s=38.35, e=13.90, n=46.66)
            or in_bbox(lat, lon, w=6.58, s=43.98, e=7.70, n=46.12)
            or in_bbox(lat, lon, w=12.79, s=41.85, e=14.78, n=43.20)
            or in_bbox(lat, lon, w=11.87, s=36.28, e=18.65, n=42.26)
            or in_bbox(lat, lon, w=10.26, s=46.45, e=12.86, n=47.20)
        )
        # Corsica (France) overlaps the Italia boxes above, so subtract it.
        # Bugfix: this box previously had w=9.7 and e=8 (west > east), so
        # `in_bbox` was vacuously False and the exclusion never applied.
        and not in_bbox(lat, lon, w=8, s=41.34, e=9.7, n=43)
    )


def print_completion_status(current_note_id: str) -> None:
    """Print a progress line once every 20000 notes."""
    total_notes = 4700000
    note_number = int(current_note_id)
    # Only report on round multiples, to keep the output short.
    if note_number % 20000 != 0:
        return
    percent = 100 * note_number / total_notes
    print(f"({percent:.0f}%) Analyzed {current_note_id} of the planet's ~{total_notes} notes.")


# Stream the filtered notes into a JSON array one note at a time,
# so the whole planet dump never has to sit in memory.
with pathlib.Path(JSON_PATH).open('w', encoding='utf-8') as json_output:
    _ = json_output.write('[')
    wrote_a_note = False
    with osn_reader as xml_input:
        for note in notes_from_elementtree(xml_input):
            print_completion_status(note.id_)
            if not in_italy(float(note.lat), float(note.lon)):
                continue
            if note.closed_at is not None:
                # `continue` means to skip the processing of this note
                # and continue with the next note.
                # If you only want to process the open notes, use `continue` here;
                # otherwise, replace `continue` with `pass`.
                continue
            # JSON array elements are comma-separated, so a comma goes
            # before every note except the first one.
            if wrote_a_note:
                _ = json_output.write(',')
            else:
                wrote_a_note = True
            _ = json_output.write(json.dumps(dataclasses.asdict(note)))
    _ = json_output.write(']')

From JSON to CSV

1. Follow #From Python generator to JSON.
2. Adapt and paste this

CSV_PATH = '/path/to/output.csv'
JSON_PATH = '/path/to/input.json'

3. Paste this.

import csv
import json
import pathlib


notes = json.loads(pathlib.Path(JSON_PATH).read_text(encoding='utf-8'))


with pathlib.Path(CSV_PATH).open('w', encoding='utf-8') as csv_output:

    # More options to customize the CSV are at
    # https://docs.python.org/3/library/csv.html#csv.writer
    # https://docs.python.org/3/library/csv.html#writer-objects
    writer = csv.writer(csv_output)

    # One CSV row per note: id, coordinates, opening comment, creation
    # date, author of the opening comment.
    for n in notes:
        opening = n['comments'][0]
        first_comment = opening['text']
        author = opening['user'] or 'anonymous'

        row = [
            n['id_'],
            n['lat'],
            n['lon'],
            first_comment,
            n['created_at'],
            author,
        ]
        _ = writer.writerow(row)

Note that Microsoft Excel's default CSV import wizard has a well-known issue with newline characters and will not import correctly. To work around it, do any of these.

  • Per the Microsoft help, double-click the file in the file manager window (also known as Explorer), before launching Excel. Do not use the file open wizard from the File menu.
  • Import with the Power Query tool. Details here and here. Power Query is under the menu Data and is documented here and here.
  • Use LibreOffice, visidata, or another standards-compliant spreadsheet.
  • Replace first_comment with first_comment.replace('\r\n', ' · ').replace('\r', ' · ').replace('\n', ' · ')

Search the JSON

1. Follow #From Python generator to JSON.
2. Adapt and paste this

JSON_PATH = '/path/to/input.json'

3. Paste this.

import json
import pathlib


notes = json.loads(pathlib.Path(JSON_PATH).read_text(encoding='utf-8'))


for n in notes:
    first_comment = n['comments'][0]['text']

    # Bugfix: the opening comment's text can be null in the dump
    # (it was missing in the source data), and calling `.lower()` on it
    # would raise AttributeError; there is nothing to search then.
    if first_comment is None:
        continue

    # `in` does a case-sensitive search.
    # To do a case-insensitive search, use a lowercase query
    # and add `.lower()` to the variable to be searched.
    # Lowercase once, instead of once per condition.
    first_comment_lower = first_comment.lower()

    if (
        'bank' in first_comment_lower
        and 'streetcomplete' not in first_comment_lower
    ):
        print(f"osm.org/note/{n['id_']}")
        print(first_comment)
        print('\n\n')


Find notes that are close to each other

Produce a list of notes that are very close to each other; they are potential duplicates.

Python

1. Follow #From Python generator to JSON.
2. Adapt and paste this

JSON_PATH = '/path/to/input.json'

3. Paste this.

import json
import pathlib
from collections import defaultdict
from collections.abc import Iterator


Note = dict[str, str]
notes: list[Note] = json.loads(pathlib.Path(JSON_PATH).read_text(encoding='utf-8'))


# Typefix: the keys below are produced by `int(...)`, so the mapping is
# keyed by `int`, not `float` as previously annotated.
tiles: dict[int, list[Note]] = defaultdict(list)
"""
Optional optimization to split the area into tiles.

Tune the formula so that each tile contains at most a few hundred notes.

Without this optimization, every note is compared with every other (possibly distant) note.
30000 notes mean ~1 billion comparisons, which takes too much time on the average laptop.
"""
for n in notes:
    tiles[int(float(n['lat']) * 20 + float(n['lon']) * 20)].append(n)


def distant_less_than(distance: float, n1: Note, n2: Note, /) -> bool:
    """Determine whether notes are distant from each other less than `distance`.

    Both the latitude gap and the longitude gap must be below `distance`.
    """
    lat_gap = abs(float(n1['lat']) - float(n2['lat']))
    lon_gap = abs(float(n1['lon']) - float(n2['lon']))
    return lat_gap < distance and lon_gap < distance


def notes_close_to_each_other() -> Iterator[tuple[Note, Note]]:
    """Yield the pairs of notes that are very close to each other.

    Only notes within the same tile are compared; the id comparison keeps
    each pair once and skips a note paired with itself.
    """
    for tile_notes in tiles.values():
        for first in tile_notes:
            for second in tile_notes:
                if first['id_'] >= second['id_']:
                    continue
                if distant_less_than(0.00002, first, second):
                    yield first, second


# Render the pairs as a table: both ids, then the coordinates of the
# first note of the pair.
# NOTE(review): the data rows use ASCII '|' while the frame uses
# box-drawing '│' — possibly intentional; confirm before changing.
print("┌─────────┬─────────┬────────────┬────────────┐")
print("│   id    │   id    │    lat     │    lon     │")
print("├─────────┼─────────┼────────────┼────────────┤")


for near1, near2 in notes_close_to_each_other():
    print(f"| {near1['id_']} | {near2['id_']} | {near1['lat']} | {near1['lon']} |")

print("└─────────┴─────────┴────────────┴────────────┘")

SQLite

If you have imported the notes in SQLite instead of Python, you can replicate the tiling technique, to reduce the number of comparisons by a few orders of magnitude and keep the query efficient on the average laptop, like this.

   -- Bucket every note into a coarse tile, so that the pair query below
   -- only compares notes that share a tile.
   CREATE TEMP TABLE Tiles AS
   SELECT note_id, lat, lon, cast(lat * 20 AS INTEGER) + cast(lon * 20 AS INTEGER) AS tile
     FROM ...;
   
   -- Index for the following query. As always, this was suggested by `.expert`.
   -- Note that the order of the columns is important here, as SQLite's WHERE
   -- implementation only uses indices which have the WHERE's columns as their
   -- first columns.
   CREATE INDEX idx_tiles ON Tiles(tile, lat, lon, note_id);
   
   -- Self-join within each tile; `note1.note_id < note2.note_id` keeps
   -- each pair only once and drops a note paired with itself.
   SELECT *
     FROM Tiles note1 JOIN Tiles note2
    WHERE note1.tile = note2.tile
      AND ABS(note1.lat - note2.lat) < 0.00002 AND ABS(note1.lon - note2.lon) < 0.00002
      AND note1.note_id < note2.note_id;