from dataclasses import dataclass
from datetime import datetime, timezone
import json
from typing import Iterator, Optional, Dict, Any, Sequence
from pathlib import Path
import sqlite3

from my.core import make_logger, Res, datetime_aware
from my.core.sqlite import sqlite_copy_and_open


logger = make_logger(__name__)


def inputs() -> Sequence[Path]:
    """Return the Zotero database path(s) to read: ``~/Zotero/zotero.sqlite``."""
    db = Path('~').expanduser() / 'Zotero' / 'zotero.sqlite'
    # todo eh... kinda pointless to return a list in this case... but maybe ok for consistency?
    # also naming the method input() will conflict with python builtin...
    return [db]


Url = str


@dataclass(frozen=True)
class Item:
    """Corresponds to 'Zotero item'"""
    file: Path
    title: str
    url: Optional[Url]
    tags: Sequence[str]


@dataclass
class Annotation:
    """A single annotation, together with the item it was made on."""
    item: Item
    added: datetime_aware
    # checked it and it's definitely utc

    page: int
    """0-indexed"""

    text: Optional[str]
    comment: Optional[str]
    tags: Sequence[str]
    color_hex: str
    """Original hex-encoded color in zotero"""

    @property
    def color_human(self) -> str:
        """Color name for ``color_hex``, or the hex string itself if unknown."""
        return _hex2human(self.color_hex)


def annotations() -> Iterator[Res[Annotation]]:
    """Yield every annotation from the database.

    Errors are yielded as values rather than raised, so one bad row does not
    stop the iteration.
    """
    for r in _query_raw():
        if isinstance(r, Exception):
            yield r
            continue
        # Only the parsing is guarded; the yield happens outside the try block.
        try:
            a = _parse_annotation(r)
        except Exception as e:
            yield e
        else:
            yield a


# type -- 1 is inline; 2 is note?
_QUERY = '''
SELECT A.itemID, A.parentItemID, F.parentItemID AS topItemID, text, comment, color, position, path, dateAdded
FROM      itemAnnotations AS A
LEFT JOIN itemAttachments AS F ON A.parentItemID = F.ItemID
LEFT JOIN items           AS I ON A.itemID       = I.itemID
'''

_QUERY_TAGS = '''
SELECT name
FROM      itemTags AS IT
LEFT JOIN tags     as T  ON IT.tagID = T.tagID
WHERE itemID = ?
'''.strip()

_QUERY_TITLE = '''
SELECT value AS title
FROM      itemData       AS ID
LEFT JOIN itemDataValues AS IDV ON ID.valueID == IDV.valueID
WHERE ID.fieldID = 1 AND itemID = ?
'''.strip()

_QUERY_URL = '''
SELECT value AS url
FROM      itemData        AS ID
LEFT JOIN itemDataValues  AS IDV ON ID.valueID == IDV.valueID
LEFT JOIN itemAttachments AS IA  ON ID.itemID  == IA.parentItemID
WHERE ID.fieldID = 13 AND IA.itemID = ?
'''.strip()


# TODO maybe exclude 'private' methods from detection?
def _query_raw() -> Iterator[Res[Dict[str, Any]]]:
    """Yield one enriched dict per row of ``_QUERY``.

    A row that fails enrichment is logged and yielded as a ``RuntimeError``
    whose ``__cause__`` is the original exception.
    """
    [db] = inputs()

    conn: Optional[sqlite3.Connection] = None
    try:
        with sqlite_copy_and_open(db) as conn:
            conn.row_factory = sqlite3.Row
            for r in conn.execute(_QUERY):
                try:
                    enriched = _enrich_row(r, conn=conn)
                except Exception as e:
                    logger.exception(e)
                    ex = RuntimeError(f'Error while processing {list(r)}')
                    ex.__cause__ = e
                    yield ex
                else:
                    yield enriched
    finally:
        # The `with` block above does not close the connection, so close it
        # here. `finally` also covers a consumer that stops iterating early
        # and a failure of the main query.
        if conn is not None:
            conn.close()


# the data model in zotero database seems as follows..
#
# itemAnnotations
# - itemId is the annotation itself
# - parentItemId is the PDF file, corresponds to itemAttachments.itemId??
#
# itemAttachments
# - itemId
# - parentItemId is just the 'abstract' top level item in zotero
#   this top level item is the one that shows up in the file list? ugh also some indirection in itemNotes...
#
def _enrich_row(r: sqlite3.Row, conn: sqlite3.Connection) -> Dict[str, Any]:
    """Return ``r`` as a dict with ``tags``, ``top_tags``, ``title`` and ``url`` added.

    Raises:
        ValueError: if the title query does not return exactly one row.
    """
    r = dict(r)
    # TODO very messy -- would be nice to do this with less queries
    # tags are annoying... because they are in one-to-many relationship, hard to retrieve in sqlite..
    iid = r['itemID']
    tags = [row[0] for row in conn.execute(_QUERY_TAGS, [iid])]
    r['tags'] = tuple(tags)

    topid = r['topItemID']
    top_tags = [row[0] for row in conn.execute(_QUERY_TAGS, [topid])]
    r['top_tags'] = tuple(top_tags)

    pid = r['parentItemID']
    # Unpacking into a one-element list enforces exactly one title row.
    [title] = [row[0] for row in conn.execute(_QUERY_TITLE, [pid])]
    r['title'] = title

    murl = [row[0] for row in conn.execute(_QUERY_URL, [pid])]
    r['url'] = murl[0] if murl else None
    return r


def _hex2human(color_hex: str) -> str:
    """Map a known hex color to its name; return unknown values unchanged."""
    return {
        '#ffd400': 'yellow',
        '#a28ae5': 'purple',
        '#5fb236': 'green',
        '#ff6666': 'red',
        '#2ea8e5': 'blue',
    }.get(color_hex, color_hex)


def _parse_annotation(r: Dict) -> Annotation:
    """Build an ``Annotation`` from an enriched row dict.

    ``dateAdded`` is parsed as ``%Y-%m-%d %H:%M:%S`` and marked as UTC.
    """
    text = r['text']
    comment = r['comment']
    # todo use json query for this?
    page = json.loads(r['position'])['pageIndex']
    path = r['path']
    addeds = r['dateAdded']
    tags = r['tags']
    color_hex = r['color']

    added = datetime.strptime(addeds, '%Y-%m-%d %H:%M:%S')
    added = added.replace(tzinfo=timezone.utc)

    item = Item(
        file=Path(path),  # path is a bit misleading... could mean some internal DOM path?
        title=r['title'],
        url=r['url'],
        tags=r['top_tags'],
    )

    return Annotation(
        item=item,
        added=added,
        page=page,
        text=text,
        comment=comment,
        tags=tags,
        color_hex=color_hex,
    )