"""Zotero annotations (highlights and comments), read from the local Zotero sqlite database."""

from dataclasses import dataclass
from datetime import datetime, timezone
import json
from typing import Iterator, Optional, Dict, Any, Sequence
from pathlib import Path
import sqlite3

from my.core import LazyLogger, Res, datetime_aware
from my.core.sqlite import sqlite_copy_and_open


logger = LazyLogger(__name__, level='debug')


def inputs() -> Sequence[Path]:
    """Return the database files to read: a one-element list with ``~/Zotero/zotero.sqlite``."""
    db = Path('~').expanduser() / 'Zotero' / 'zotero.sqlite'
    # todo eh... kinda pointless to return a list in this case... but maybe ok for consistency?
    # also naming the method input() will conflict with python builtin...
    return [db]


Url = str


@dataclass(frozen=True)
class Item:
    """Corresponds to 'Zotero item'"""
    file: Path
    title: str
    url: Optional[Url]
    tags: Sequence[str]


@dataclass
class Annotation:
    """A single annotation together with the item it was made on."""
    item: Item

    # checked it and it's definitely utc
    added: datetime_aware

    page: int
    """0-indexed"""

    text: Optional[str]
    comment: Optional[str]
    tags: Sequence[str]

    color_hex: str
    """Original hex-encoded color in zotero"""

    @property
    def color_human(self) -> str:
        """Color name for the known hex codes; the raw hex string otherwise."""
        return _hex2human(self.color_hex)


def annotations() -> Iterator[Res[Annotation]]:
    """Yield every annotation from the database.

    Errors are yielded rather than raised: both the ones produced while
    querying and the ones raised while parsing a row, so a single bad row
    does not stop the iteration.
    """
    for r in _query_raw():
        if isinstance(r, Exception):
            yield r
            continue
        # keep the try body down to the one call that is expected to fail
        try:
            a = _parse_annotation(r)
        except Exception as e:
            yield e
        else:
            yield a


# type -- 1 is inline; 2 is note?
_QUERY = '''
SELECT A.itemID, A.parentItemID, F.parentItemID AS topItemID, text, comment, color, position, path, dateAdded
FROM      itemAnnotations AS A
LEFT JOIN itemAttachments AS F ON A.parentItemID = F.ItemID
LEFT JOIN items           AS I ON A.itemID       = I.itemID
'''

_QUERY_TAGS = '''
SELECT name
FROM      itemTags AS IT
LEFT JOIN tags     AS T  ON IT.tagID = T.tagID
WHERE itemID = ?
'''.strip()

_QUERY_TITLE = '''
SELECT value AS title
FROM      itemData       AS ID
LEFT JOIN itemDataValues AS IDV ON ID.valueID == IDV.valueID
WHERE ID.fieldID = 1 AND itemID = ?
'''.strip()

_QUERY_URL = '''
SELECT value AS url
FROM      itemData        AS ID
LEFT JOIN itemDataValues  AS IDV ON ID.valueID == IDV.valueID
LEFT JOIN itemAttachments AS IA  ON ID.itemID  == IA.parentItemID
WHERE ID.fieldID = 13 AND IA.itemID = ?
'''.strip()


# TODO maybe exclude 'private' methods from detection?
def _query_raw() -> Iterator[Res[Dict[str, Any]]]:
    """Yield one enriched dict per row of ``_QUERY``.

    A row that fails to enrich is logged and yielded as a ``RuntimeError``
    whose ``__cause__`` is the original exception.

    The connection is closed in a ``finally`` clause, so it is released even
    when the consumer stops iterating early or the query itself raises.
    """
    [db] = inputs()

    conn: Optional[sqlite3.Connection] = None
    try:
        with sqlite_copy_and_open(db) as conn:
            conn.row_factory = sqlite3.Row
            for r in conn.execute(_QUERY):
                try:
                    enriched = _enrich_row(r, conn=conn)
                except Exception as e:
                    logger.exception(e)
                    ex = RuntimeError(f'Error while processing {list(r)}')
                    ex.__cause__ = e
                    yield ex
                else:
                    yield enriched
    finally:
        # conn stays None if opening the database failed
        if conn is not None:
            conn.close()


# the data model in zotero database seems as follows..
#
# itemAnnotations
# - itemId is the annotation itself
# - parentItemId is the PDF file, corresponds to itemAttachments.itemId??
#
# itemAttachments
# - itemId
# - parentItemId is just the 'abstract' top level item in zotero
#   this top level item is the one that shows up in the file list? ugh also some indirection in itemNotes...
#
def _enrich_row(r, conn: sqlite3.Connection):
    """Return ``r`` as a dict with ``tags``, ``top_tags``, ``title`` and ``url`` added.

    - ``tags``: tuple of tag names for the row's ``itemID``
    - ``top_tags``: tuple of tag names for the row's ``topItemID``
    - ``title``: result of ``_QUERY_TITLE`` for the row's ``parentItemID``
    - ``url``: first result of ``_QUERY_URL`` for ``parentItemID``, or ``None``

    Raises ``ValueError`` (from the unpacking) unless the title query returns
    exactly one row.
    """
    r = dict(r)
    # TODO very messy -- would be nice to do this with less queries
    # tags are annoying... because they are in one-to-many relationship, hard to retrieve in sqlite..
    iid = r['itemID']
    r['tags'] = tuple(row[0] for row in conn.execute(_QUERY_TAGS, [iid]))

    topid = r['topItemID']
    r['top_tags'] = tuple(row[0] for row in conn.execute(_QUERY_TAGS, [topid]))

    pid = r['parentItemID']
    [title] = [row[0] for row in conn.execute(_QUERY_TITLE, [pid])]
    r['title'] = title

    murl = [row[0] for row in conn.execute(_QUERY_URL, [pid])]
    r['url'] = murl[0] if murl else None
    return r


# built once at import time instead of on every lookup
_COLOR_NAMES: Dict[str, str] = {
    '#ffd400': 'yellow',
    '#a28ae5': 'purple',
    '#5fb236': 'green',
    '#ff6666': 'red',
    '#2ea8e5': 'blue',
}


def _hex2human(color_hex: str) -> str:
    """Map a known hex color to its name; unknown values are returned unchanged."""
    return _COLOR_NAMES.get(color_hex, color_hex)


def _parse_annotation(r: Dict) -> Annotation:
    """Build an ``Annotation`` from a dict produced by ``_enrich_row``.

    ``dateAdded`` must be formatted as ``%Y-%m-%d %H:%M:%S`` and is tagged as
    UTC; ``position`` must be a JSON object with a ``pageIndex`` key.
    Missing keys or malformed values raise (``KeyError`` / ``ValueError``).
    """
    text = r['text']
    comment = r['comment']
    # todo use json query for this?
    page = json.loads(r['position'])['pageIndex']
    path = r['path']
    addeds = r['dateAdded']
    tags = r['tags']
    color_hex = r['color']

    added = datetime.strptime(addeds, '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)

    item = Item(
        file=Path(path),  # path is a bit misleading... could mean some internal DOM path?
        title=r['title'],
        url=r['url'],
        tags=r['top_tags'],
    )

    return Annotation(
        item=item,
        added=added,
        page=page,
        text=text,
        comment=comment,
        tags=tags,
        color_hex=color_hex,
    )