"""
Zotero annotations, read from the local Zotero sqlite database.
"""
from dataclasses import dataclass
from datetime import datetime, timezone
import json
from typing import Iterator, Optional, Dict, Any, Sequence
from pathlib import Path
import sqlite3

from my.core import LazyLogger, Res, datetime_aware
from my.core.sqlite import sqlite_copy_and_open


logger = LazyLogger(__name__, level='debug')


def inputs() -> Sequence[Path]:
    """Return the database files to read: a single-element list with ``~/Zotero/zotero.sqlite``."""
    db = Path('~').expanduser() / 'Zotero' / 'zotero.sqlite'
    # todo eh... kinda pointless to return a list in this case... but maybe ok for consistency?
    # also naming the method input() will conflict with python builtin...
    return [db]


Url = str


@dataclass(frozen=True)
class Item:
    """Corresponds to 'Zotero item'"""
    file: Path
    title: str
    url: Optional[Url]


@dataclass
class Annotation:
    """A single annotation together with the item it was made on."""
    item: Item
    added: datetime_aware
    # checked it and it's definitely utc

    page: int
    """0-indexed"""

    text: Optional[str]
    comment: Optional[str]
    tags: Sequence[str]


def annotations() -> Iterator[Res[Annotation]]:
    """
    Yield every annotation found in the database.

    Failures are yielded as ``Exception`` objects instead of being raised, so one
    bad row does not abort the iteration: errors coming from ``_query_raw`` are
    passed through as is, and errors raised while parsing a row are yielded too.
    """
    for r in _query_raw():
        if isinstance(r, Exception):
            yield r
            continue
        # Only the parsing is guarded; the yield lives in `else` so that an
        # exception thrown into the generator by the consumer is not mistaken
        # for a parse error and silently turned into an item.
        try:
            a = _parse_annotation(r)
        except Exception as e:
            yield e
        else:
            yield a


# type -- 1 is inline; 2 is note?
# todo color? -- for org-mode could map into priority?
_QUERY = '''
SELECT A.itemID, A.parentItemID, text, comment, position, path, dateAdded
FROM itemAnnotations AS A
LEFT JOIN itemAttachments AS F ON A.parentItemID = F.ItemID
LEFT JOIN items AS I ON A.itemID = I.itemID
'''.strip()


_QUERY_TAGS = '''
SELECT name
FROM itemTags AS IT
LEFT JOIN tags as T ON IT.tagID = T.tagID
WHERE itemID = ?
'''.strip()


_QUERY_TITLE = '''
SELECT value AS title
FROM itemData AS ID
LEFT JOIN itemDataValues AS IDV ON ID.valueID == IDV.valueID
WHERE ID.fieldID = 1 AND itemID = ?
'''.strip()


_QUERY_URL = '''
SELECT value AS url FROM
itemData AS ID
LEFT JOIN itemDataValues AS IDV ON ID.valueID == IDV.valueID
LEFT JOIN itemAttachments AS IA ON ID.itemID == IA.parentItemID
WHERE ID.fieldID = 13 AND IA.itemID = ?
'''.strip()


# TODO maybe exclude 'private' methods from detection?
def _query_raw() -> Iterator[Res[Dict[str, Any]]]:
    """
    Run ``_QUERY`` against the database and yield one enriched dict per annotation row.

    The database is opened via ``sqlite_copy_and_open`` (presumably on a copy, so the
    live file is left alone -- confirm in ``my.core.sqlite``).

    If enriching a row fails, the error is logged and a ``RuntimeError`` describing the
    row (with the original exception as its ``__cause__``) is yielded in its place.
    """
    [db] = inputs()

    with sqlite_copy_and_open(db) as conn:
        conn.row_factory = sqlite3.Row
        for r in conn.execute(_QUERY):
            # Keep the yield out of the try block: only failures of the
            # enrichment itself should be converted into error items.
            try:
                enriched = _enrich_row(r, conn=conn)
            except Exception as e:
                logger.exception(e)
                ex = RuntimeError(f'Error while processing {list(r)}')
                # not raising here, so the cause has to be attached by hand
                ex.__cause__ = e
                yield ex
            else:
                yield enriched


def _enrich_row(r: sqlite3.Row, conn: sqlite3.Connection) -> Dict[str, Any]:
    """
    Convert a ``_QUERY`` result row into a dict and add ``tags``, ``title`` and ``url`` keys.

    ``tags`` are looked up by the row's ``itemID``; ``title`` and ``url`` by its
    ``parentItemID``. ``url`` is ``None`` when the lookup returns nothing.

    Raises ``ValueError`` if the title lookup does not return exactly one row.
    """
    row = dict(r)
    # TODO very messy -- would be nice to do this with less queries
    # tags are annoying... because they are in one-to-many relationship, hard to retrieve in sqlite..
    iid = row['itemID']
    row['tags'] = [t[0] for t in conn.execute(_QUERY_TAGS, [iid])]

    # TODO also need item tags

    pid = row['parentItemID']
    # exactly one title is expected; the unpacking fails loudly otherwise
    [title] = [t[0] for t in conn.execute(_QUERY_TITLE, [pid])]
    row['title'] = title

    murl = [u[0] for u in conn.execute(_QUERY_URL, [pid])]
    row['url'] = murl[0] if murl else None
    return row


def _parse_annotation(r: Dict[str, Any]) -> Annotation:
    """
    Build an ``Annotation`` from a dict produced by ``_enrich_row``.

    ``position`` is decoded as JSON and its ``pageIndex`` becomes ``page``;
    ``dateAdded`` is parsed as ``'%Y-%m-%d %H:%M:%S'`` and tagged as UTC.
    """
    text = r['text']
    comment = r['comment']
    # todo use json query for this?
    page = json.loads(r['position'])['pageIndex']
    path = r['path']
    addeds = r['dateAdded']
    tags = r['tags']

    added = datetime.strptime(addeds, '%Y-%m-%d %H:%M:%S')
    added = added.replace(tzinfo=timezone.utc)

    item = Item(
        file=Path(path),  # path is a bit misleading... could mean some internal DOM path?
        title=r['title'],
        url=r['url'],
    )

    return Annotation(
        item=item,
        added=added,
        page=page,
        text=text,
        comment=comment,
        tags=tags,
    )