my.zotero: initial version

2021-04-04 22:09:29 +01:00 · 2021-04-04 22:09:29 +01:00 · 1ef2c5619e
commit 1ef2c5619e
parent c1b70cd90e
1 changed files with 154 additions and 0 deletions
--- a/my/zotero.py
+++ b/my/zotero.py
@@ -0,0 +1,154 @@
+from dataclasses import dataclass
+from datetime import datetime, timezone
+import json
+from typing import Iterator, Optional, Dict, Any, Sequence
+from pathlib import Path
+import sqlite3
+
+from my.core import LazyLogger, Res, datetime_aware
+from my.core.sqlite import sqlite_copy_and_open
+
+
+# Module-level logger named after this module, created with level='debug'.
+logger = LazyLogger(__name__, level='debug')
+
+
+def inputs() -> Sequence[Path]:
+    """Return the Zotero database file(s) this module reads.
+
+    The result is always a one-element list holding ``~/Zotero/zotero.sqlite``
+    with ``~`` resolved to the current user's home directory.
+    """
+    # todo eh... kinda pointless to return a list in this case... but maybe ok for consistency?
+    # also naming the method input() will conflict with python builtin...
+    zotero_dir = Path.home() / 'Zotero'
+    return [zotero_dir / 'zotero.sqlite']
+
+
+# Alias used to annotate fields that hold a URL; it is plain `str` at runtime.
+Url = str
+
+@dataclass(frozen=True)
+class Item:
+    """Corresponds to 'Zotero item'.
+
+    Immutable record built by _parse_annotation from an enriched annotation row.
+    """
+    # Wraps the `path` column selected by _QUERY.
+    # NOTE(review): presumably the attachment's file location as stored by Zotero -- confirm the format.
+    file: Path
+    # The single value returned by _QUERY_TITLE for the annotation's parentItemID.
+    title: str
+    # First value returned by _QUERY_URL, or None when that query yields no rows.
+    url: Optional[Url]
+
+@dataclass
+class Annotation:
+    """One annotation read from the Zotero database, together with the Item it is attached to."""
+    item: Item
+    # Parsed from the `dateAdded` column; _parse_annotation attaches timezone.utc to it.
+    added: datetime_aware
+    # checked it and it's definitely utc
+
+    # Taken from the 'pageIndex' key of the JSON stored in the `position` column.
+    page: int
+    """0-indexed"""
+
+    # Passed through unchanged from the `text` and `comment` columns; either may be None.
+    text: Optional[str]
+    comment: Optional[str]
+    # Tag names returned by _QUERY_TAGS for the annotation's own itemID.
+    tags: Sequence[str]
+
+
+def annotations() -> Iterator[Res[Annotation]]:
+    """Yield every annotation from the Zotero database, or an error in its place.
+
+    Errors already produced by _query_raw are forwarded untouched; rows that
+    fail to parse are yielded as the exception raised by _parse_annotation.
+    Nothing is raised to the caller for a bad row, so iteration continues.
+    """
+    for raw in _query_raw():
+        if not isinstance(raw, Exception):
+            try:
+                yield _parse_annotation(raw)
+            except Exception as err:
+                yield err
+        else:
+            # the row was already turned into an error upstream, pass it on
+            yield raw
+
+
+# Main query: one row per entry of itemAnnotations, LEFT JOINed to
+#  - itemAttachments via the annotation's parentItemID
+#  - items via the annotation's own itemID
+# NOTE(review): `path` and `dateAdded` are unqualified; presumably they come from
+# itemAttachments and items respectively -- confirm against the Zotero schema.
+# Because of the LEFT JOINs, joined columns may be NULL when no matching row exists.
+#
+# type -- 1 is inline; 2 is note?
+# todo color? -- for org-mode could map into priority?
+_QUERY = '''
+SELECT A.itemID, A.parentItemID, text, comment, position, path, dateAdded
+FROM itemAnnotations AS A
+LEFT JOIN itemAttachments AS F ON A.parentItemID = F.ItemID
+LEFT JOIN items AS I           ON A.itemID = I.itemID
+'''
+
+
+# Tag names attached to a single item; the `?` placeholder is the item's itemID
+# (_enrich_row passes the annotation's own itemID).
+_QUERY_TAGS = '''
+SELECT name
+FROM itemTags AS IT
+LEFT JOIN tags as T ON IT.tagID = T.tagID
+WHERE itemID = ?
+'''.strip()
+
+
+# Title value for a single item; the `?` placeholder is the item's itemID
+# (_enrich_row passes the annotation's parentItemID and expects exactly one row).
+# NOTE(review): fieldID = 1 is treated as the "title" field here -- presumably a
+# Zotero field id; verify against the database's field definitions.
+_QUERY_TITLE = '''
+SELECT value AS title
+FROM itemData AS ID
+LEFT JOIN itemDataValues AS IDV ON ID.valueID == IDV.valueID
+WHERE ID.fieldID = 1 AND itemID = ?
+'''.strip()
+
+
+# URL value looked up through an attachment: the `?` placeholder is matched against
+# itemAttachments.itemID, and the value is read from the itemData of that
+# attachment's parentItemID. May return no rows.
+# NOTE(review): fieldID = 13 is treated as the "url" field here -- presumably a
+# Zotero field id; verify against the database's field definitions.
+_QUERY_URL = '''
+SELECT value AS url FROM
+itemData AS ID
+LEFT JOIN itemDataValues  AS IDV ON ID.valueID == IDV.valueID
+LEFT JOIN itemAttachments AS IA  ON ID.itemID  == IA.parentItemID
+WHERE ID.fieldID = 13 AND IA.itemID = ?
+'''.strip()
+
+
+# TODO maybe exclude 'private' methods from detection?
+def _query_raw() -> Iterator[Res[Dict[str, Any]]]:
+    """Run _QUERY against the Zotero database and yield one enriched dict per row.
+
+    The database is opened through sqlite_copy_and_open. A row whose enrichment
+    fails is logged and yielded as a RuntimeError (with the original exception
+    set as its cause) instead of aborting the iteration.
+    """
+    # inputs() is expected to hold exactly one path; unpacking fails otherwise
+    (db_path,) = inputs()
+
+    with sqlite_copy_and_open(db_path) as conn:
+        # sqlite3.Row lets _enrich_row turn each row into a dict keyed by column name
+        conn.row_factory = sqlite3.Row
+        cursor = conn.execute(_QUERY)
+        for row in cursor:
+            try:
+                yield _enrich_row(row, conn=conn)
+            except Exception as cause:
+                logger.exception(cause)
+                failure = RuntimeError(f'Error while processing {list(row)}')
+                # chain manually, since the error is yielded rather than raised
+                failure.__cause__ = cause
+                yield failure
+
+
+def _enrich_row(r, conn: sqlite3.Connection):
+    """Return a dict copy of row `r` extended with 'tags', 'title' and 'url'.
+
+    Each extra key is filled by a separate query on `conn`:
+    'tags' is the list of tag names for the row's itemID, 'title' is the single
+    title value for its parentItemID, and 'url' is the first URL value for the
+    parentItemID or None when there is none.
+
+    Raises ValueError when the title query does not return exactly one row.
+    """
+    def first_column(query: str, item_id) -> list:
+        # collect the first column of every row the query returns for this id
+        return [row[0] for row in conn.execute(query, [item_id])]
+
+    enriched = dict(r)
+    # TODO very messy -- would be nice to do this with less queries
+    # tags are annoying... because they are in one-to-many relationship, hard to retrieve in sqlite..
+    annotation_id = enriched['itemID']
+    enriched['tags'] = first_column(_QUERY_TAGS, annotation_id)
+
+    # TODO also need item tags
+
+    parent_id = enriched['parentItemID']
+    # exactly one title is expected, the unpacking enforces that
+    (title,) = first_column(_QUERY_TITLE, parent_id)
+    enriched['title'] = title
+
+    urls = first_column(_QUERY_URL, parent_id)
+    enriched['url'] = urls[0] if urls else None
+    return enriched
+
+
+def _parse_annotation(r: Dict) -> Annotation:
+    """Convert an enriched row dict (as produced by _enrich_row) into an Annotation.
+
+    Reads the keys 'text', 'comment', 'position', 'path', 'dateAdded', 'tags',
+    'title' and 'url'. 'position' must be a JSON document with a 'pageIndex'
+    entry, and 'dateAdded' must look like 'YYYY-MM-DD HH:MM:SS'; the resulting
+    timestamp is marked as UTC.
+    """
+    text, comment = r['text'], r['comment']
+    # todo use json query for this?
+    page = json.loads(r['position'])['pageIndex']
+    attachment_path, added_raw, tags = r['path'], r['dateAdded'], r['tags']
+
+    # the stored timestamp carries no offset, so attach UTC explicitly
+    added = datetime.strptime(added_raw, '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
+
+    return Annotation(
+        item=Item(
+            # path is a bit misleading... could mean some internal DOM path?
+            file=Path(attachment_path),
+            title=r['title'],
+            url=r['url'],
+        ),
+        added=added,
+        page=page,
+        text=text,
+        comment=comment,
+        tags=tags,
+    )