my.zotero: initial version

This commit is contained in:
Dima Gerasimov 2021-04-04 22:09:29 +01:00 committed by karlicoss
parent c1b70cd90e
commit 1ef2c5619e

154
my/zotero.py Normal file
View file

@ -0,0 +1,154 @@
from dataclasses import dataclass
from datetime import datetime, timezone
import json
from typing import Iterator, Optional, Dict, Any, Sequence
from pathlib import Path
import sqlite3
from my.core import LazyLogger, Res, datetime_aware
from my.core.sqlite import sqlite_copy_and_open
# Module-level logger for this module, created with level='debug'.
logger = LazyLogger(__name__, level='debug')
def inputs() -> Sequence[Path]:
    """Return the Zotero database files to read from.

    At the moment this is always a one-element list holding
    ``~/Zotero/zotero.sqlite`` with ``~`` expanded.
    """
    # todo eh... a list is kinda pointless for a single path, but maybe ok for consistency?
    # also, naming this input() would conflict with the python builtin
    zotero_db = Path('~', 'Zotero', 'zotero.sqlite').expanduser()
    return [zotero_db]
# URLs are carried around as plain strings; the alias only makes signatures clearer.
Url = str


@dataclass(frozen=True)
class Item:
    """Corresponds to a 'Zotero item' (immutable)."""

    # built from the `path` column of the annotation query
    file: Path
    title: str
    # None when the URL lookup for the item returned nothing
    url: Optional[Url]
@dataclass
class Annotation:
    """A single annotation together with the Item it was made on."""

    item: Item
    # checked it and it's definitely utc
    added: datetime_aware
    # 0-indexed
    page: int
    text: Optional[str]
    comment: Optional[str]
    tags: Sequence[str]
def annotations() -> Iterator[Res[Annotation]]:
    """Yield every annotation, or an Exception in place of a row that failed.

    Errors produced by the raw query are passed through unchanged. Errors raised
    while parsing a row are yielded instead of that row, so iteration continues
    with the remaining rows.
    """
    for r in _query_raw():
        if isinstance(r, Exception):
            yield r
            continue
        try:
            a = _parse_annotation(r)
        except Exception as e:
            # one bad row should not abort the whole iteration
            yield e
        else:
            # yield outside of the try body, so an exception thrown into the
            # generator by the consumer is not mistaken for a parsing error
            yield a
# Main query over itemAnnotations: each annotation row is joined with the
# itemAttachments row of its parent (via parentItemID) and with its own row in items.
# type -- 1 is inline; 2 is note?
# todo color? -- for org-mode could map into priority?
_QUERY = '''
SELECT A.itemID, A.parentItemID, text, comment, position, path, dateAdded
FROM itemAnnotations AS A
LEFT JOIN itemAttachments AS F ON A.parentItemID = F.ItemID
LEFT JOIN items AS I ON A.itemID = I.itemID
'''
# Names of the tags attached to one item; the item id is bound to the `?` placeholder.
_QUERY_TAGS = '''
SELECT name
FROM itemTags AS IT
LEFT JOIN tags as T ON IT.tagID = T.tagID
WHERE itemID = ?
'''.strip()
# Value of the field with fieldID = 1 (selected as `title`) for one item;
# the item id is bound to the `?` placeholder.
_QUERY_TITLE = '''
SELECT value AS title
FROM itemData AS ID
LEFT JOIN itemDataValues AS IDV ON ID.valueID == IDV.valueID
WHERE ID.fieldID = 1 AND itemID = ?
'''.strip()
# Value of the field with fieldID = 13 (selected as `url`), read from the *parent*
# of the attachment whose itemID is bound to the `?` placeholder.
_QUERY_URL = '''
SELECT value AS url FROM
itemData AS ID
LEFT JOIN itemDataValues AS IDV ON ID.valueID == IDV.valueID
LEFT JOIN itemAttachments AS IA ON ID.itemID == IA.parentItemID
WHERE ID.fieldID = 13 AND IA.itemID = ?
'''.strip()
# TODO maybe exclude 'private' methods from detection?
def _query_raw() -> Iterator[Res[Dict[str, Any]]]:
    """Run the main annotation query and yield one enriched dict per row.

    The database is the single path returned by inputs(), opened through
    sqlite_copy_and_open. If enriching a row fails, the error is logged and a
    RuntimeError (with the original exception as its cause) is yielded in place
    of that row, so the remaining rows are still processed.
    """
    [db] = inputs()
    with sqlite_copy_and_open(db) as conn:
        # sqlite3.Row allows rows to be converted into dicts keyed by column name
        conn.row_factory = sqlite3.Row
        for r in conn.execute(_QUERY):
            try:
                enriched = _enrich_row(r, conn=conn)
            except Exception as e:
                logger.exception(e)
                ex = RuntimeError(f'Error while processing {list(r)}')
                # the error is yielded rather than raised, so chain the cause by hand
                ex.__cause__ = e
                yield ex
            else:
                # yield outside of the try body, so an exception thrown into the
                # generator by the consumer is not reported as a row processing error
                yield enriched
def _enrich_row(r, conn: sqlite3.Connection):
    """Return a dict copy of row ``r`` extended with 'tags', 'title' and 'url'.

    Tags are looked up by the row's itemID; title and url by its parentItemID.
    Raises ValueError (from unpacking) unless the title query returns exactly one row.
    """
    # TODO very messy -- would be nice to do this with less queries

    def _first_column(query: str, item_id) -> list:
        # run a single-parameter query and keep only the first column of each row
        return [row[0] for row in conn.execute(query, [item_id])]

    enriched = dict(r)

    # tags are annoying... they are in one-to-many relationship, hard to retrieve in sqlite..
    annotation_id = enriched['itemID']
    enriched['tags'] = _first_column(_QUERY_TAGS, annotation_id)
    # TODO also need item tags

    parent_id = enriched['parentItemID']
    (title,) = _first_column(_QUERY_TITLE, parent_id)
    enriched['title'] = title

    urls = _first_column(_QUERY_URL, parent_id)
    enriched['url'] = urls[0] if urls else None
    return enriched
def _parse_annotation(r: Dict) -> Annotation:
    """Build an Annotation (with its Item) out of one enriched row dict.

    Reads the 'text', 'comment', 'position', 'path', 'dateAdded', 'tags',
    'title' and 'url' keys. 'position' is a JSON document whose 'pageIndex'
    becomes the page; 'dateAdded' is parsed as '%Y-%m-%d %H:%M:%S' and marked UTC.
    """
    annotation_text = r['text']
    annotation_comment = r['comment']
    # todo use json query for this?
    position = json.loads(r['position'])
    page_index = position['pageIndex']
    file_path = r['path']
    added_raw = r['dateAdded']
    tag_names = r['tags']

    # the stored timestamp carries no timezone, so attach UTC explicitly
    added_utc = datetime.strptime(added_raw, '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)

    return Annotation(
        item=Item(
            file=Path(file_path),  # path is a bit misleading... could mean some internal DOM path?
            title=r['title'],
            url=r['url'],
        ),
        added=added_utc,
        page=page_index,
        text=annotation_text,
        comment=annotation_comment,
        tags=tag_names,
    )