154 lines
3.9 KiB
Python
154 lines
3.9 KiB
Python
from dataclasses import dataclass
|
|
from datetime import datetime, timezone
|
|
import json
|
|
from typing import Iterator, Optional, Dict, Any, Sequence
|
|
from pathlib import Path
|
|
import sqlite3
|
|
|
|
from my.core import LazyLogger, Res, datetime_aware
|
|
from my.core.sqlite import sqlite_copy_and_open
|
|
|
|
|
|
# Logger for this module: named after the module and created with the 'debug' level.
logger = LazyLogger(__name__, level='debug')
|
|
|
|
|
|
def inputs() -> Sequence[Path]:
    """Return the Zotero database files to read.

    Only one location is considered: ``Zotero/zotero.sqlite`` under the
    current user's home directory, so the result always has one element.
    """
    # todo eh... returning a list is kinda pointless for a single file, but maybe ok for consistency?
    # (the plural name also avoids clashing with the python builtin input())
    zotero_dir = Path('~').expanduser() / 'Zotero'
    return [zotero_dir / 'zotero.sqlite']
|
|
|
|
|
|
# Urls are kept as plain strings; the alias only makes annotations more readable.
Url = str


@dataclass(frozen=True)
class Item:
    """A 'Zotero item': the thing an annotation belongs to."""

    # built from the 'path' column of the annotation query
    file: Path
    title: str
    # None when no url could be found for the item
    url: Optional[Url]
|
|
|
|
|
|
@dataclass
class Annotation:
    """A single annotation attached to a Zotero item."""

    item: Item

    # checked it and it's definitely utc
    added: datetime_aware

    # 0-indexed
    page: int

    text: Optional[str]
    comment: Optional[str]
    tags: Sequence[str]
|
|
|
|
|
|
def annotations() -> Iterator[Res[Annotation]]:
    """Yield every annotation from the Zotero database, or the error hit while producing it.

    Items coming out of ``_query_raw`` that are already exceptions are passed
    through unchanged. Rows that fail to parse are yielded as the exception
    that was raised, so a single bad row does not stop the iteration.
    """
    for row in _query_raw():
        if isinstance(row, Exception):
            yield row
            continue
        # Only the parsing is guarded: an exception thrown into the generator
        # while it is suspended at the success ``yield`` must propagate rather
        # than be reported as a parse error.
        try:
            parsed = _parse_annotation(row)
        except Exception as e:
            yield e
        else:
            yield parsed
|
|
|
|
|
|
# Base query: one row per entry in itemAnnotations, joined with the attachment
# the annotation belongs to (via parentItemID) and with the annotation's own
# row in items.
# NOTE(review): the selected columns are unqualified; presumably 'path' comes
# from itemAttachments and 'dateAdded' from items -- confirm against the schema.
# type -- 1 is inline; 2 is note?
# todo color? -- for org-mode could map into priority?
_QUERY = '''
SELECT A.itemID, A.parentItemID, text, comment, position, path, dateAdded
FROM itemAnnotations AS A
LEFT JOIN itemAttachments AS F ON A.parentItemID = F.ItemID
LEFT JOIN items AS I ON A.itemID = I.itemID
'''
|
|
|
|
|
|
_QUERY_TAGS = '''
|
|
SELECT name
|
|
FROM itemTags AS IT
|
|
LEFT JOIN tags as T ON IT.tagID = T.tagID
|
|
WHERE itemID = ?
|
|
'''.strip()
|
|
|
|
|
|
_QUERY_TITLE = '''
|
|
SELECT value AS title
|
|
FROM itemData AS ID
|
|
LEFT JOIN itemDataValues AS IDV ON ID.valueID == IDV.valueID
|
|
WHERE ID.fieldID = 1 AND itemID = ?
|
|
'''.strip()
|
|
|
|
|
|
_QUERY_URL = '''
|
|
SELECT value AS url FROM
|
|
itemData AS ID
|
|
LEFT JOIN itemDataValues AS IDV ON ID.valueID == IDV.valueID
|
|
LEFT JOIN itemAttachments AS IA ON ID.itemID == IA.parentItemID
|
|
WHERE ID.fieldID = 13 AND IA.itemID = ?
|
|
'''.strip()
|
|
|
|
|
|
# TODO maybe exclude 'private' methods from detection?
def _query_raw() -> Iterator[Res[Dict[str, Any]]]:
    """Yield one enriched dict per annotation row, or an error for rows that fail.

    The single database returned by ``inputs()`` is opened through
    ``sqlite_copy_and_open`` and ``_QUERY`` is run against it. Every row is
    passed through ``_enrich_row``; when that raises, the exception is logged
    and a ``RuntimeError`` listing the row values (with the original exception
    as its ``__cause__``) is yielded instead, and iteration continues with the
    next row.
    """
    [db] = inputs()

    with sqlite_copy_and_open(db) as conn:
        # sqlite3.Row lets the rows be converted to dicts keyed by column name
        conn.row_factory = sqlite3.Row
        for r in conn.execute(_QUERY):
            # Only the enrichment is guarded: an exception thrown into the
            # generator while it is suspended at the success ``yield`` must
            # propagate rather than be logged and wrapped as a row error.
            try:
                enriched = _enrich_row(r, conn=conn)
            except Exception as e:
                logger.exception(e)
                ex = RuntimeError(f'Error while processing {list(r)}')
                ex.__cause__ = e
                yield ex
            else:
                yield enriched
|
|
|
|
|
|
def _enrich_row(r, conn: sqlite3.Connection):
    """Convert a raw annotation row to a dict and add 'tags', 'title' and 'url' to it.

    Three extra queries are run per row: tags are looked up by the row's
    'itemID', title and url by its 'parentItemID'. Exactly one title row is
    expected (anything else makes the unpacking raise); a missing url is stored
    as None.
    """
    def first_column(query, key):
        # run a one-parameter query and keep the first column of every result row
        return [row[0] for row in conn.execute(query, [key])]

    # TODO very messy -- would be nice to do this with less queries
    enriched = dict(r)

    # tags are annoying... because they are in one-to-many relationship, hard to retrieve in sqlite..
    enriched['tags'] = first_column(_QUERY_TAGS, enriched['itemID'])

    # TODO also need item tags

    parent_id = enriched['parentItemID']
    (title,) = first_column(_QUERY_TITLE, parent_id)
    enriched['title'] = title

    urls = first_column(_QUERY_URL, parent_id)
    enriched['url'] = urls[0] if urls else None
    return enriched
|
|
|
|
|
|
def _parse_annotation(r: Dict) -> Annotation:
    """Build an Annotation (and the Item it belongs to) from an enriched row dict.

    'position' is decoded as JSON to get the page index, and 'dateAdded' is
    parsed as ``YYYY-mm-dd HH:MM:SS`` and marked as UTC.
    """
    text, comment = r['text'], r['comment']
    # todo use json query for this?
    page_index = json.loads(r['position'])['pageIndex']
    raw_path, raw_added, tags = r['path'], r['dateAdded'], r['tags']

    # the stored timestamp has no timezone information; it is taken to be UTC
    added_utc = datetime.strptime(raw_added, '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)

    # path is a bit misleading... could mean some internal DOM path?
    source = Item(file=Path(raw_path), title=r['title'], url=r['url'])

    return Annotation(
        item=source,
        added=added_utc,
        page=page_index,
        text=text,
        comment=comment,
        tags=tags,
    )
|