my.zotero: initial version
This commit is contained in:
parent
c1b70cd90e
commit
1ef2c5619e
1 changed files with 154 additions and 0 deletions
154
my/zotero.py
Normal file
154
my/zotero.py
Normal file
|
@ -0,0 +1,154 @@
|
|||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
import json
|
||||
from typing import Iterator, Optional, Dict, Any, Sequence
|
||||
from pathlib import Path
|
||||
import sqlite3
|
||||
|
||||
from my.core import LazyLogger, Res, datetime_aware
|
||||
from my.core.sqlite import sqlite_copy_and_open
|
||||
|
||||
|
||||
# Module-level logger for this module; created with its level set to 'debug'.
logger = LazyLogger(__name__, level='debug')
|
||||
|
||||
|
||||
def inputs() -> Sequence[Path]:
    """
    Return the Zotero database file(s) this module reads.

    The result is always a one-element list holding ``~/Zotero/zotero.sqlite``
    with ``~`` expanded to the user's home directory.
    """
    # todo eh... kinda pointless to return a list in this case... but maybe ok for consistency?
    # also naming the method input() will conflict with python builtin...
    home = Path('~').expanduser()
    return [home / 'Zotero' / 'zotero.sqlite']
|
||||
|
||||
|
||||
# Alias used to mark strings that are meant to hold a URL.
Url = str


@dataclass(frozen=True)
class Item:
    """
    Corresponds to a 'Zotero item'.

    Instances are immutable (frozen dataclass).
    """

    # location of the file the item refers to
    file: Path
    # title of the item
    title: str
    # URL associated with the item; None when there is none
    url: Optional[Url]
|
||||
|
||||
|
||||
@dataclass
class Annotation:
    """A single Zotero annotation together with the item it belongs to."""

    # the item this annotation belongs to
    item: Item

    # timezone-aware timestamp of when the annotation was added
    # checked it and it's definitely utc
    added: datetime_aware

    page: int
    """0-indexed"""

    # annotation text, if any
    text: Optional[str]
    # comment attached to the annotation, if any
    comment: Optional[str]
    # names of the tags attached to the annotation
    tags: Sequence[str]
|
||||
|
||||
|
||||
def annotations() -> Iterator[Res[Annotation]]:
    """
    Yield every annotation from the Zotero database.

    Errors are yielded rather than raised: an exception coming out of
    ``_query_raw`` is passed through unchanged, and an exception raised while
    parsing a row is yielded in place of that row's ``Annotation``.
    """
    for raw in _query_raw():
        if not isinstance(raw, Exception):
            try:
                parsed = _parse_annotation(raw)
                yield parsed
            except Exception as err:
                yield err
        else:
            # the row already failed upstream -- hand the error to the caller as is
            yield raw
|
||||
|
||||
|
||||
# Main query: one row per record in itemAnnotations, LEFT JOINed with
# itemAttachments (matched on the annotation's parentItemID) and with items
# (matched on the annotation's own itemID).
# NOTE(review): presumably 'path' comes from itemAttachments and 'dateAdded'
# from items -- the columns are unqualified, confirm against the Zotero schema.
# type -- 1 is inline; 2 is note?
# todo color? -- for org-mode could map into priority?
_QUERY = '''
SELECT A.itemID, A.parentItemID, text, comment, position, path, dateAdded
FROM itemAnnotations AS A
LEFT JOIN itemAttachments AS F ON A.parentItemID = F.ItemID
LEFT JOIN items AS I ON A.itemID = I.itemID
'''


# Names of all tags attached to one item; takes the itemID as its only parameter.
_QUERY_TAGS = '''
SELECT name
FROM itemTags AS IT
LEFT JOIN tags as T ON IT.tagID = T.tagID
WHERE itemID = ?
'''.strip()


# Value stored under fieldID 1 for one item; takes the itemID as its only parameter.
# NOTE(review): fieldID 1 is presumably the 'title' field -- confirm against Zotero's fields table.
_QUERY_TITLE = '''
SELECT value AS title
FROM itemData AS ID
LEFT JOIN itemDataValues AS IDV ON ID.valueID == IDV.valueID
WHERE ID.fieldID = 1 AND itemID = ?
'''.strip()


# Value stored under fieldID 13 on the *parent* of an attachment; takes the
# attachment's itemID as its only parameter.
# NOTE(review): fieldID 13 is presumably the 'url' field -- confirm against Zotero's fields table.
_QUERY_URL = '''
SELECT value AS url FROM
itemData AS ID
LEFT JOIN itemDataValues AS IDV ON ID.valueID == IDV.valueID
LEFT JOIN itemAttachments AS IA ON ID.itemID == IA.parentItemID
WHERE ID.fieldID = 13 AND IA.itemID = ?
'''.strip()
|
||||
|
||||
|
||||
# TODO maybe exclude 'private' methods from detection?
def _query_raw() -> Iterator[Res[Dict[str, Any]]]:
    """
    Yield one enriched dict per row returned by ``_QUERY``.

    The database is opened through ``sqlite_copy_and_open``. When enriching a
    row fails, the error is logged and a ``RuntimeError`` (with the original
    exception set as its ``__cause__``) is yielded in place of the row, so the
    remaining rows are still processed.
    """
    # exactly one input is expected -- the unpacking fails otherwise
    [db_path] = inputs()

    with sqlite_copy_and_open(db_path) as conn:
        # sqlite3.Row lets _enrich_row turn each row into a dict keyed by column name
        conn.row_factory = sqlite3.Row
        for row in conn.execute(_QUERY):
            try:
                yield _enrich_row(row, conn=conn)
            except Exception as err:
                logger.exception(err)
                wrapped = RuntimeError(f'Error while processing {list(row)}')
                wrapped.__cause__ = err
                yield wrapped
|
||||
|
||||
|
||||
def _enrich_row(r, conn: sqlite3.Connection):
    """
    Convert a ``_QUERY`` row into a dict and add 'tags', 'title' and 'url' to it.

    :param r: a row with at least 'itemID' and 'parentItemID'; it is converted with ``dict(r)``
    :param conn: connection used to run the three follow-up queries
    :return: a new dict with the row's columns plus the keys 'tags', 'title' and 'url'
    :raises ValueError: when the title query does not return exactly one row
    """
    enriched = dict(r)
    # TODO very messy -- would be nice to do this with less queries
    # tags are annoying... because they are in one-to-many relationship, hard to retrieve in sqlite..
    annotation_id = enriched['itemID']
    enriched['tags'] = [tag_row[0] for tag_row in conn.execute(_QUERY_TAGS, [annotation_id])]

    # TODO also need item tags

    parent_id = enriched['parentItemID']
    # exactly one title is expected for the parent -- the unpacking enforces it
    [title] = [title_row[0] for title_row in conn.execute(_QUERY_TITLE, [parent_id])]
    enriched['title'] = title

    # the URL is optional: take the first match, or None when there is none
    urls = [url_row[0] for url_row in conn.execute(_QUERY_URL, [parent_id])]
    enriched['url'] = urls[0] if len(urls) != 0 else None
    return enriched
|
||||
|
||||
|
||||
def _parse_annotation(r: Dict) -> Annotation:
    """
    Build an ``Annotation`` from an enriched row dict.

    Reads the keys 'text', 'comment', 'position', 'path', 'dateAdded', 'tags',
    'title' and 'url'. 'position' must be a JSON document with a 'pageIndex'
    entry, and 'dateAdded' must look like ``YYYY-MM-DD HH:MM:SS``; it is
    interpreted as UTC.
    """
    text = r['text']
    comment = r['comment']
    # todo use json query for this?
    position = json.loads(r['position'])
    page = position['pageIndex']
    raw_path = r['path']
    added_raw = r['dateAdded']
    tag_names = r['tags']

    # the stored timestamp has no timezone marker; attach UTC to make it timezone-aware
    added = datetime.strptime(added_raw, '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)

    return Annotation(
        item=Item(
            file=Path(raw_path),  # path is a bit misleading... could mean some internal DOM path?
            title=r['title'],
            url=r['url'],
        ),
        added=added,
        page=page,
        text=text,
        comment=comment,
        tags=tag_names,
    )
|
Loading…
Add table
Reference in a new issue