From edf6e5d50bf244c7c0078fd26640e083c42f884e Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 31 Mar 2021 20:56:42 +0100 Subject: [PATCH] my.pdfs: rely on pdfannots for created date extraction/parsing --- my/pdfs.py | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/my/pdfs.py b/my/pdfs.py index cfe630e..c537a94 100755 --- a/my/pdfs.py +++ b/my/pdfs.py @@ -71,7 +71,12 @@ class Annotation(NamedTuple): page: int highlight: Optional[str] comment: Optional[str] - date: Optional[datetime] # TODO tz aware/unaware? + created: Optional[datetime] # note: can be tz unaware in some bad pdfs... + + @property + def date(self) -> Optional[datetime]: + # legacy name + return self.created def as_annotation(*, raw_ann, path: str) -> Annotation: @@ -80,30 +85,13 @@ def as_annotation(*, raw_ann, path: str) -> Annotation: for a in ('boxes', 'rect'): if a in d: del d[a] - dates = d.get('date') - date: Optional[datetime] = None - if dates is not None: - dates = dates.replace("'", "") - # 20190630213504+0100 - dates = re.sub('Z0000$', '+0000', dates) - FMT = '%Y%m%d%H%M%S' - # TODO is it utc if there is not timestamp? - for fmt in [FMT, FMT + '%z']: - try: - date = datetime.strptime(dates, fmt) - break - except ValueError: - pass - else: - # TODO defensive? - raise RuntimeError(dates) return Annotation( path = path, author = d['author'], page = d['page'], highlight = d['text'], comment = d['contents'], - date = date, + created = d.get('created'), # todo can be non-defensive once pr is merged ) @@ -167,10 +155,15 @@ class Pdf(NamedTuple): path: Path annotations: Sequence[Annotation] + @property + def created(self) -> Optional[datetime]: + annots = self.annotations + return None if len(annots) == 0 else annots[-1].created + @property def date(self) -> Optional[datetime]: - # TODO tz aware/unaware - return self.annotations[-1].date + # legacy + return self.created def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]]=None) -> Iterator[Res[Pdf]]: