my.pdfs: rely on pdfannots for created date extraction/parsing

This commit is contained in:
Dima Gerasimov 2021-03-31 20:56:42 +01:00 committed by karlicoss
parent ad177a1ccd
commit edf6e5d50b

View file

@ -71,7 +71,12 @@ class Annotation(NamedTuple):
page: int page: int
highlight: Optional[str] highlight: Optional[str]
comment: Optional[str] comment: Optional[str]
date: Optional[datetime] # TODO tz aware/unaware? created: Optional[datetime] # note: can be tz unaware in some bad pdfs...
@property
def date(self) -> Optional[datetime]:
# legacy name
return self.created
def as_annotation(*, raw_ann, path: str) -> Annotation: def as_annotation(*, raw_ann, path: str) -> Annotation:
@ -80,30 +85,13 @@ def as_annotation(*, raw_ann, path: str) -> Annotation:
for a in ('boxes', 'rect'): for a in ('boxes', 'rect'):
if a in d: if a in d:
del d[a] del d[a]
dates = d.get('date')
date: Optional[datetime] = None
if dates is not None:
dates = dates.replace("'", "")
# 20190630213504+0100
dates = re.sub('Z0000$', '+0000', dates)
FMT = '%Y%m%d%H%M%S'
# TODO is it utc if there is not timestamp?
for fmt in [FMT, FMT + '%z']:
try:
date = datetime.strptime(dates, fmt)
break
except ValueError:
pass
else:
# TODO defensive?
raise RuntimeError(dates)
return Annotation( return Annotation(
path = path, path = path,
author = d['author'], author = d['author'],
page = d['page'], page = d['page'],
highlight = d['text'], highlight = d['text'],
comment = d['contents'], comment = d['contents'],
date = date, created = d.get('created'), # todo can be non-defensive once pr is merged
) )
@ -167,10 +155,15 @@ class Pdf(NamedTuple):
path: Path path: Path
annotations: Sequence[Annotation] annotations: Sequence[Annotation]
@property
def created(self) -> Optional[datetime]:
annots = self.annotations
return None if len(annots) == 0 else annots[-1].created
@property @property
def date(self) -> Optional[datetime]: def date(self) -> Optional[datetime]:
# TODO tz aware/unaware # legacy
return self.annotations[-1].date return self.created
def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]]=None) -> Iterator[Res[Pdf]]: def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]]=None) -> Iterator[Res[Pdf]]: