my.pdfs: rely on pdfannots for created date extraction/parsing
This commit is contained in:
parent
ad177a1ccd
commit
edf6e5d50b
1 changed file with 14 additions and 21 deletions
35
my/pdfs.py
35
my/pdfs.py
|
@ -71,7 +71,12 @@ class Annotation(NamedTuple):
|
|||
page: int
|
||||
highlight: Optional[str]
|
||||
comment: Optional[str]
|
||||
date: Optional[datetime] # TODO tz aware/unaware?
|
||||
created: Optional[datetime] # note: can be tz unaware in some bad pdfs...
|
||||
|
||||
@property
|
||||
def date(self) -> Optional[datetime]:
|
||||
# legacy name
|
||||
return self.created
|
||||
|
||||
|
||||
def as_annotation(*, raw_ann, path: str) -> Annotation:
|
||||
|
@ -80,30 +85,13 @@ def as_annotation(*, raw_ann, path: str) -> Annotation:
|
|||
for a in ('boxes', 'rect'):
|
||||
if a in d:
|
||||
del d[a]
|
||||
dates = d.get('date')
|
||||
date: Optional[datetime] = None
|
||||
if dates is not None:
|
||||
dates = dates.replace("'", "")
|
||||
# 20190630213504+0100
|
||||
dates = re.sub('Z0000$', '+0000', dates)
|
||||
FMT = '%Y%m%d%H%M%S'
|
||||
# TODO is it utc if there is not timestamp?
|
||||
for fmt in [FMT, FMT + '%z']:
|
||||
try:
|
||||
date = datetime.strptime(dates, fmt)
|
||||
break
|
||||
except ValueError:
|
||||
pass
|
||||
else:
|
||||
# TODO defensive?
|
||||
raise RuntimeError(dates)
|
||||
return Annotation(
|
||||
path = path,
|
||||
author = d['author'],
|
||||
page = d['page'],
|
||||
highlight = d['text'],
|
||||
comment = d['contents'],
|
||||
date = date,
|
||||
created = d.get('created'), # todo can be non-defensive once pr is merged
|
||||
)
|
||||
|
||||
|
||||
|
@ -167,10 +155,15 @@ class Pdf(NamedTuple):
|
|||
path: Path
|
||||
annotations: Sequence[Annotation]
|
||||
|
||||
@property
|
||||
def created(self) -> Optional[datetime]:
|
||||
annots = self.annotations
|
||||
return None if len(annots) == 0 else annots[-1].created
|
||||
|
||||
@property
|
||||
def date(self) -> Optional[datetime]:
|
||||
# TODO tz aware/unaware
|
||||
return self.annotations[-1].date
|
||||
# legacy
|
||||
return self.created
|
||||
|
||||
|
||||
def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]]=None) -> Iterator[Res[Pdf]]:
|
||||
|
|
Loading…
Add table
Reference in a new issue