my.pdfs: rely on pdfannots for created date extraction/parsing
This commit is contained in:
parent
ad177a1ccd
commit
edf6e5d50b
1 changed file with 14 additions and 21 deletions
35
my/pdfs.py
35
my/pdfs.py
|
@@ -71,7 +71,12 @@ class Annotation(NamedTuple):
|
||||||
page: int
|
page: int
|
||||||
highlight: Optional[str]
|
highlight: Optional[str]
|
||||||
comment: Optional[str]
|
comment: Optional[str]
|
||||||
date: Optional[datetime] # TODO tz aware/unaware?
|
created: Optional[datetime] # note: can be tz unaware in some bad pdfs...
|
||||||
|
|
||||||
|
@property
|
||||||
|
def date(self) -> Optional[datetime]:
|
||||||
|
# legacy name
|
||||||
|
return self.created
|
||||||
|
|
||||||
|
|
||||||
def as_annotation(*, raw_ann, path: str) -> Annotation:
|
def as_annotation(*, raw_ann, path: str) -> Annotation:
|
||||||
|
@@ -80,30 +85,13 @@ def as_annotation(*, raw_ann, path: str) -> Annotation:
|
||||||
for a in ('boxes', 'rect'):
|
for a in ('boxes', 'rect'):
|
||||||
if a in d:
|
if a in d:
|
||||||
del d[a]
|
del d[a]
|
||||||
dates = d.get('date')
|
|
||||||
date: Optional[datetime] = None
|
|
||||||
if dates is not None:
|
|
||||||
dates = dates.replace("'", "")
|
|
||||||
# 20190630213504+0100
|
|
||||||
dates = re.sub('Z0000$', '+0000', dates)
|
|
||||||
FMT = '%Y%m%d%H%M%S'
|
|
||||||
# TODO is it utc if there is not timestamp?
|
|
||||||
for fmt in [FMT, FMT + '%z']:
|
|
||||||
try:
|
|
||||||
date = datetime.strptime(dates, fmt)
|
|
||||||
break
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
# TODO defensive?
|
|
||||||
raise RuntimeError(dates)
|
|
||||||
return Annotation(
|
return Annotation(
|
||||||
path = path,
|
path = path,
|
||||||
author = d['author'],
|
author = d['author'],
|
||||||
page = d['page'],
|
page = d['page'],
|
||||||
highlight = d['text'],
|
highlight = d['text'],
|
||||||
comment = d['contents'],
|
comment = d['contents'],
|
||||||
date = date,
|
created = d.get('created'), # todo can be non-defensive once pr is merged
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@@ -167,10 +155,15 @@ class Pdf(NamedTuple):
|
||||||
path: Path
|
path: Path
|
||||||
annotations: Sequence[Annotation]
|
annotations: Sequence[Annotation]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def created(self) -> Optional[datetime]:
|
||||||
|
annots = self.annotations
|
||||||
|
return None if len(annots) == 0 else annots[-1].created
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def date(self) -> Optional[datetime]:
|
def date(self) -> Optional[datetime]:
|
||||||
# TODO tz aware/unaware
|
# legacy
|
||||||
return self.annotations[-1].date
|
return self.created
|
||||||
|
|
||||||
|
|
||||||
def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]]=None) -> Iterator[Res[Pdf]]:
|
def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]]=None) -> Iterator[Res[Pdf]]:
|
||||||
|
|
Loading…
Add table
Reference in a new issue