better extraction

This commit is contained in:
Dima Gerasimov 2019-05-18 09:54:01 +01:00
parent 58eb036e5a
commit af2590ab43

150
goodreads/__init__.py Normal file → Executable file
View file

@ -1,8 +1,10 @@
#!/usr/bin/env python3
from pathlib import Path
from typing import List, Dict, NamedTuple
from typing import List, Dict, NamedTuple, Iterator, Optional, Sequence
from datetime import datetime
import pytz
from xml.dom.minidom import parseString # type: ignore
from xml.dom.minidom import parseString, Element # type: ignore
BPATH = Path("/L/backups/goodreads")
@ -25,54 +27,77 @@ def get_reviews():
xmls.append(parseString(xx + _SP))
return xmls
class Book(NamedTuple):
bid: str
title: str
authors: Sequence[str]
shelves: Sequence[str]
date_added: datetime
date_started: Optional[datetime]
date_read: Optional[datetime]
def get_books():
books = []
from kython import the
def _parse_date(s: Optional[str]) -> Optional[datetime]:
if s is None:
return None
res = datetime.strptime(s, "%a %b %d %H:%M:%S %z %Y")
assert res.tzinfo is not None
return res
def iter_books() -> Iterator[Book]:
for review in get_reviews():
book_element = review.getElementsByTagName('book')[0]
title_element = book_element.getElementsByTagName('title')[0]
id_element = book_element.getElementsByTagName('id')[0]
isbn_element = book_element.getElementsByTagName('isbn')[0]
isbn13_element = book_element.getElementsByTagName('isbn13')[0]
date_added = review.getElementsByTagName('date_added')[0]
started_at = review.getElementsByTagName('started_at')[0]
read_at = review.getElementsByTagName('read_at')[0]
review_xml = the(review.childNodes)
rdict = {n.tagName: n for n in review_xml.childNodes if isinstance(n, Element)}
# fuck xml...
shelves_element = review.getElementsByTagName('shelves')[0]
book_element = rdict['book']
title = the(the(book_element.getElementsByTagName('title')).childNodes).data
id_element = rdict['id']
# isbn_element = the(book_element.getElementsByTagName('isbn'))
# isbn13_element = the(book_element.getElementsByTagName('isbn13'))
date_added = the(rdict['date_added'].childNodes).data
sss = rdict['started_at'].childNodes
rrr = rdict['read_at'].childNodes
started_at = None if len(sss) == 0 else the(sss).data
read_at = None if len(rrr) == 0 else the(rrr).data
shelves_element = rdict['shelves']
book_shelves = []
for shelf in shelves_element.getElementsByTagName('shelf'):
book_shelves.append(shelf.getAttribute('name'))
book = {
'title': title_element.firstChild.data,
'id': id_element.firstChild.data,
'shelves': book_shelves
}
# if isbn_element.getAttribute('nil') != 'true':
# book['isbn'] = isbn_element.firstChild.data
# else:
# book['isbn'] = ''
if isbn_element.getAttribute('nil') != 'true':
book['isbn'] = isbn_element.firstChild.data
else:
book['isbn'] = ''
# if isbn13_element.getAttribute('nil') != 'true':
# book['isbn13'] = isbn13_element.firstChild.data
# else:
# book['isbn13'] = ''
if isbn13_element.getAttribute('nil') != 'true':
book['isbn13'] = isbn13_element.firstChild.data
else:
book['isbn13'] = ''
da = _parse_date(date_added)
assert da is not None
yield Book(
bid=id_element.firstChild.data,
title=title,
shelves=book_shelves,
date_added=da,
date_started=_parse_date(started_at),
date_read=_parse_date(read_at),
)
if started_at.firstChild is not None:
book['started_at'] = started_at.firstChild.data
else:
book['started_at'] = ''
def get_books():
return list(iter_books())
if read_at.firstChild is not None:
book['read_at'] = read_at.firstChild.data
else:
book['read_at'] = ''
book['date_added'] = None if date_added.firstChild is None else date_added.firstChild.data
books.append(book)
return books
def test_books():
books = get_books()
assert len(books) > 10
class Event(NamedTuple):
@ -81,19 +106,13 @@ class Event(NamedTuple):
eid: str
def _parse_date(s: str) -> datetime:
return datetime.strptime(s, "%a %b %d %H:%M:%S %z %Y")
def get_events():
events = []
for b in get_books():
added = _parse_date(b['date_added'])
title = b['title']
events.append(Event(
dt=added,
summary=f'Added book "{title}"', # TODO shelf?
eid=b['id'],
dt=b.date_added,
summary=f'Added book "{b.title}"', # TODO shelf?
eid=b.bid
))
# TODO finished? other updates?
return sorted(events, key=lambda e: e.dt)
@ -101,3 +120,40 @@ def get_events():
def test():
assert len(get_events()) > 20
def print_read_history():
def key(b):
read = b.date_read
if read is None:
return datetime.fromtimestamp(0, pytz.utc)
else:
return read
def fmtdt(dt):
if dt is None:
return dt
tz = pytz.timezone('Europe/London')
return dt.astimezone(tz)
for b in sorted(iter_books(), key=key):
print(b.title)
print(f' started : {fmtdt(b.date_started)}')
print(f' finished: {fmtdt(b.date_read)}')
print()
def main():
import argparse
p = argparse.ArgumentParser()
sp = p.add_argument('mode', nargs='?')
args = p.parse_args()
if args.mode == 'history':
print_read_history()
else:
assert args.mode is None
for b in iter_books():
print(b)
if __name__ == '__main__':
main()