nicer extraction, switch to lxml

This commit is contained in:
Dima Gerasimov 2019-05-18 10:12:18 +01:00
parent af2590ab43
commit 9e7a2b895c

View file

@@ -4,7 +4,7 @@ from typing import List, Dict, NamedTuple, Iterator, Optional, Sequence
from datetime import datetime from datetime import datetime
import pytz import pytz
from xml.dom.minidom import parseString, Element # type: ignore from lxml import etree as ET # type: ignore
BPATH = Path("/L/backups/goodreads") BPATH = Path("/L/backups/goodreads")
@@ -24,7 +24,7 @@ def get_reviews():
for xx in data.split(_SP): for xx in data.split(_SP):
if len(xx.strip()) == 0: if len(xx.strip()) == 0:
break break
xmls.append(parseString(xx + _SP)) xmls.append(ET.fromstring(xx + _SP))
return xmls return xmls
class Book(NamedTuple): class Book(NamedTuple):
@@ -48,27 +48,25 @@ def _parse_date(s: Optional[str]) -> Optional[datetime]:
def iter_books() -> Iterator[Book]: def iter_books() -> Iterator[Book]:
for review in get_reviews(): for r in get_reviews():
review_xml = the(review.childNodes) # review_xml = the(review.childNodes)
rdict = {n.tagName: n for n in review_xml.childNodes if isinstance(n, Element)} # rdict = {n.tagName: n for n in review_xml.childNodes if isinstance(n, Element)}
# fuck xml... # fuck xml...
book_element = rdict['book'] be = the(r.xpath('book'))
title = the(the(book_element.getElementsByTagName('title')).childNodes).data title = the(be.xpath('title/text()'))
authors = be.xpath('authors/author/name/text()')
id_element = rdict['id'] bid = the(r.xpath('id/text()'))
# isbn_element = the(book_element.getElementsByTagName('isbn')) # isbn_element = the(book_element.getElementsByTagName('isbn'))
# isbn13_element = the(book_element.getElementsByTagName('isbn13')) # isbn13_element = the(book_element.getElementsByTagName('isbn13'))
date_added = the(rdict['date_added'].childNodes).data date_added = the(r.xpath('date_added/text()'))
sss = rdict['started_at'].childNodes sss = r.xpath('started_at/text()')
rrr = rdict['read_at'].childNodes rrr = r.xpath('read_at/text()')
started_at = None if len(sss) == 0 else the(sss).data started_at = None if len(sss) == 0 else the(sss)
read_at = None if len(rrr) == 0 else the(rrr).data read_at = None if len(rrr) == 0 else the(rrr)
shelves_element = rdict['shelves'] shelves = r.xpath('shelves/shelf/name/text()')
book_shelves = []
for shelf in shelves_element.getElementsByTagName('shelf'):
book_shelves.append(shelf.getAttribute('name'))
# if isbn_element.getAttribute('nil') != 'true': # if isbn_element.getAttribute('nil') != 'true':
# book['isbn'] = isbn_element.firstChild.data # book['isbn'] = isbn_element.firstChild.data
@@ -83,9 +81,10 @@ def iter_books() -> Iterator[Book]:
da = _parse_date(date_added) da = _parse_date(date_added)
assert da is not None assert da is not None
yield Book( yield Book(
bid=id_element.firstChild.data, bid=bid,
title=title, title=title,
shelves=book_shelves, authors=authors,
shelves=shelves,
date_added=da, date_added=da,
date_started=_parse_date(started_at), date_started=_parse_date(started_at),
date_read=_parse_date(read_at), date_read=_parse_date(read_at),
@@ -136,10 +135,11 @@ def print_read_history():
tz = pytz.timezone('Europe/London') tz = pytz.timezone('Europe/London')
return dt.astimezone(tz) return dt.astimezone(tz)
for b in sorted(iter_books(), key=key): for b in sorted(iter_books(), key=key):
print(b.title) print(f"""
print(f' started : {fmtdt(b.date_started)}') {b.title} by {', '.join(b.authors)}
print(f' finished: {fmtdt(b.date_read)}') started : {fmtdt(b.date_started)}
print() finished: {fmtdt(b.date_read)}
""")
def main(): def main():