nicer extraction, switch to lxml
This commit is contained in:
parent
af2590ab43
commit
9e7a2b895c
1 changed file with 23 additions and 23 deletions
|
@ -4,7 +4,7 @@ from typing import List, Dict, NamedTuple, Iterator, Optional, Sequence
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import pytz
|
import pytz
|
||||||
|
|
||||||
from xml.dom.minidom import parseString, Element # type: ignore
|
from lxml import etree as ET # type: ignore
|
||||||
|
|
||||||
BPATH = Path("/L/backups/goodreads")
|
BPATH = Path("/L/backups/goodreads")
|
||||||
|
|
||||||
|
@ -24,7 +24,7 @@ def get_reviews():
|
||||||
for xx in data.split(_SP):
|
for xx in data.split(_SP):
|
||||||
if len(xx.strip()) == 0:
|
if len(xx.strip()) == 0:
|
||||||
break
|
break
|
||||||
xmls.append(parseString(xx + _SP))
|
xmls.append(ET.fromstring(xx + _SP))
|
||||||
return xmls
|
return xmls
|
||||||
|
|
||||||
class Book(NamedTuple):
|
class Book(NamedTuple):
|
||||||
|
@ -48,27 +48,25 @@ def _parse_date(s: Optional[str]) -> Optional[datetime]:
|
||||||
|
|
||||||
|
|
||||||
def iter_books() -> Iterator[Book]:
|
def iter_books() -> Iterator[Book]:
|
||||||
for review in get_reviews():
|
for r in get_reviews():
|
||||||
review_xml = the(review.childNodes)
|
# review_xml = the(review.childNodes)
|
||||||
rdict = {n.tagName: n for n in review_xml.childNodes if isinstance(n, Element)}
|
# rdict = {n.tagName: n for n in review_xml.childNodes if isinstance(n, Element)}
|
||||||
# fuck xml...
|
# fuck xml...
|
||||||
|
|
||||||
book_element = rdict['book']
|
be = the(r.xpath('book'))
|
||||||
title = the(the(book_element.getElementsByTagName('title')).childNodes).data
|
title = the(be.xpath('title/text()'))
|
||||||
|
authors = be.xpath('authors/author/name/text()')
|
||||||
|
|
||||||
id_element = rdict['id']
|
bid = the(r.xpath('id/text()'))
|
||||||
# isbn_element = the(book_element.getElementsByTagName('isbn'))
|
# isbn_element = the(book_element.getElementsByTagName('isbn'))
|
||||||
# isbn13_element = the(book_element.getElementsByTagName('isbn13'))
|
# isbn13_element = the(book_element.getElementsByTagName('isbn13'))
|
||||||
date_added = the(rdict['date_added'].childNodes).data
|
date_added = the(r.xpath('date_added/text()'))
|
||||||
sss = rdict['started_at'].childNodes
|
sss = r.xpath('started_at/text()')
|
||||||
rrr = rdict['read_at'].childNodes
|
rrr = r.xpath('read_at/text()')
|
||||||
started_at = None if len(sss) == 0 else the(sss).data
|
started_at = None if len(sss) == 0 else the(sss)
|
||||||
read_at = None if len(rrr) == 0 else the(rrr).data
|
read_at = None if len(rrr) == 0 else the(rrr)
|
||||||
|
|
||||||
shelves_element = rdict['shelves']
|
shelves = r.xpath('shelves/shelf/name/text()')
|
||||||
book_shelves = []
|
|
||||||
for shelf in shelves_element.getElementsByTagName('shelf'):
|
|
||||||
book_shelves.append(shelf.getAttribute('name'))
|
|
||||||
|
|
||||||
# if isbn_element.getAttribute('nil') != 'true':
|
# if isbn_element.getAttribute('nil') != 'true':
|
||||||
# book['isbn'] = isbn_element.firstChild.data
|
# book['isbn'] = isbn_element.firstChild.data
|
||||||
|
@ -83,9 +81,10 @@ def iter_books() -> Iterator[Book]:
|
||||||
da = _parse_date(date_added)
|
da = _parse_date(date_added)
|
||||||
assert da is not None
|
assert da is not None
|
||||||
yield Book(
|
yield Book(
|
||||||
bid=id_element.firstChild.data,
|
bid=bid,
|
||||||
title=title,
|
title=title,
|
||||||
shelves=book_shelves,
|
authors=authors,
|
||||||
|
shelves=shelves,
|
||||||
date_added=da,
|
date_added=da,
|
||||||
date_started=_parse_date(started_at),
|
date_started=_parse_date(started_at),
|
||||||
date_read=_parse_date(read_at),
|
date_read=_parse_date(read_at),
|
||||||
|
@ -136,10 +135,11 @@ def print_read_history():
|
||||||
tz = pytz.timezone('Europe/London')
|
tz = pytz.timezone('Europe/London')
|
||||||
return dt.astimezone(tz)
|
return dt.astimezone(tz)
|
||||||
for b in sorted(iter_books(), key=key):
|
for b in sorted(iter_books(), key=key):
|
||||||
print(b.title)
|
print(f"""
|
||||||
print(f' started : {fmtdt(b.date_started)}')
|
{b.title} by {', '.join(b.authors)}
|
||||||
print(f' finished: {fmtdt(b.date_read)}')
|
started : {fmtdt(b.date_started)}
|
||||||
print()
|
finished: {fmtdt(b.date_read)}
|
||||||
|
""")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
Loading…
Add table
Reference in a new issue