From af2590ab4313df166a3f8f57e64e5908b0601baf Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 18 May 2019 09:54:01 +0100 Subject: [PATCH] better extraction --- goodreads/__init__.py | 150 +++++++++++++++++++++++++++++------------- 1 file changed, 103 insertions(+), 47 deletions(-) mode change 100644 => 100755 goodreads/__init__.py diff --git a/goodreads/__init__.py b/goodreads/__init__.py old mode 100644 new mode 100755 index 09a7aa8..9a2506d --- a/goodreads/__init__.py +++ b/goodreads/__init__.py @@ -1,8 +1,10 @@ +#!/usr/bin/env python3 from pathlib import Path -from typing import List, Dict, NamedTuple +from typing import List, Dict, NamedTuple, Iterator, Optional, Sequence from datetime import datetime +import pytz -from xml.dom.minidom import parseString # type: ignore +from xml.dom.minidom import parseString, Element # type: ignore BPATH = Path("/L/backups/goodreads") @@ -25,54 +27,77 @@ def get_reviews(): xmls.append(parseString(xx + _SP)) return xmls +class Book(NamedTuple): + bid: str + title: str + authors: Sequence[str] + shelves: Sequence[str] + date_added: datetime + date_started: Optional[datetime] + date_read: Optional[datetime] -def get_books(): - books = [] +from kython import the + + +def _parse_date(s: Optional[str]) -> Optional[datetime]: + if s is None: + return None + res = datetime.strptime(s, "%a %b %d %H:%M:%S %z %Y") + assert res.tzinfo is not None + return res + + +def iter_books() -> Iterator[Book]: for review in get_reviews(): - book_element = review.getElementsByTagName('book')[0] - title_element = book_element.getElementsByTagName('title')[0] - id_element = book_element.getElementsByTagName('id')[0] - isbn_element = book_element.getElementsByTagName('isbn')[0] - isbn13_element = book_element.getElementsByTagName('isbn13')[0] - date_added = review.getElementsByTagName('date_added')[0] - started_at = review.getElementsByTagName('started_at')[0] - read_at = review.getElementsByTagName('read_at')[0] + review_xml = the(review.childNodes) + rdict = {n.tagName: n for n in review_xml.childNodes if isinstance(n, Element)} + # fuck xml... - shelves_element = review.getElementsByTagName('shelves')[0] + book_element = rdict['book'] + title = the(the(book_element.getElementsByTagName('title')).childNodes).data + + id_element = rdict['id'] + # isbn_element = the(book_element.getElementsByTagName('isbn')) + # isbn13_element = the(book_element.getElementsByTagName('isbn13')) + date_added = the(rdict['date_added'].childNodes).data + sss = rdict['started_at'].childNodes + rrr = rdict['read_at'].childNodes + started_at = None if len(sss) == 0 else the(sss).data + read_at = None if len(rrr) == 0 else the(rrr).data + + shelves_element = rdict['shelves'] book_shelves = [] for shelf in shelves_element.getElementsByTagName('shelf'): book_shelves.append(shelf.getAttribute('name')) - book = { - 'title': title_element.firstChild.data, - 'id': id_element.firstChild.data, - 'shelves': book_shelves - } + # if isbn_element.getAttribute('nil') != 'true': + # book['isbn'] = isbn_element.firstChild.data + # else: + # book['isbn'] = '' - if isbn_element.getAttribute('nil') != 'true': - book['isbn'] = isbn_element.firstChild.data - else: - book['isbn'] = '' + # if isbn13_element.getAttribute('nil') != 'true': + # book['isbn13'] = isbn13_element.firstChild.data + # else: + # book['isbn13'] = '' - if isbn13_element.getAttribute('nil') != 'true': - book['isbn13'] = isbn13_element.firstChild.data - else: - book['isbn13'] = '' + da = _parse_date(date_added) + assert da is not None + yield Book( + bid=id_element.firstChild.data, + title=title, + shelves=book_shelves, + date_added=da, + date_started=_parse_date(started_at), + date_read=_parse_date(read_at), + ) - if started_at.firstChild is not None: - book['started_at'] = started_at.firstChild.data - else: - book['started_at'] = '' +def get_books(): + return list(iter_books()) - if read_at.firstChild is not None: - book['read_at'] = read_at.firstChild.data - else: - book['read_at'] = '' - book['date_added'] = None if date_added.firstChild is None else date_added.firstChild.data - - books.append(book) - return books +def test_books(): + books = get_books() + assert len(books) > 10 class Event(NamedTuple): @@ -81,19 +106,13 @@ class Event(NamedTuple): eid: str -def _parse_date(s: str) -> datetime: - return datetime.strptime(s, "%a %b %d %H:%M:%S %z %Y") - - def get_events(): events = [] for b in get_books(): - added = _parse_date(b['date_added']) - title = b['title'] events.append(Event( - dt=added, - summary=f'Added book "{title}"', # TODO shelf? - eid=b['id'], + dt=b.date_added, + summary=f'Added book "{b.title}"', # TODO shelf? + eid=b.bid )) # TODO finished? other updates? return sorted(events, key=lambda e: e.dt) @@ -101,3 +120,40 @@ def get_events(): def test(): assert len(get_events()) > 20 + + +def print_read_history(): + def key(b): + read = b.date_read + if read is None: + return datetime.fromtimestamp(0, pytz.utc) + else: + return read + + def fmtdt(dt): + if dt is None: + return dt + tz = pytz.timezone('Europe/London') + return dt.astimezone(tz) + for b in sorted(iter_books(), key=key): + print(b.title) + print(f' started : {fmtdt(b.date_started)}') + print(f' finished: {fmtdt(b.date_read)}') + print() + + +def main(): + import argparse + p = argparse.ArgumentParser() + sp = p.add_argument('mode', nargs='?') + args = p.parse_args() + + if args.mode == 'history': + print_read_history() + else: + assert args.mode is None + for b in iter_books(): + print(b) + +if __name__ == '__main__': + main()