better extraction
This commit is contained in:
parent
58eb036e5a
commit
af2590ab43
1 changed files with 103 additions and 47 deletions
150
goodreads/__init__.py
Normal file → Executable file
150
goodreads/__init__.py
Normal file → Executable file
|
@ -1,8 +1,10 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Dict, NamedTuple
|
from typing import List, Dict, NamedTuple, Iterator, Optional, Sequence
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
import pytz
|
||||||
|
|
||||||
from xml.dom.minidom import parseString # type: ignore
|
from xml.dom.minidom import parseString, Element # type: ignore
|
||||||
|
|
||||||
BPATH = Path("/L/backups/goodreads")
|
BPATH = Path("/L/backups/goodreads")
|
||||||
|
|
||||||
|
@ -25,54 +27,77 @@ def get_reviews():
|
||||||
xmls.append(parseString(xx + _SP))
|
xmls.append(parseString(xx + _SP))
|
||||||
return xmls
|
return xmls
|
||||||
|
|
||||||
|
class Book(NamedTuple):
|
||||||
|
bid: str
|
||||||
|
title: str
|
||||||
|
authors: Sequence[str]
|
||||||
|
shelves: Sequence[str]
|
||||||
|
date_added: datetime
|
||||||
|
date_started: Optional[datetime]
|
||||||
|
date_read: Optional[datetime]
|
||||||
|
|
||||||
def get_books():
|
from kython import the
|
||||||
books = []
|
|
||||||
|
|
||||||
|
def _parse_date(s: Optional[str]) -> Optional[datetime]:
|
||||||
|
if s is None:
|
||||||
|
return None
|
||||||
|
res = datetime.strptime(s, "%a %b %d %H:%M:%S %z %Y")
|
||||||
|
assert res.tzinfo is not None
|
||||||
|
return res
|
||||||
|
|
||||||
|
|
||||||
|
def iter_books() -> Iterator[Book]:
|
||||||
for review in get_reviews():
|
for review in get_reviews():
|
||||||
book_element = review.getElementsByTagName('book')[0]
|
review_xml = the(review.childNodes)
|
||||||
title_element = book_element.getElementsByTagName('title')[0]
|
rdict = {n.tagName: n for n in review_xml.childNodes if isinstance(n, Element)}
|
||||||
id_element = book_element.getElementsByTagName('id')[0]
|
# fuck xml...
|
||||||
isbn_element = book_element.getElementsByTagName('isbn')[0]
|
|
||||||
isbn13_element = book_element.getElementsByTagName('isbn13')[0]
|
|
||||||
date_added = review.getElementsByTagName('date_added')[0]
|
|
||||||
started_at = review.getElementsByTagName('started_at')[0]
|
|
||||||
read_at = review.getElementsByTagName('read_at')[0]
|
|
||||||
|
|
||||||
shelves_element = review.getElementsByTagName('shelves')[0]
|
book_element = rdict['book']
|
||||||
|
title = the(the(book_element.getElementsByTagName('title')).childNodes).data
|
||||||
|
|
||||||
|
id_element = rdict['id']
|
||||||
|
# isbn_element = the(book_element.getElementsByTagName('isbn'))
|
||||||
|
# isbn13_element = the(book_element.getElementsByTagName('isbn13'))
|
||||||
|
date_added = the(rdict['date_added'].childNodes).data
|
||||||
|
sss = rdict['started_at'].childNodes
|
||||||
|
rrr = rdict['read_at'].childNodes
|
||||||
|
started_at = None if len(sss) == 0 else the(sss).data
|
||||||
|
read_at = None if len(rrr) == 0 else the(rrr).data
|
||||||
|
|
||||||
|
shelves_element = rdict['shelves']
|
||||||
book_shelves = []
|
book_shelves = []
|
||||||
for shelf in shelves_element.getElementsByTagName('shelf'):
|
for shelf in shelves_element.getElementsByTagName('shelf'):
|
||||||
book_shelves.append(shelf.getAttribute('name'))
|
book_shelves.append(shelf.getAttribute('name'))
|
||||||
|
|
||||||
book = {
|
# if isbn_element.getAttribute('nil') != 'true':
|
||||||
'title': title_element.firstChild.data,
|
# book['isbn'] = isbn_element.firstChild.data
|
||||||
'id': id_element.firstChild.data,
|
# else:
|
||||||
'shelves': book_shelves
|
# book['isbn'] = ''
|
||||||
}
|
|
||||||
|
|
||||||
if isbn_element.getAttribute('nil') != 'true':
|
# if isbn13_element.getAttribute('nil') != 'true':
|
||||||
book['isbn'] = isbn_element.firstChild.data
|
# book['isbn13'] = isbn13_element.firstChild.data
|
||||||
else:
|
# else:
|
||||||
book['isbn'] = ''
|
# book['isbn13'] = ''
|
||||||
|
|
||||||
if isbn13_element.getAttribute('nil') != 'true':
|
da = _parse_date(date_added)
|
||||||
book['isbn13'] = isbn13_element.firstChild.data
|
assert da is not None
|
||||||
else:
|
yield Book(
|
||||||
book['isbn13'] = ''
|
bid=id_element.firstChild.data,
|
||||||
|
title=title,
|
||||||
|
shelves=book_shelves,
|
||||||
|
date_added=da,
|
||||||
|
date_started=_parse_date(started_at),
|
||||||
|
date_read=_parse_date(read_at),
|
||||||
|
)
|
||||||
|
|
||||||
if started_at.firstChild is not None:
|
def get_books():
|
||||||
book['started_at'] = started_at.firstChild.data
|
return list(iter_books())
|
||||||
else:
|
|
||||||
book['started_at'] = ''
|
|
||||||
|
|
||||||
if read_at.firstChild is not None:
|
|
||||||
book['read_at'] = read_at.firstChild.data
|
|
||||||
else:
|
|
||||||
book['read_at'] = ''
|
|
||||||
|
|
||||||
book['date_added'] = None if date_added.firstChild is None else date_added.firstChild.data
|
def test_books():
|
||||||
|
books = get_books()
|
||||||
books.append(book)
|
assert len(books) > 10
|
||||||
return books
|
|
||||||
|
|
||||||
|
|
||||||
class Event(NamedTuple):
|
class Event(NamedTuple):
|
||||||
|
@ -81,19 +106,13 @@ class Event(NamedTuple):
|
||||||
eid: str
|
eid: str
|
||||||
|
|
||||||
|
|
||||||
def _parse_date(s: str) -> datetime:
|
|
||||||
return datetime.strptime(s, "%a %b %d %H:%M:%S %z %Y")
|
|
||||||
|
|
||||||
|
|
||||||
def get_events():
|
def get_events():
|
||||||
events = []
|
events = []
|
||||||
for b in get_books():
|
for b in get_books():
|
||||||
added = _parse_date(b['date_added'])
|
|
||||||
title = b['title']
|
|
||||||
events.append(Event(
|
events.append(Event(
|
||||||
dt=added,
|
dt=b.date_added,
|
||||||
summary=f'Added book "{title}"', # TODO shelf?
|
summary=f'Added book "{b.title}"', # TODO shelf?
|
||||||
eid=b['id'],
|
eid=b.bid
|
||||||
))
|
))
|
||||||
# TODO finished? other updates?
|
# TODO finished? other updates?
|
||||||
return sorted(events, key=lambda e: e.dt)
|
return sorted(events, key=lambda e: e.dt)
|
||||||
|
@ -101,3 +120,40 @@ def get_events():
|
||||||
|
|
||||||
def test():
|
def test():
|
||||||
assert len(get_events()) > 20
|
assert len(get_events()) > 20
|
||||||
|
|
||||||
|
|
||||||
|
def print_read_history():
|
||||||
|
def key(b):
|
||||||
|
read = b.date_read
|
||||||
|
if read is None:
|
||||||
|
return datetime.fromtimestamp(0, pytz.utc)
|
||||||
|
else:
|
||||||
|
return read
|
||||||
|
|
||||||
|
def fmtdt(dt):
|
||||||
|
if dt is None:
|
||||||
|
return dt
|
||||||
|
tz = pytz.timezone('Europe/London')
|
||||||
|
return dt.astimezone(tz)
|
||||||
|
for b in sorted(iter_books(), key=key):
|
||||||
|
print(b.title)
|
||||||
|
print(f' started : {fmtdt(b.date_started)}')
|
||||||
|
print(f' finished: {fmtdt(b.date_read)}')
|
||||||
|
print()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
import argparse
|
||||||
|
p = argparse.ArgumentParser()
|
||||||
|
sp = p.add_argument('mode', nargs='?')
|
||||||
|
args = p.parse_args()
|
||||||
|
|
||||||
|
if args.mode == 'history':
|
||||||
|
print_read_history()
|
||||||
|
else:
|
||||||
|
assert args.mode is None
|
||||||
|
for b in iter_books():
|
||||||
|
print(b)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
|
|
Loading…
Add table
Reference in a new issue