From f6a32557295b9c86e446d3f08d4b333dfa9060ea Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 5 Sep 2018 15:12:27 +0400 Subject: [PATCH 1/5] Initial --- .gitignore | 178 ++++++++++++++++++++++++++++++++++++++++++ ci.sh | 10 +++ goodreads/__init__.py | 104 ++++++++++++++++++++++++ goodreads/__main__.py | 0 run | 6 ++ 5 files changed, 298 insertions(+) create mode 100644 .gitignore create mode 100755 ci.sh create mode 100644 goodreads/__init__.py create mode 100644 goodreads/__main__.py create mode 100755 run diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..236a11e --- /dev/null +++ b/.gitignore @@ -0,0 +1,178 @@ + +# Created by https://www.gitignore.io/api/python,emacs + +### Emacs ### +# -*- mode: gitignore; -*- +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ +dist/ + +# Flycheck +flycheck_*.el + +# server auth directory +/server/ + +# projectiles files +.projectile + +# directory configuration +.dir-locals.el + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +### Python Patch ### +.venv/ + +### Python.VirtualEnv Stack ### +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +pyvenv.cfg +pip-selfcheck.json + + +# End of https://www.gitignore.io/api/python,emacs diff --git a/ci.sh b/ci.sh new file mode 100755 index 0000000..0546b95 --- /dev/null +++ b/ci.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +cd "$(this_dir)" || exit + +. ~/bash_ci + +ci_run mypy goodreads +ci_run pylint -E goodreads + +ci_report_errors diff --git a/goodreads/__init__.py b/goodreads/__init__.py new file mode 100644 index 0000000..46ebfe2 --- /dev/null +++ b/goodreads/__init__.py @@ -0,0 +1,104 @@ +import os +from xml.dom.minidom import parseString # type: ignore + +BPATH = "/L/backups/goodreads" + +# TODO might be useful to keep track of updates?... +# then I need some sort of system to store diffs in generic way... +# althogh... coud use same mechanism as for filtering +def get_last() -> str: + return max(sorted([os.path.join(BPATH, f) for f in os.listdir(BPATH) if f.endswith('.xmll')])) + +_SP = '' + +def get_reviews(): + fname = get_last() + xmls = [] + with open(fname, 'r') as fo: + data = fo.read() + for xx in data.split(_SP): + if len(xx.strip()) == 0: + break + xmls.append(parseString(xx + _SP)) + return xmls + +def get_books(): + books = [] + for review in get_reviews(): + book_element = review.getElementsByTagName('book')[0] + title_element = book_element.getElementsByTagName('title')[0] + id_element = book_element.getElementsByTagName('id')[0] + isbn_element = book_element.getElementsByTagName('isbn')[0] + isbn13_element = book_element.getElementsByTagName('isbn13')[0] + date_added = review.getElementsByTagName('date_added')[0] + started_at = review.getElementsByTagName('started_at')[0] + read_at = review.getElementsByTagName('read_at')[0] + + shelves_element = review.getElementsByTagName('shelves')[0] + book_shelves = [] + for shelf in shelves_element.getElementsByTagName('shelf'): + book_shelves.append(shelf.getAttribute('name')) + + book = { + 'title': title_element.firstChild.data, + 'id': id_element.firstChild.data, + 'shelves': book_shelves + } + + if isbn_element.getAttribute('nil') != 'true': + book['isbn'] = isbn_element.firstChild.data + else: + book['isbn'] = '' + + if isbn13_element.getAttribute('nil') != 'true': + book['isbn13'] = isbn13_element.firstChild.data + else: + book['isbn13'] = '' + + if started_at.firstChild is not None: + book['started_at'] = started_at.firstChild.data + else: + book['started_at'] = '' + + if read_at.firstChild is not None: + book['read_at'] = read_at.firstChild.data + else: + book['read_at'] = '' + + book['date_added'] = None if date_added.firstChild is None else date_added.firstChild.data + + books.append(book) + return books + +from typing import List, Dict, NamedTuple +from datetime import datetime + +class Event(NamedTuple): + dt: datetime + summary: str + + +def _parse_date(s: str) -> datetime: + return datetime.strptime(s, "%a %b %d %H:%M:%S %z %Y") + + +def get_events(): + events = [] + for b in get_books(): + added = _parse_date(b['date_added']) + title = b['title'] + events.append(Event( + dt=added, + summary=f'Added book "{title}"', # TODO shelf? + )) + # TODO finished? other updates? + return sorted(events, key=lambda e: e.dt) + +def main(): + for e in get_events(): + print(e) + + +if __name__ == '__main__': + main() + diff --git a/goodreads/__main__.py b/goodreads/__main__.py new file mode 100644 index 0000000..e69de29 diff --git a/run b/run new file mode 100755 index 0000000..d610c53 --- /dev/null +++ b/run @@ -0,0 +1,6 @@ +#!/bin/bash +set -eu + +cd "$(dirname "$0")" + +python3 -m goodreads From 0fa5b0c1e0502bcda6f2a6f5f768ec23e4b072b0 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 5 Sep 2018 15:13:48 +0400 Subject: [PATCH 2/5] main --- goodreads/__init__.py | 9 --------- goodreads/__main__.py | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/goodreads/__init__.py b/goodreads/__init__.py index 46ebfe2..6cca4e4 100644 --- a/goodreads/__init__.py +++ b/goodreads/__init__.py @@ -93,12 +93,3 @@ def get_events(): )) # TODO finished? other updates? return sorted(events, key=lambda e: e.dt) - -def main(): - for e in get_events(): - print(e) - - -if __name__ == '__main__': - main() - diff --git a/goodreads/__main__.py b/goodreads/__main__.py index e69de29..f38c577 100644 --- a/goodreads/__main__.py +++ b/goodreads/__main__.py @@ -0,0 +1,9 @@ +from goodreads import get_events + +def main(): + for e in get_events(): + print(e) + + +if __name__ == '__main__': + main() From 58eb036e5a53afdafdec0b8adc16d9e589976250 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 8 May 2019 21:09:06 +0100 Subject: [PATCH 3/5] simplify provider --- ci.sh | 10 ---------- goodreads/__init__.py | 20 ++++++++++++++------ goodreads/__main__.py | 9 --------- run | 6 ------ 4 files changed, 14 insertions(+), 31 deletions(-) delete mode 100755 ci.sh delete mode 100644 goodreads/__main__.py delete mode 100755 run diff --git a/ci.sh b/ci.sh deleted file mode 100755 index 0546b95..0000000 --- a/ci.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -cd "$(this_dir)" || exit - -. ~/bash_ci - -ci_run mypy goodreads -ci_run pylint -E goodreads - -ci_report_errors diff --git a/goodreads/__init__.py b/goodreads/__init__.py index 6cca4e4..09a7aa8 100644 --- a/goodreads/__init__.py +++ b/goodreads/__init__.py @@ -1,13 +1,16 @@ -import os +from pathlib import Path +from typing import List, Dict, NamedTuple +from datetime import datetime + from xml.dom.minidom import parseString # type: ignore -BPATH = "/L/backups/goodreads" +BPATH = Path("/L/backups/goodreads") # TODO might be useful to keep track of updates?... # then I need some sort of system to store diffs in generic way... # althogh... coud use same mechanism as for filtering -def get_last() -> str: - return max(sorted([os.path.join(BPATH, f) for f in os.listdir(BPATH) if f.endswith('.xmll')])) +def get_last() -> Path: + return max(sorted(BPATH.glob('*.xmll'))) _SP = '' @@ -22,6 +25,7 @@ def get_reviews(): xmls.append(parseString(xx + _SP)) return xmls + def get_books(): books = [] for review in get_reviews(): @@ -70,12 +74,11 @@ def get_books(): books.append(book) return books -from typing import List, Dict, NamedTuple -from datetime import datetime class Event(NamedTuple): dt: datetime summary: str + eid: str def _parse_date(s: str) -> datetime: @@ -90,6 +93,11 @@ def get_events(): events.append(Event( dt=added, summary=f'Added book "{title}"', # TODO shelf? + eid=b['id'], )) # TODO finished? other updates? return sorted(events, key=lambda e: e.dt) + + +def test(): + assert len(get_events()) > 20 diff --git a/goodreads/__main__.py b/goodreads/__main__.py deleted file mode 100644 index f38c577..0000000 --- a/goodreads/__main__.py +++ /dev/null @@ -1,9 +0,0 @@ -from goodreads import get_events - -def main(): - for e in get_events(): - print(e) - - -if __name__ == '__main__': - main() diff --git a/run b/run deleted file mode 100755 index d610c53..0000000 --- a/run +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -set -eu - -cd "$(dirname "$0")" - -python3 -m goodreads From af2590ab4313df166a3f8f57e64e5908b0601baf Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 18 May 2019 09:54:01 +0100 Subject: [PATCH 4/5] better extraction --- goodreads/__init__.py | 150 +++++++++++++++++++++++++++++------------- 1 file changed, 103 insertions(+), 47 deletions(-) mode change 100644 => 100755 goodreads/__init__.py diff --git a/goodreads/__init__.py b/goodreads/__init__.py old mode 100644 new mode 100755 index 09a7aa8..9a2506d --- a/goodreads/__init__.py +++ b/goodreads/__init__.py @@ -1,8 +1,10 @@ +#!/usr/bin/env python3 from pathlib import Path -from typing import List, Dict, NamedTuple +from typing import List, Dict, NamedTuple, Iterator, Optional, Sequence from datetime import datetime +import pytz -from xml.dom.minidom import parseString # type: ignore +from xml.dom.minidom import parseString, Element # type: ignore BPATH = Path("/L/backups/goodreads") @@ -25,54 +27,77 @@ def get_reviews(): xmls.append(parseString(xx + _SP)) return xmls +class Book(NamedTuple): + bid: str + title: str + authors: Sequence[str] + shelves: Sequence[str] + date_added: datetime + date_started: Optional[datetime] + date_read: Optional[datetime] -def get_books(): - books = [] +from kython import the + + +def _parse_date(s: Optional[str]) -> Optional[datetime]: + if s is None: + return None + res = datetime.strptime(s, "%a %b %d %H:%M:%S %z %Y") + assert res.tzinfo is not None + return res + + +def iter_books() -> Iterator[Book]: for review in get_reviews(): - book_element = review.getElementsByTagName('book')[0] - title_element = book_element.getElementsByTagName('title')[0] - id_element = book_element.getElementsByTagName('id')[0] - isbn_element = book_element.getElementsByTagName('isbn')[0] - isbn13_element = book_element.getElementsByTagName('isbn13')[0] - date_added = review.getElementsByTagName('date_added')[0] - started_at = review.getElementsByTagName('started_at')[0] - read_at = review.getElementsByTagName('read_at')[0] + review_xml = the(review.childNodes) + rdict = {n.tagName: n for n in review_xml.childNodes if isinstance(n, Element)} + # fuck xml... - shelves_element = review.getElementsByTagName('shelves')[0] + book_element = rdict['book'] + title = the(the(book_element.getElementsByTagName('title')).childNodes).data + + id_element = rdict['id'] + # isbn_element = the(book_element.getElementsByTagName('isbn')) + # isbn13_element = the(book_element.getElementsByTagName('isbn13')) + date_added = the(rdict['date_added'].childNodes).data + sss = rdict['started_at'].childNodes + rrr = rdict['read_at'].childNodes + started_at = None if len(sss) == 0 else the(sss).data + read_at = None if len(rrr) == 0 else the(rrr).data + + shelves_element = rdict['shelves'] book_shelves = [] for shelf in shelves_element.getElementsByTagName('shelf'): book_shelves.append(shelf.getAttribute('name')) - book = { - 'title': title_element.firstChild.data, - 'id': id_element.firstChild.data, - 'shelves': book_shelves - } + # if isbn_element.getAttribute('nil') != 'true': + # book['isbn'] = isbn_element.firstChild.data + # else: + # book['isbn'] = '' - if isbn_element.getAttribute('nil') != 'true': - book['isbn'] = isbn_element.firstChild.data - else: - book['isbn'] = '' + # if isbn13_element.getAttribute('nil') != 'true': + # book['isbn13'] = isbn13_element.firstChild.data + # else: + # book['isbn13'] = '' - if isbn13_element.getAttribute('nil') != 'true': - book['isbn13'] = isbn13_element.firstChild.data - else: - book['isbn13'] = '' + da = _parse_date(date_added) + assert da is not None + yield Book( + bid=id_element.firstChild.data, + title=title, + shelves=book_shelves, + date_added=da, + date_started=_parse_date(started_at), + date_read=_parse_date(read_at), + ) - if started_at.firstChild is not None: - book['started_at'] = started_at.firstChild.data - else: - book['started_at'] = '' +def get_books(): + return list(iter_books()) - if read_at.firstChild is not None: - book['read_at'] = read_at.firstChild.data - else: - book['read_at'] = '' - book['date_added'] = None if date_added.firstChild is None else date_added.firstChild.data - - books.append(book) - return books +def test_books(): + books = get_books() + assert len(books) > 10 class Event(NamedTuple): @@ -81,19 +106,13 @@ class Event(NamedTuple): eid: str -def _parse_date(s: str) -> datetime: - return datetime.strptime(s, "%a %b %d %H:%M:%S %z %Y") - - def get_events(): events = [] for b in get_books(): - added = _parse_date(b['date_added']) - title = b['title'] events.append(Event( - dt=added, - summary=f'Added book "{title}"', # TODO shelf? - eid=b['id'], + dt=b.date_added, + summary=f'Added book "{b.title}"', # TODO shelf? + eid=b.bid )) # TODO finished? other updates? return sorted(events, key=lambda e: e.dt) @@ -101,3 +120,40 @@ def get_events(): def test(): assert len(get_events()) > 20 + + +def print_read_history(): + def key(b): + read = b.date_read + if read is None: + return datetime.fromtimestamp(0, pytz.utc) + else: + return read + + def fmtdt(dt): + if dt is None: + return dt + tz = pytz.timezone('Europe/London') + return dt.astimezone(tz) + for b in sorted(iter_books(), key=key): + print(b.title) + print(f' started : {fmtdt(b.date_started)}') + print(f' finished: {fmtdt(b.date_read)}') + print() + + +def main(): + import argparse + p = argparse.ArgumentParser() + sp = p.add_argument('mode', nargs='?') + args = p.parse_args() + + if args.mode == 'history': + print_read_history() + else: + assert args.mode is None + for b in iter_books(): + print(b) + +if __name__ == '__main__': + main() From 9e7a2b895c7797864578c73e38af65d5c43edc37 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 18 May 2019 10:12:18 +0100 Subject: [PATCH 5/5] nicer extraction, switch to lxml --- goodreads/__init__.py | 46 +++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/goodreads/__init__.py b/goodreads/__init__.py index 9a2506d..accac9f 100755 --- a/goodreads/__init__.py +++ b/goodreads/__init__.py @@ -4,7 +4,7 @@ from typing import List, Dict, NamedTuple, Iterator, Optional, Sequence from datetime import datetime import pytz -from xml.dom.minidom import parseString, Element # type: ignore +from lxml import etree as ET # type: ignore BPATH = Path("/L/backups/goodreads") @@ -24,7 +24,7 @@ def get_reviews(): for xx in data.split(_SP): if len(xx.strip()) == 0: break - xmls.append(parseString(xx + _SP)) + xmls.append(ET.fromstring(xx + _SP)) return xmls class Book(NamedTuple): @@ -48,27 +48,25 @@ def _parse_date(s: Optional[str]) -> Optional[datetime]: def iter_books() -> Iterator[Book]: - for review in get_reviews(): - review_xml = the(review.childNodes) - rdict = {n.tagName: n for n in review_xml.childNodes if isinstance(n, Element)} + for r in get_reviews(): + # review_xml = the(review.childNodes) + # rdict = {n.tagName: n for n in review_xml.childNodes if isinstance(n, Element)} # fuck xml... - book_element = rdict['book'] - title = the(the(book_element.getElementsByTagName('title')).childNodes).data + be = the(r.xpath('book')) + title = the(be.xpath('title/text()')) + authors = be.xpath('authors/author/name/text()') - id_element = rdict['id'] + bid = the(r.xpath('id/text()')) # isbn_element = the(book_element.getElementsByTagName('isbn')) # isbn13_element = the(book_element.getElementsByTagName('isbn13')) - date_added = the(rdict['date_added'].childNodes).data - sss = rdict['started_at'].childNodes - rrr = rdict['read_at'].childNodes - started_at = None if len(sss) == 0 else the(sss).data - read_at = None if len(rrr) == 0 else the(rrr).data + date_added = the(r.xpath('date_added/text()')) + sss = r.xpath('started_at/text()') + rrr = r.xpath('read_at/text()') + started_at = None if len(sss) == 0 else the(sss) + read_at = None if len(rrr) == 0 else the(rrr) - shelves_element = rdict['shelves'] - book_shelves = [] - for shelf in shelves_element.getElementsByTagName('shelf'): - book_shelves.append(shelf.getAttribute('name')) + shelves = r.xpath('shelves/shelf/name/text()') # if isbn_element.getAttribute('nil') != 'true': # book['isbn'] = isbn_element.firstChild.data @@ -83,9 +81,10 @@ def iter_books() -> Iterator[Book]: da = _parse_date(date_added) assert da is not None yield Book( - bid=id_element.firstChild.data, + bid=bid, title=title, - shelves=book_shelves, + authors=authors, + shelves=shelves, date_added=da, date_started=_parse_date(started_at), date_read=_parse_date(read_at), @@ -136,10 +135,11 @@ def print_read_history(): tz = pytz.timezone('Europe/London') return dt.astimezone(tz) for b in sorted(iter_books(), key=key): - print(b.title) - print(f' started : {fmtdt(b.date_started)}') - print(f' finished: {fmtdt(b.date_read)}') - print() + print(f""" +{b.title} by {', '.join(b.authors)} + started : {fmtdt(b.date_started)} + finished: {fmtdt(b.date_read)} + """) def main():