Merge pull request #53 from karlicoss/upd

make my.twitter.all easier to override
2020-05-24 23:02:57 +01:00 · 2020-05-24 23:02:57 +01:00 · af814df8e9
commit af814df8e9
parent c410daa484 f5267d05d7
12 changed files with 195 additions and 127 deletions
--- a/doc/CONFIGURING.org
+++ b/doc/CONFIGURING.org
@ -1,3 +1,6 @@
+This doc describes the technical decisions behind the HPI configuration system.
+If you just want to know how to set it up, see [[file:SETUP.org][SETUP]].
+
 I feel like it's good to keep the rationales in the documentation,
 but happy to [[https://github.com/karlicoss/HPI/issues/46][discuss]] it here.

--- a/doc/DEVELOPMENT.org
+++ b/doc/DEVELOPMENT.org
@ -8,7 +8,6 @@
 - [[#running-tests][Running tests]]
 - [[#ide-setup][IDE setup]]
 - [[#linting][Linting]]
- [[#modifyingadding-modules][Modifying/adding modules]]
 :END:

 * Running tests
@ -18,9 +17,9 @@ and [[file:../scripts/ci/run]] for the up to date info on the specifics.
 * IDE setup
 To benefit from type hinting, make sure =my.config= is in your package search path.

-In runtime, ~my.config~ is imported from the user config directory dynamically.
+At runtime, ~my.config~ is imported from the user config directory [[file:../my/core/init.py][dynamically]].

-However, Pycharm/Emacs/whatever you use won't be able to figure that out, so you'd need to adjust your IDE configuration.
+However, Pycharm/Emacs or whatever IDE you are using won't be able to figure that out, so you'd need to adjust your IDE configuration.

 - Pycharm: basically, follow the instructions [[https://stackoverflow.com/a/55278260/706389][here]]

@ -30,33 +29,3 @@ However, Pycharm/Emacs/whatever you use won't be able to figure that out, so you
 You should be able to use [[file:../lint]] script to run mypy checks.

 [[file:../mypy.ini]] points at =~/.config/my= by default.
-
-
-* Modifying/adding modules
-
-The easiest is just to run HPI via [[file:SETUP.org::#use-without-installing][with_my]] wrapper or with an editable PIP install.
-That way your changes will be reflected immediately, and you will be able to quickly iterate/fix bugs/add new methods.
-
-The "proper way" (unless you want to contribute to the upstream) is to create a separate hierarchy and add your module to =PYTHONPATH=.
-
-For example, if you want to add an =awesomedatasource=, it could be:
-
-: custom_module
-: └── my
-:     └──awesomedatasource.py
-
-You can use all existing HPI modules in =awesomedatasource.py=, for example, =my.config=, or everything from =my.core=.
-
-But also, you can use all the previously defined HPI modules too. This could be useful to *shadow/override* existing HPI module:
-
-: custom_reddit_overlay
-: └── my
-:     └──reddit.py
-
-Now if you add =my_reddit_overlay= *in the front* of ~PYTHONPATH~, all the downstream scripts using =my.reddit= will load it from =custom_reddit_overlay= instead.
-
-This could be useful to monkey patch some behaviours, or dynamically add some extra data sources -- anything that comes to your mind.
-
-I'll put up a better guide on this, in the meantime see [[https://packaging.python.org/guides/packaging-namespace-packages]["namespace packages"]] for more info.
-
-# TODO add example with overriding 'all'
--- a/doc/MODULES.org
+++ b/doc/MODULES.org
@ -36,6 +36,7 @@ Some explanations:
  - =/a/path/to/directory/=, so the module will consume all files from this directory
  - a list of files/directories (it will be flattened)
  - a [[https://docs.python.org/3/library/glob.html?highlight=glob#glob.glob][glob]] string, so you can be flexible about the format of your data on disk (e.g. if you want to keep it compressed)
+  - an empty sequence (e.g. ~export_path = ()~); this is useful for modules that merge multiple data sources (for example, =my.twitter=)

  Typically, such variable will be passed to =get_files= to actually extract the list of real files to use. You can see usage examples [[https://github.com/karlicoss/HPI/blob/master/tests/get_files.py][here]].

@ -55,14 +56,14 @@ import importlib
 # from lint import all_modules # meh
 # TODO figure out how to discover configs automatically...
 modules = [
-    ('google'     , 'my.google.takeout.paths'),
-    ('hypothesis' , 'my.hypothesis'          ),
-    ('reddit'     , 'my.reddit'              ),
-    ('twint'      , 'my.twitter.twint'       ),
-    ('twitter'    , 'my.twitter.archive'     ),
-    ('lastfm'     , 'my.lastfm'              ),
-    ('polar'      , 'my.reading.polar'       ),
-    ('instapaper' , 'my.instapaper'          ),
+    ('google'         , 'my.google.takeout.paths'),
+    ('hypothesis'     , 'my.hypothesis'          ),
+    ('reddit'         , 'my.reddit'              ),
+    ('twint'          , 'my.twitter.twint'       ),
+    ('twitter_archive', 'my.twitter.archive'     ),
+    ('lastfm'         , 'my.lastfm'              ),
+    ('polar'          , 'my.reading.polar'       ),
+    ('instapaper'     , 'my.instapaper'          ),
 ]

 def indent(s, spaces=4):
@ -146,6 +147,8 @@ for cls, p in modules:

    Uses [[https://github.com/twintproject/twint][Twint]] data export.

+    Requirements: =pip3 install --user dataset=
+
    #+begin_src python
    class twint:
        export_path: Paths # path[s]/glob to the twint Sqlite database
@ -155,7 +158,7 @@ for cls, p in modules:
    Twitter data (uses [[https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive][official twitter archive export]])

    #+begin_src python
-    class twitter:
+    class twitter_archive:
        export_path: Paths # path[s]/glob to the twitter archive takeout
    #+end_src
 ** [[file:../my/lastfm][my.lastfm]]
@ -171,7 +174,7 @@ for cls, p in modules:
    #+end_src
 ** [[file:../my/reading/polar.py][my.reading.polar]]

-    [[https://github.com/burtonator/polar-books][Polar]] articles and highlights
+    [[https://github.com/burtonator/polar-bookshelf][Polar]] articles and highlights

    #+begin_src python
    class polar:
--- a/doc/SETUP.org
+++ b/doc/SETUP.org
@ -11,12 +11,12 @@ You'd be really helping me, I want to make the setup as straightforward as possi
 :CONTENTS:
 - [[#toc][TOC]]
 - [[#few-notes][Few notes]]
- [[#setting-up-the-main-package][Setting up the main package]]
+- [[#install-main-hpi-package][Install main HPI package]]
  - [[#option-1-install-from-pip][option 1: install from PIP]]
-  - [[#option-2-local-install][option 2: local install]]
+  - [[#option-2-localeditable-install][option 2: local/editable install]]
  - [[#option-3-use-without-installing][option 3: use without installing]]
- [[#optional-packages][Optional packages]]
- [[#setting-up-the-modules][Setting up the modules]]
+  - [[#appendix-optional-packages][appendix: optional packages]]
+- [[#setting-up-modules][Setting up modules]]
  - [[#private-configuration-myconfig][private configuration (my.config)]]
  - [[#module-dependencies][module dependencies]]
 - [[#usage-examples][Usage examples]]
@ -27,6 +27,7 @@ You'd be really helping me, I want to make the setup as straightforward as possi
  - [[#orger][Orger]]
    - [[#orger--polar][Orger + Polar]]
  - [[#demopy][demo.py]]
+- [[#addingmodifying-modules][Adding/modifying modules]]
 :END:


@ -45,7 +46,7 @@ I understand people may not super familiar with Python, PIP or generally unix, s

  See [[https://github.com/ActiveState/appdirs/blob/3fe6a83776843a46f20c2e5587afcffe05e03b39/appdirs.py#L187-L190][this]] if you're not sure what's your user config dir.

-* Setting up the main package
+* Install main HPI package
 This is a *required step*

 You can choose one of the following options:
@ -55,7 +56,7 @@ This is the *easiest way*:

 : pip3 install --user HPI

-** option 2: local install
+** option 2: local/editable install
 This is convenient if you're planning to add new modules or change the existing ones.

 1. Clone the repository: =git clone git@github.com:karlicoss/HPI.git /path/to/hpi=
@ -63,7 +64,7 @@ This is convenient if you're planning to add new modules or change the existing
 2. Run  ~pip3 install --user -e .~

   This will install the package in 'editable mode'.
-   It will basically be a link to =/path/to/hpi=, which means any changes in the cloned repo will be immediately reflected without need to reinstall anything.
+   It means that any changes to =/path/to/hpi= will be immediately reflected without the need to reinstall anything.

   It's *extremely* convenient for developing and debugging.
  
@ -87,12 +88,12 @@ This is less convenient, but gives you more control.

 The benefit of this way is that you get a bit more control, explicitly allowing your scripts to use your data.

-* Optional packages
+** appendix: optional packages
 You can also install some optional packages

 : pip3 install 'HPI[optional]'

-They aren't necessary, but improve your experience. At the moment these are:
+They aren't necessary, but will improve your experience. At the moment these are:

 - [[https://github.com/karlicoss/cachew][cachew]]: automatic caching library, which can greatly speed up data access
 - [[https://github.com/metachris/logzero][logzero]]: a nice logging library, supporting colors
@ -223,12 +224,20 @@ Generally you can just try using the module and then install missing packages vi
 If you run your script with ~with_my~ wrapper, you'd have ~my~ in ~PYTHONPATH~ which gives you access to your data from within the script.

 ** End-to-end Roam Research setup
-In [[https://beepb00p.xyz/myinfra-roam.html#export][this]] post you can trace all steps starting from exporting your data to integrating with HPI package.
+In [[https://beepb00p.xyz/myinfra-roam.html#export][this]] post you can trace all steps:
+
+- learn how to export your raw data
+- integrate it with HPI package
+- benefit from HPI integration
+
+  - use interactively in ipython
+  - use with [[https://github.com/karlicoss/orger][Orger]]
+  - use with [[https://github.com/karlicoss/promnesia][Promnesia]]

 If you want to set up a new data source, it could be a good learning reference.

 ** Polar
-Polar doesn't require any setup as it accesses the highlights on your filesystem (should be in =~/.polar=).
+Polar doesn't require any setup as it accesses the highlights on your filesystem (usually in =~/.polar=).

 You can try if it works with:

@ -254,7 +263,7 @@ If you have zip Google Takeout archives, you can use HPI to access it:


 ** Kobo reader
-Kobo provider allows you to access the books you've read along with the highlights and notes.
+The Kobo module allows you to access the books you've read along with the highlights and notes.
 It uses exports provided by [[https://github.com/karlicoss/kobuddy][kobuddy]] package.

 - prepare the config
@ -265,6 +274,7 @@ It uses exports provided by [[https://github.com/karlicoss/kobuddy][kobuddy]] pa
    class kobo:
        export_dir = 'path/to/kobo/exports'
    #+end_src
+    # TODO FIXME kobuddy path

 After that you should be able to use it:

@ -281,9 +291,42 @@ Some examples (assuming you've [[https://github.com/karlicoss/orger#installing][

 *** Orger + [[https://github.com/burtonator/polar-bookshelf][Polar]]

-This will convert Polar highlights into org-mode:
+This will mirror Polar highlights as org-mode:

 : orger/modules/polar.py --to polar.org

 ** =demo.py=
-read/run [[../demo.py][demo.py]] for a full demonstration of setting up Hypothesis (it uses public annotations data from Github)
+read/run [[../demo.py][demo.py]] for a full demonstration of setting up Hypothesis (uses annotations data from a public Github repository)
+
+* Adding/modifying modules
+# TODO link to 'overlays' documentation?
+# TODO don't be afraid to TODO make sure to install in editable mode
+
+The easiest way is to run HPI via the [[#use-without-installing][with_my]] wrapper or with an editable PIP install.
+That way your changes will be reflected immediately, and you will be able to quickly iterate/fix bugs/add new methods.
+
+# TODO eh. doesn't even have to be in 'my' namespace?? need to check it
+The "proper way" (unless you want to contribute to the upstream) is to create a separate file hierarchy and add your module to =PYTHONPATH=.
+
+For example, if you want to add an =awesomedatasource=, it could be:
+
+: custom_module
+: └── my
+:     └──awesomedatasource.py
+
+You can use all existing HPI modules in =awesomedatasource.py=, for example, =my.config=, or everything from =my.core=.
+
+You can also *override* the builtin HPI modules:
+
+: custom_reddit_overlay
+: └── my
+:     └──reddit.py
+
+# TODO confusing
+Now if you add =custom_reddit_overlay= *to the front* of ~PYTHONPATH~, all the downstream scripts using =my.reddit= will load it from =custom_reddit_overlay= instead.
+
+This could be useful to monkey patch some behaviours, or dynamically add some extra data sources -- anything that comes to your mind.
+
+I'll put up a better guide on this, in the meantime see [[https://packaging.python.org/guides/packaging-namespace-packages]["namespace packages"]] for more info.
+
+# TODO add example with overriding 'all'
--- a/my/core/common.py
+++ b/my/core/common.py
@ -151,6 +151,12 @@ def get_files(pp: Paths, glob: str=DEFAULT_GLOB, sort: bool=True) -> Tuple[Path,

    if sort:
        paths = list(sorted(paths))
+
+    if len(paths) == 0:
+        # todo make it conditionally defensive based on some global settings; stacktrace?
+        # report the caller-supplied 'pp': 'paths' is empty on this branch, so printing it would tell the user nothing
+        warnings.warn(f'No paths were matched against {pp}. This might result in missing data.')
+
    return tuple(paths)


--- a/my/reading/polar.py
+++ b/my/reading/polar.py
@ -1,5 +1,5 @@
 """
-[[https://github.com/burtonator/polar-books][Polar]] articles and highlights
+[[https://github.com/burtonator/polar-bookshelf][Polar]] articles and highlights
 """
 from pathlib import Path
 from typing import Type, Any, cast, TYPE_CHECKING
--- a/my/twitter/all.py
+++ b/my/twitter/all.py
@ -1,24 +1,23 @@
 """
 Unified Twitter data (merged from the archive and periodic updates)
 """
-from itertools import chain

-from . import twint
-from . import archive
+# NOTE: you can comment out the sources you don't need


-# TODO move to .common?
-def merge_tweets(*sources):
-    from more_itertools import unique_everseen
-    yield from unique_everseen(
-        chain(*sources),
-        key=lambda t: t.id_str,
-    )
+from . import twint, archive
+from .common import merge_tweets


 def tweets():
-    yield from merge_tweets(twint.tweets(), archive.tweets())
+    yield from merge_tweets(
+        twint  .tweets(),
+        archive.tweets(),
+    )


 def likes():
-    yield from merge_tweets(twint.likes(), archive.likes())
+    yield from merge_tweets(
+        twint  .likes(),
+        archive.likes(),
+    )
--- a/my/twitter/archive.py
+++ b/my/twitter/archive.py
@ -1,26 +1,40 @@
 """
 Twitter data (uses [[https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive][official twitter archive export]])
 """
+
+
+# This config section used to be named 'twitter', which doesn't make much sense for the archive module,
+# so import the new name ('twitter_archive') first and fall back to the old one defensively.
+try:
+    from my.config import twitter_archive as user_config
+except ImportError as e:
+    try:
+        from my.config import twitter as user_config
+    except ImportError:
+        raise e # neither name could be imported: re-raise the original exception, the cause must be something else
+    else:
+        import warnings
+        warnings.warn('my.config.twitter is deprecated! Please rename it to my.config.twitter_archive in your config')
+
+
 from dataclasses import dataclass
 from ..core.common import Paths

-from my.config import twitter as user_config
-
+# TODO perhaps rename to twitter_archive? dunno
@dataclass
-class twitter(user_config):
+class twitter_archive(user_config):
    export_path: Paths # path[s]/glob to the twitter archive takeout


 ###

 from ..core.cfg import make_config
-config = make_config(twitter)
+config = make_config(twitter_archive)


 from datetime import datetime
-from typing import Union, List, Dict, Set, Optional, Iterator, Any, NamedTuple
+from typing import Union, List, Dict, Set, Optional, Iterable, Any, NamedTuple, Sequence
 from pathlib import Path
-from functools import lru_cache
 import json
 import zipfile

@ -34,8 +48,8 @@ from ..kython import kompress
 logger = LazyLogger(__name__)


-def _get_export() -> Path:
-    return max(get_files(config.export_path))
+def inputs() -> Sequence[Path]:
+    return get_files(config.export_path)[-1:]


 Tid = str
@ -114,9 +128,10 @@ class Like(NamedTuple):
        return self.id_str


+from functools import lru_cache
 class ZipExport:
-    def __init__(self) -> None:
-        self.epath = _get_export()
+    def __init__(self, archive_path: Path) -> None:
+        self.epath = archive_path

        self.old_format = False # changed somewhere around 2020.03
        if not kompress.kexists(self.epath, 'Your archive.html'):
@ -148,62 +163,24 @@ class ZipExport:
        [acc] = self.raw('account')
        return acc['username']

-    def tweets(self) -> Iterator[Tweet]:
+    def tweets(self) -> Iterable[Tweet]:
        for r in self.raw('tweet'):
            yield Tweet(r, screen_name=self.screen_name())


-    def likes(self) -> Iterator[Like]:
+    def likes(self) -> Iterable[Like]:
        # TODO ugh. would be nice to unify Tweet/Like interface
        # however, takeout only got tweetId, full text and url
        for r in self.raw('like'):
            yield Like(r, screen_name=self.screen_name())


-def tweets() -> List[Tweet]:
-    return list(sorted(ZipExport().tweets(), key=lambda t: t.dt))
+# todo not sure about list and sorting? although can't hurt considering json is not iterative?
+def tweets() -> Iterable[Tweet]:
+    for inp in inputs():
+        yield from sorted(ZipExport(inp).tweets(), key=lambda t: t.dt)


-def likes() -> List[Like]:
-    return list(ZipExport().likes())
-
-
-def test_tweet():
-    raw = """
- {
-  "retweeted" : false,
-  "entities" : {
-    "hashtags" : [ ],
-    "symbols" : [ ],
-    "user_mentions" : [ ],
-    "urls" : [ {
-      "url" : "https://t.co/vUg4W6nxwU",
-      "expanded_url" : "https://intelligence.org/2013/12/13/aaronson/",
-      "display_url" : "intelligence.org/2013/12/13/aar…",
-      "indices" : [ "120", "143" ]
-    }
-    ]
-  },
-  "display_text_range" : [ "0", "90" ],
-  "favorite_count" : "0",
-  "in_reply_to_status_id_str" : "24123424",
-  "id_str" : "2328934829084",
-  "in_reply_to_user_id" : "23423424",
-  "truncated" : false,
-  "retweet_count" : "0",
-  "id" : "23492349032940",
-  "in_reply_to_status_id" : "23482984932084",
-  "created_at" : "Thu Aug 30 07:12:48 +0000 2012",
-  "favorited" : false,
-  "full_text" : "this is a test tweet",
-  "lang" : "ru",
-  "in_reply_to_screen_name" : "whatever",
-  "in_reply_to_user_id_str" : "3748274"
-}
-    """
-    t = Tweet(json.loads(raw), screen_name='whatever')
-    assert t.permalink is not None
-    assert t.dt == datetime(year=2012, month=8, day=30, hour=7, minute=12, second=48, tzinfo=pytz.utc)
-    assert t.text == 'this is a test tweet'
-    assert t.tid  == '2328934829084'
-    assert t.entities is not None
+def likes() -> Iterable[Like]:
+    for inp in inputs():
+        yield from ZipExport(inp).likes()
--- a/my/twitter/common.py
+++ b/my/twitter/common.py
@ -0,0 +1,10 @@
+from itertools import chain
+
+from more_itertools import unique_everseen
+
+
+def merge_tweets(*sources):
+    yield from unique_everseen(
+        chain(*sources),
+        key=lambda t: t.id_str,
+    )
--- a/my/twitter/twint.py
+++ b/my/twitter/twint.py
@ -2,6 +2,8 @@
 Twitter data (tweets and favorites).

 Uses [[https://github.com/twintproject/twint][Twint]] data export.
+
+Requirements: =pip3 install --user dataset=
 """

 from ..core.common import Paths
--- a/tests/get_files.py
+++ b/tests/get_files.py
@ -97,6 +97,15 @@ def test_implicit_glob():
        Path('/tmp/hpi_test/456/file.zip'),
    )

+
+def test_no_files():
+    '''
+    Test for empty matches: they work (an empty tuple is returned), but should result in a warning.
+    '''
+    assert get_files([])         == ()
+    assert get_files('bad*glob') == ()
+
+
 # TODO not sure if should uniquify if the filenames end up same?
 # TODO not sure about the symlinks? and hidden files?

--- a/tests/tweets.py
+++ b/tests/tweets.py
@ -0,0 +1,47 @@
+from datetime import datetime
+import json
+
+import pytz
+
+from my.twitter.archive import Tweet
+
+
+def test_tweet():
+    raw = """
+ {
+  "retweeted" : false,
+  "entities" : {
+    "hashtags" : [ ],
+    "symbols" : [ ],
+    "user_mentions" : [ ],
+    "urls" : [ {
+      "url" : "https://t.co/vUg4W6nxwU",
+      "expanded_url" : "https://intelligence.org/2013/12/13/aaronson/",
+      "display_url" : "intelligence.org/2013/12/13/aar…",
+      "indices" : [ "120", "143" ]
+    }
+    ]
+  },
+  "display_text_range" : [ "0", "90" ],
+  "favorite_count" : "0",
+  "in_reply_to_status_id_str" : "24123424",
+  "id_str" : "2328934829084",
+  "in_reply_to_user_id" : "23423424",
+  "truncated" : false,
+  "retweet_count" : "0",
+  "id" : "23492349032940",
+  "in_reply_to_status_id" : "23482984932084",
+  "created_at" : "Thu Aug 30 07:12:48 +0000 2012",
+  "favorited" : false,
+  "full_text" : "this is a test tweet",
+  "lang" : "ru",
+  "in_reply_to_screen_name" : "whatever",
+  "in_reply_to_user_id_str" : "3748274"
+}
+    """
+    t = Tweet(json.loads(raw), screen_name='whatever')
+    assert t.permalink is not None
+    assert t.dt == datetime(year=2012, month=8, day=30, hour=7, minute=12, second=48, tzinfo=pytz.utc)
+    assert t.text == 'this is a test tweet'
+    assert t.tid  == '2328934829084'
+    assert t.entities is not None