diff --git a/CHANGELOG.md b/CHANGELOG.md index 3dd19df..d60ef35 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ General/my.core changes: - e81dddddf083ffd81aa7e2b715bd34f59949479c properly resolve class properties in make_config + add test Modules: -- some innitial work on filling **InfluxDB** with HPI data +- some initial work on filling **InfluxDB** with HPI data - pinboard - 42399f6250d9901d93dcedcfe05f7857babcf834: **breaking backwards compatibility**, use pinbexport module directly diff --git a/doc/OVERLAYS.org b/doc/OVERLAYS.org index a573007..7bafa48 100644 --- a/doc/OVERLAYS.org +++ b/doc/OVERLAYS.org @@ -10,7 +10,7 @@ Relevant discussion about overlays: https://github.com/karlicoss/HPI/issues/102 # You can see them TODO in overlays dir -Consider a toy package/module structure with minimal code, wihout any actual data parsing, just for demonstration purposes. +Consider a toy package/module structure with minimal code, without any actual data parsing, just for demonstration purposes. - =main= package structure # TODO do links @@ -19,7 +19,7 @@ Consider a toy package/module structure with minimal code, wihout any actual dat Extracts Twitter data from GDPR archive. - =my/twitter/all.py= Merges twitter data from multiple sources (only =gdpr= in this case), so data consumers are agnostic of specific data sources used. - This will be overriden by =overlay=. + This will be overridden by =overlay=. - =my/twitter/common.py= Contains helper function to merge data, so they can be reused by overlay's =all.py=. - =my/reddit.py= @@ -126,7 +126,7 @@ https://github.com/python/mypy/blob/1dd8e7fe654991b01bd80ef7f1f675d9e3910c3a/myp For now, I opened an issue in mypy repository https://github.com/python/mypy/issues/16683 -But ok, maybe mypy treats =main= as an external package somhow but still type checks it properly? +But ok, maybe mypy treats =main= as an external package somehow but still type checks it properly? 
Let's see what's going on with imports: : $ mypy --namespace-packages --strict -p my --follow-imports=error diff --git a/doc/QUERY.md b/doc/QUERY.md index a85450a..9a5d9d3 100644 --- a/doc/QUERY.md +++ b/doc/QUERY.md @@ -97,7 +97,7 @@ By default, this just returns the items in the order they were returned by the f hpi query my.coding.commits.commits --order-key committed_dt --limit 1 --reverse --output pprint --stream Commit(committed_dt=datetime.datetime(2023, 4, 14, 23, 9, 1, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))), authored_dt=datetime.datetime(2023, 4, 14, 23, 4, 1, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))), - message='sources.smscalls: propogate errors if there are breaking ' + message='sources.smscalls: propagate errors if there are breaking ' 'schema changes', repo='/home/username/Repos/promnesia-fork', sha='22a434fca9a28df9b0915ccf16368df129d2c9ce', diff --git a/my/core/cachew.py b/my/core/cachew.py index 9ccee09..8ce2f2b 100644 --- a/my/core/cachew.py +++ b/my/core/cachew.py @@ -136,7 +136,7 @@ if TYPE_CHECKING: CC = Callable[P, R] # need to give it a name, if inlined into bound=, mypy runs in a bug PathProvider = Union[PathIsh, Callable[P, PathIsh]] # NOTE: in cachew, HashFunction type returns str - # however in practice, cachew alwasy calls str for its result + # however in practice, cachew always calls str for its result # so perhaps better to switch it to Any in cachew as well HashFunction = Callable[P, Any] diff --git a/my/core/konsume.py b/my/core/konsume.py index 6d24167..41b5a4e 100644 --- a/my/core/konsume.py +++ b/my/core/konsume.py @@ -236,7 +236,7 @@ def test_zoom() -> None: # - very flexible, easy to adjust behaviour # - cons: # - can forget to assert about extra entities etc, so error prone -# - if we do something like =assert j.pop('status') == 200, j=, by the time assert happens we already popped item -- makes erro handling harder +# - if we do something like =assert j.pop('status') 
== 200, j=, by the time assert happens we already popped item -- makes error handling harder # - a bit verbose.. so probably requires some helper functions though (could be much leaner than current konsume though) # - if we assert, then terminates parsing too early, if we're defensive then inflates the code a lot with if statements # - TODO perhaps combine warnings somehow or at least only emit once per module? diff --git a/my/core/logging.py b/my/core/logging.py index bdee9aa..167a167 100644 --- a/my/core/logging.py +++ b/my/core/logging.py @@ -250,7 +250,7 @@ if __name__ == '__main__': test() -## legacy/deprecated methods for backwards compatilibity +## legacy/deprecated methods for backwards compatibility if not TYPE_CHECKING: from .compat import deprecated diff --git a/my/core/tests/test_tmp_config.py b/my/core/tests/test_tmp_config.py index e5a24cc..d99621d 100644 --- a/my/core/tests/test_tmp_config.py +++ b/my/core/tests/test_tmp_config.py @@ -12,7 +12,7 @@ def _init_default_config() -> None: def test_tmp_config() -> None: ## ugh. ideally this would be on the top level (would be a better test) - ## but pytest imports eveything first, executes hooks, and some reset_modules() fictures mess stuff up + ## but pytest imports everything first, executes hooks, and some reset_modules() fixtures mess stuff up ## later would be nice to be a bit more careful about them _init_default_config() from my.simple import items diff --git a/my/core/utils/itertools.py b/my/core/utils/itertools.py index 501ebbe..42b2b77 100644 --- a/my/core/utils/itertools.py +++ b/my/core/utils/itertools.py @@ -321,7 +321,7 @@ _UET = TypeVar('_UET') _UEU = TypeVar('_UEU') -# NOTE: for historic reasons, this function had to accept Callable that retuns iterator +# NOTE: for historic reasons, this function had to accept Callable that returns iterator # instead of just iterator # TODO maybe deprecated Callable support? 
not sure def unique_everseen( @@ -358,7 +358,7 @@ def test_unique_everseen() -> None: assert list(unique_everseen(fun_good)) == [123] with pytest.raises(Exception): - # since function retuns a list rather than iterator, check happens immediately + # since function returns a list rather than iterator, check happens immediately # , even without advancing the iterator unique_everseen(fun_bad) diff --git a/my/fbmessenger/__init__.py b/my/fbmessenger/__init__.py index f729de9..e5e417c 100644 --- a/my/fbmessenger/__init__.py +++ b/my/fbmessenger/__init__.py @@ -9,7 +9,7 @@ since that allows for easier overriding using namespace packages See https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#allpy for more info. """ -# prevent it from apprearing in modules list/doctor +# prevent it from appearing in modules list/doctor from ..core import __NOT_HPI_MODULE__ # kinda annoying to keep it, but it's so legacy 'hpi module install my.fbmessenger' works diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index a16d924..db4cc54 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -174,7 +174,7 @@ def _process_db_msys(db: sqlite3.Connection) -> Iterator[Res[Entity]]: However seems that when message is not sent yet it doesn't have this server id yet (happened only once, but could be just luck of course!) We exclude these messages to avoid duplication. - However poisitive filter (e.g. message_id LIKE 'mid%') feels a bit wrong, e.g. what if mesage ids change or something + However positive filter (e.g. message_id LIKE 'mid%') feels a bit wrong, e.g. what if message ids change or something So instead this excludes only such unsent messages. 
*/ message_id != offline_threading_id diff --git a/my/instagram/all.py b/my/instagram/all.py index 214e6ac..ce78409 100644 --- a/my/instagram/all.py +++ b/my/instagram/all.py @@ -23,7 +23,7 @@ def messages() -> Iterator[Res[Message]]: # TODO in general best to prefer android, it has more data # - message ids # - usernames are correct for Android data - # - thread ids more meaninful? + # - thread ids more meaningful? # but for now prefer gdpr prefix since it makes a bit things a bit more consistent? # e.g. a new batch of android exports can throw off ids if we rely on it for mapping yield from _merge_messages( diff --git a/my/instagram/gdpr.py b/my/instagram/gdpr.py index 7454a04..d417fdb 100644 --- a/my/instagram/gdpr.py +++ b/my/instagram/gdpr.py @@ -76,7 +76,7 @@ def _entities() -> Iterator[Res[User | _Message]]: # NOTE: here there are basically two options # - process inputs as is (from oldest to newest) # this would be more stable wrt newer exports (e.g. existing thread ids won't change) - # the downside is that newer exports seem to have better thread ids, so might be preferrable to use them + # the downside is that newer exports seem to have better thread ids, so might be preferable to use them # - process inputs reversed (from newest to oldest) # the upside is that thread ids/usernames might be better # the downside is that if for example the user renames, thread ids will change _a lot_, might be undesirable.. @@ -137,7 +137,7 @@ def _entitites_from_path(path: Path) -> Iterator[Res[User | _Message]]: j = json.loads(ffile.read_text()) id_len = 10 - # NOTE: I'm not actually sure it's other user's id.., since it corresponds to the whole converstation + # NOTE: I'm not actually sure it's other user's id.., since it corresponds to the whole conversation # but I stared a bit at these ids vs database ids and can't see any way to find the correspondence :( # so basically the only way to merge is to actually try some magic and correlate timestamps/message texts? 
# another option is perhaps to query user id from username with some free API diff --git a/my/reddit/__init__.py b/my/reddit/__init__.py index f344eeb..982901a 100644 --- a/my/reddit/__init__.py +++ b/my/reddit/__init__.py @@ -9,7 +9,7 @@ since that allows for easier overriding using namespace packages See https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#allpy for more info. """ -# prevent it from apprearing in modules list/doctor +# prevent it from appearing in modules list/doctor from ..core import __NOT_HPI_MODULE__ # kinda annoying to keep it, but it's so legacy 'hpi module install my.reddit' works diff --git a/my/smscalls.py b/my/smscalls.py index ccaac72..324bc44 100644 --- a/my/smscalls.py +++ b/my/smscalls.py @@ -186,7 +186,7 @@ class MMS(NamedTuple): for (addr, _type) in self.addresses: if _type == 137: return addr - # hmm, maybe return instead? but this probably shouldnt happen, means + # hmm, maybe return instead? but this probably shouldn't happen, means # something is very broken raise RuntimeError(f'No from address matching 137 found in {self.addresses}') @@ -214,7 +214,7 @@ def mms() -> Iterator[Res[MMS]]: def _resolve_null_str(value: str | None) -> str | None: if value is None: return None - # hmm.. theres some risk of the text actually being 'null', but theres + # hmm.. there's some risk of the text actually being 'null', but there's # no way to distinguish that from XML values if value == 'null': return None diff --git a/my/stackexchange/gdpr.py b/my/stackexchange/gdpr.py index 78987be..8ed0d30 100644 --- a/my/stackexchange/gdpr.py +++ b/my/stackexchange/gdpr.py @@ -49,7 +49,7 @@ class Vote(NamedTuple): # hmm, this loads very raw comments without the rest of the page? # - https://meta.stackexchange.com/posts/27319/comments#comment-57475 # - # parentPostId is the original quesion + # parentPostId is the original question # TODO is not always present? fucking hell # seems like there is no way to get a hierarchical comment link.. 
guess this needs to be handled in Promnesia normalisation... # postId is the answer diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py index 58b5bf7..1b2275b 100644 --- a/my/time/tz/via_location.py +++ b/my/time/tz/via_location.py @@ -245,7 +245,7 @@ def _iter_tzs() -> Iterator[DayWithZone]: def _day2zone() -> dict[date, pytz.BaseTzInfo]: # NOTE: kinda unfortunate that this will have to process all days before returning result for just one # however otherwise cachew cache might never be initialized properly - # so we'll always end up recomputing everyting during subsequent runs + # so we'll always end up recomputing everything during subsequent runs return {dz.day: pytz.timezone(dz.zone) for dz in _iter_tzs()} diff --git a/my/tinder/android.py b/my/tinder/android.py index a09794f..5a5d887 100644 --- a/my/tinder/android.py +++ b/my/tinder/android.py @@ -106,7 +106,7 @@ def _handle_db(db: sqlite3.Connection) -> Iterator[Res[_Entity]]: user_profile_rows = list(db.execute('SELECT * FROM profile_user_view')) if len(user_profile_rows) == 0: - # shit, sometime in 2023 profile_user_view stoppped containing user profile.. + # shit, sometime in 2023 profile_user_view stopped containing user profile.. # presumably the most common from_id/to_id would be our own username counter = Counter([id_ for (id_,) in db.execute('SELECT from_id FROM message UNION ALL SELECT to_id FROM message')]) if len(counter) > 0: # this might happen if db is empty (e.g. user got logged out) diff --git a/my/topcoder.py b/my/topcoder.py index 56403e2..40df77c 100644 --- a/my/topcoder.py +++ b/my/topcoder.py @@ -81,7 +81,7 @@ def _parse_one(p: Path) -> Iterator[Res[Competition]]: # but also expects cooperation from .make method (e.g. popping items from the dict) # could also wrap in helper and pass to .make .. not sure # an argument could be made that .make isn't really a class methond.. 
- # it's pretty specific to this parser onl + # it's pretty specific to this parser only yield from Competition.make(j=c) yield from m.check() diff --git a/my/twitter/android.py b/my/twitter/android.py index 88c9389..8159ee7 100644 --- a/my/twitter/android.py +++ b/my/twitter/android.py @@ -192,7 +192,7 @@ def get_own_user_id(conn) -> str: # - timeline_data_type # 1 : the bulk of tweets, but also some notifications etc?? # 2 : who-to-follow/community-to-join. contains a couple of tweets, but their corresponding status_id is NULL -# 8 : who-to-follow/notfication +# 8 : who-to-follow/notification # 13: semantic-core/who-to-follow # 14: cursor # 17: trends diff --git a/my/twitter/twint.py b/my/twitter/twint.py index 5106923..9d36a93 100644 --- a/my/twitter/twint.py +++ b/my/twitter/twint.py @@ -54,7 +54,7 @@ class Tweet(NamedTuple): # https://github.com/thomasancheriyil/Red-Tide-Detection-based-on-Twitter/blob/beb200be60cc66dcbc394e670513715509837812/python/twitterGapParse.py#L61-L62 # # twint is also saving 'timezone', but this is local machine timezone at the time of scraping? - # perhaps they thought date-time-ms was local time... or just kept it just in case (they are keepin lots on unnecessary stuff in the db) + # perhaps they thought date-time-ms was local time... 
or just kept it just in case (they are keeping lots of unnecessary stuff in the db) return datetime.fromtimestamp(seconds, tz=tz) @property diff --git a/my/whatsapp/android.py b/my/whatsapp/android.py index 3cd4436..a8dbe8d 100644 --- a/my/whatsapp/android.py +++ b/my/whatsapp/android.py @@ -199,7 +199,7 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Entity]: sender_row_id = r['sender_jid_row_id'] if sender_row_id == 0: # seems that it's always 0 for 1-1 chats - # for group chats our onw id is still 0, but other ids are properly set + # for group chats our own id is still 0, but other ids are properly set if from_me: myself_user_id = config.my_user_id or 'MYSELF_USER_ID' sender = Sender(id=myself_user_id, name=None) # TODO set my own name as well? diff --git a/my/youtube/takeout.py b/my/youtube/takeout.py index 703715f..8eca328 100644 --- a/my/youtube/takeout.py +++ b/my/youtube/takeout.py @@ -36,7 +36,7 @@ def watched() -> Iterator[Res[Watched]]: continue # older exports (e.g. html) didn't have microseconds - # wheras newer json ones do have them + # whereas newer json ones do have them # seconds resolution is enough to distinguish watched videos # also we're processing takeouts in HPI in reverse order, so first seen watch would contain microseconds, resulting in better data without_microsecond = w.when.replace(microsecond=0)