browser: parse browser history using browserexport (#216)

* browser: parse browser history using browserexport

from seanbreckenridge/HPI module:
1fba8ccf2f/my/browser/export.py
This commit is contained in:
seanbreckenridge 2022-02-13 15:56:05 -08:00 committed by GitHub
parent 059c4ae791
commit 9e5cd60ff2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 198 additions and 24 deletions

View file

@ -63,6 +63,50 @@ The config snippets below are meant to be modified accordingly and *pasted into
You don't have to set up all modules at once, it's recommended to do it gradually, to get the feel of how HPI works.
# Nested Configurations before the doc generation using the block below
** [[file:../my/reddit][my.reddit]]
Reddit data: saved items/comments/upvotes/etc.
# Note: can't be generated as easily since this is a nested configuration object
#+begin_src python
class reddit:
class rexport:
'''
Uses [[https://github.com/karlicoss/rexport][rexport]] output.
'''
# path[s]/glob to the exported JSON data
export_path: Paths
class pushshift:
'''
Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
'''
# path[s]/glob to the exported JSON data
export_path: Paths
#+end_src
** [[file:../my/browser/][my.browser]]
Parses browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]]
#+begin_src python
@dataclass
class browser:
class export:
# path[s]/glob to your backed up browser history sqlite files
export_path: Paths
class active_browser:
# paths to sqlite database files which you use actively
# to read from. For example:
# from browserexport.browsers.all import Firefox
# export_path = Firefox.locate_database()
export_path: Paths
#+end_src
# TODO hmm. drawer raw means it can output outlines, but then have to manually erase the generated results. ugh.
#+begin_src python :dir .. :results output drawer raw :exports result
@ -139,30 +183,6 @@ for cls, p in modules:
# path[s]/glob to the exported JSON data
export_path: Paths
#+end_src
** [[file:../my/reddit][my.reddit]]
Reddit data: saved items/comments/upvotes/etc.
# Note: can't be generated as easily since this is a nested configuration object
#+begin_src python
class reddit:
class rexport:
'''
Uses [[https://github.com/karlicoss/rexport][rexport]] output.
'''
# path[s]/glob to the exported JSON data
export_path: Paths
class pushshift:
'''
Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
'''
# path[s]/glob to the exported JSON data
export_path: Paths
#+end_src
** [[file:../my/pocket.py][my.pocket]]
[[https://getpocket.com][Pocket]] bookmarks and highlights

View file

@ -0,0 +1,50 @@
"""
Parses active browser history by backing it up with [[http://github.com/seanbreckenridge/sqlite_backup][sqlite_backup]]
"""
# third-party packages imported by this module
REQUIRES = ["browserexport", "sqlite_backup"]
from my.config import browser as user_config
from my.core import Paths, dataclass
# settings for this module, layered on top of the user's my.config.browser.active_browser
@dataclass
class config(user_config.active_browser):
# paths to sqlite database files which you use actively
# to read from. For example:
# from browserexport.browsers.all import Firefox
# export_path = Firefox.locate_database()
export_path: Paths
from pathlib import Path
from typing import Sequence, Iterator
from my.core import get_files, Stats
from browserexport.merge import read_visits, Visit
from sqlite_backup import sqlite_backup
from .common import _patch_browserexport_logs
# runs at import time: forwards the HPI_LOGS level (if set) to browserexport's logger
_patch_browserexport_logs()
def inputs() -> Sequence[Path]:
    """Expand ``config.export_path`` into the database files to read."""
    live_databases = get_files(config.export_path)
    return live_databases
def history() -> Iterator[Visit]:
    """
    Yield the visits stored in every database returned by inputs().

    Each database is passed through sqlite_backup first, and the visits are
    read from the connection it returns.
    """
    for live_db in inputs():
        snapshot = sqlite_backup(live_db)
        # a connection is required from here on
        # NOTE(review): presumably sqlite_backup is Optional-typed and this narrows it -- confirm
        assert snapshot is not None
        try:
            yield from read_visits(snapshot)
        finally:
            # runs on normal exhaustion, on errors, and when the consumer
            # stops iterating early
            snapshot.close()
def stats() -> Stats:
    """Return the result of ``stat(history)`` as a fresh dict."""
    from my.core import stat

    collected = stat(history)
    return {**collected}

35
my/browser/all.py Normal file
View file

@ -0,0 +1,35 @@
from typing import Iterator
from my.core import Stats
from my.core.source import import_source
from browserexport.merge import merge_visits, Visit
# one import_source decorator per backing module; applied to the _visits_* helpers below
# NOTE(review): presumably this lets history() tolerate a module that is missing or unconfigured -- confirm against my.core.source
src_export = import_source(module_name="my.browser.export")
src_active = import_source(module_name="my.browser.active_browser")
@src_export
def _visits_export() -> Iterator[Visit]:
    """Return the visits produced by ``my.browser.export.history()``."""
    # imported inside the function so that importing this module does not import the source
    from . import export as export_source

    return export_source.history()
@src_active
def _visits_active() -> Iterator[Visit]:
    """Return the visits produced by ``my.browser.active_browser.history()``."""
    # imported inside the function so that importing this module does not import the source
    from . import active_browser as active_source

    return active_source.history()
# NOTE: you can comment out the sources you don't need
def history() -> Iterator[Visit]:
    """Yield the visits of every listed source, combined by ``merge_visits``."""
    sources = [
        _visits_active(),
        _visits_export(),
    ]
    yield from merge_visits(sources)
def stats() -> Stats:
    """Return the result of ``stat(history)`` as a fresh dict."""
    from my.core import stat

    collected = stat(history)
    return {**collected}

11
my/browser/common.py Normal file
View file

@ -0,0 +1,11 @@
import os
from my.core.util import __NOT_HPI_MODULE__
def _patch_browserexport_logs():
    """
    Set up browserexport's logger at the level named by ``HPI_LOGS``.

    Does nothing when the environment variable is not set.
    """
    if "HPI_LOGS" not in os.environ:
        return

    # imported lazily: only needed once we know the variable is present
    from browserexport.log import setup as setup_browserexport_logger
    from my.core.logging import mklevel

    requested_level = mklevel(os.environ["HPI_LOGS"])
    setup_browserexport_logger(requested_level)

50
my/browser/export.py Normal file
View file

@ -0,0 +1,50 @@
"""
Parses browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]]
"""
# third-party packages imported by this module
REQUIRES = ["browserexport"]
from my.config import browser as user_config
from my.core import Paths, dataclass
# settings for this module, layered on top of the user's my.config.browser.export
@dataclass
class config(user_config.export):
# path[s]/glob to your backed up browser history sqlite files
export_path: Paths
from pathlib import Path
from typing import Iterator, Sequence, List
from my.core import Stats, get_files, LazyLogger
from my.core.common import mcachew
from browserexport.merge import read_and_merge, Visit
from .common import _patch_browserexport_logs
# module logger created with level "warning"; also handed to mcachew on history()
logger = LazyLogger(__name__, level="warning")
# runs at import time: forwards the HPI_LOGS level (if set) to browserexport's logger
_patch_browserexport_logs()
# all of my backed up databases
def inputs() -> Sequence[Path]:
    """Expand ``config.export_path`` into the backed up database files."""
    backed_up_databases = get_files(config.export_path)
    return backed_up_databases
def _cachew_depends_on() -> List[str]:
    """Return every input path as a string; passed to mcachew as ``depends_on`` for history()."""
    return list(map(str, inputs()))
@mcachew(depends_on=_cachew_depends_on, logger=logger)
def history() -> Iterator[Visit]:
    """Yield the visits that ``read_and_merge`` produces from the files returned by inputs()."""
    database_files = inputs()
    yield from read_and_merge(database_files)
def stats() -> Stats:
    """Return the result of ``stat(history)`` as a fresh dict."""
    from my.core import stat

    collected = stat(history)
    return {**collected}

View file

@ -129,3 +129,9 @@ class fbmessenger:
class twitter:
class talon:
export_path: Paths
# NOTE(review): presumably the my.config stub that my.browser.export / my.browser.active_browser subclass -- confirm
class browser:
class export:
# defaults to an empty string
export_path: Paths = ''
class active_browser:
# defaults to an empty string
export_path: Paths = ''

View file

@ -84,6 +84,7 @@ commands =
commands =
pip install -e .[testing,optional]
hpi module install my.browser.export
hpi module install my.orgmode
hpi module install my.endomondo
hpi module install my.github.ghexport
@ -103,6 +104,7 @@ commands =
# todo fuck. -p my.github isn't checking the subpackages?? wtf...
# guess it wants .pyi file??
python3 -m mypy --install-types --non-interactive \
-p my.browser \
-p my.endomondo \
-p my.github.ghexport \
-p my.hypothesis \