browser: parse browser history using browserexport (#216)

* browser: parse browser history using browserexport

from seanbreckenridge/HPI module:
1fba8ccf2f/my/browser/export.py
This commit is contained in:
seanbreckenridge 2022-02-13 15:56:05 -08:00 committed by GitHub
parent 059c4ae791
commit 9e5cd60ff2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 198 additions and 24 deletions

View file

@ -63,6 +63,50 @@ The config snippets below are meant to be modified accordingly and *pasted into
You don't have to set up all modules at once, it's recommended to do it gradually, to get the feel of how HPI works. You don't have to set up all modules at once, it's recommended to do it gradually, to get the feel of how HPI works.
# Nested Configurations before the doc generation using the block below
** [[file:../my/reddit][my.reddit]]
Reddit data: saved items/comments/upvotes/etc.
# Note: can't be generated as easily since this is a nested configuration object
#+begin_src python
class reddit:
class rexport:
'''
Uses [[https://github.com/karlicoss/rexport][rexport]] output.
'''
# path[s]/glob to the exported JSON data
export_path: Paths
class pushshift:
'''
Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
'''
# path[s]/glob to the exported JSON data
export_path: Paths
#+end_src
** [[file:../my/browser/][my.browser]]
Parses browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]]
#+begin_src python
@dataclass
class browser:
class export:
# path[s]/glob to your backed up browser history sqlite files
export_path: Paths
class active_browser:
# paths to sqlite database files which you use actively
# to read from. For example:
# from browserexport.browsers.all import Firefox
# export_path = Firefox.locate_database()
export_path: Paths
#+end_src
# TODO hmm. drawer raw means it can output outlines, but then have to manually erase the generated results. ugh. # TODO hmm. drawer raw means it can output outlines, but then have to manually erase the generated results. ugh.
#+begin_src python :dir .. :results output drawer raw :exports result #+begin_src python :dir .. :results output drawer raw :exports result
@ -139,30 +183,6 @@ for cls, p in modules:
# path[s]/glob to the exported JSON data # path[s]/glob to the exported JSON data
export_path: Paths export_path: Paths
#+end_src #+end_src
** [[file:../my/reddit][my.reddit]]
Reddit data: saved items/comments/upvotes/etc.
# Note: can't be generated as easily since this is a nested configuration object
#+begin_src python
class reddit:
class rexport:
'''
Uses [[https://github.com/karlicoss/rexport][rexport]] output.
'''
# path[s]/glob to the exported JSON data
export_path: Paths
class pushshift:
'''
Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
'''
# path[s]/glob to the exported JSON data
export_path: Paths
#+end_src
** [[file:../my/pocket.py][my.pocket]] ** [[file:../my/pocket.py][my.pocket]]
[[https://getpocket.com][Pocket]] bookmarks and highlights [[https://getpocket.com][Pocket]] bookmarks and highlights

View file

@ -0,0 +1,50 @@
"""
Parses active browser history by backing it up with [[http://github.com/seanbreckenridge/sqlite_backup][sqlite_backup]]
"""
REQUIRES = ["browserexport", "sqlite_backup"]
from my.config import browser as user_config
from my.core import Paths, dataclass
# Settings for this module, layered on top of the user's `browser.active_browser` config
@dataclass
class config(user_config.active_browser):
# paths to sqlite database files which you use actively
# to read from. For example:
# from browserexport.browsers.all import Firefox
# export_path = Firefox.locate_database()
export_path: Paths
from pathlib import Path
from typing import Sequence, Iterator
from my.core import get_files, Stats
from browserexport.merge import read_visits, Visit
from sqlite_backup import sqlite_backup
from .common import _patch_browserexport_logs
_patch_browserexport_logs()
def inputs() -> Sequence[Path]:
    """Resolve ``config.export_path`` to database paths via ``get_files``."""
    configured = config.export_path
    return get_files(configured)
def history() -> Iterator[Visit]:
    """Yield visits from every database returned by ``inputs()``.

    Each database is passed through ``sqlite_backup`` and the visits are
    read from the resulting connection with ``read_visits``.
    """
    for live_db in inputs():
        connection = sqlite_backup(live_db)
        # a connection object is required from here on
        assert connection is not None
        try:
            yield from read_visits(connection)
        finally:
            # close the connection even if reading fails or the consumer
            # stops iterating early
            connection.close()
def stats() -> Stats:
    """Return the result of ``my.core.stat`` applied to ``history``."""
    # imported at call time, as in the rest of this module's stats helpers
    from my.core import stat

    summary = stat(history)
    return {**summary}

35
my/browser/all.py Normal file
View file

@ -0,0 +1,35 @@
from typing import Iterator
from my.core import Stats
from my.core.source import import_source
from browserexport.merge import merge_visits, Visit
# Decorators built by import_source, one per backend module; each one is
# applied to the matching _visits_* wrapper below.
# NOTE(review): presumably these make a missing/unconfigured backend non-fatal -- confirm in my.core.source
src_export = import_source(module_name="my.browser.export")
src_active = import_source(module_name="my.browser.active_browser")
@src_export
def _visits_export() -> Iterator[Visit]:
    """Return the visit iterator of the ``export`` submodule."""
    # the submodule is imported only when this function is called
    from . import export as export_module

    return export_module.history()
@src_active
def _visits_active() -> Iterator[Visit]:
    """Return the visit iterator of the ``active_browser`` submodule."""
    # the submodule is imported only when this function is called
    from . import active_browser as active_module

    return active_module.history()
# NOTE: you can comment out the sources you don't need
def history() -> Iterator[Visit]:
    """Yield visits from all sources, combined by ``merge_visits``."""
    sources = [
        _visits_active(),
        _visits_export(),
    ]
    yield from merge_visits(sources)
def stats() -> Stats:
    """Return the result of ``my.core.stat`` applied to the merged ``history``."""
    from my.core import stat

    merged_summary = stat(history)
    return {**merged_summary}

11
my/browser/common.py Normal file
View file

@ -0,0 +1,11 @@
import os
from my.core.util import __NOT_HPI_MODULE__
def _patch_browserexport_logs():
    """Set up browserexport's logger from ``HPI_LOGS``, if that variable is set.

    Does nothing (and imports nothing) when ``HPI_LOGS`` is absent from the
    environment.
    """
    requested_level = os.environ.get("HPI_LOGS")
    if requested_level is None:
        return

    # imported lazily so they are only needed when HPI_LOGS is present
    from browserexport.log import setup as setup_browserexport_logger
    from my.core.logging import mklevel

    setup_browserexport_logger(mklevel(requested_level))

50
my/browser/export.py Normal file
View file

@ -0,0 +1,50 @@
"""
Parses browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]]
"""
REQUIRES = ["browserexport"]
from my.config import browser as user_config
from my.core import Paths, dataclass
# Settings for this module, layered on top of the user's `browser.export` config
@dataclass
class config(user_config.export):
# path[s]/glob to your backed up browser history sqlite files
export_path: Paths
from pathlib import Path
from typing import Iterator, Sequence, List
from my.core import Stats, get_files, LazyLogger
from my.core.common import mcachew
from browserexport.merge import read_and_merge, Visit
from .common import _patch_browserexport_logs
logger = LazyLogger(__name__, level="warning")
_patch_browserexport_logs()
# all of my backed up databases
def inputs() -> Sequence[Path]:
    """Resolve ``config.export_path`` to database paths via ``get_files``."""
    configured = config.export_path
    return get_files(configured)
def _cachew_depends_on() -> List[str]:
    """Return the input paths as strings; passed as ``depends_on`` to ``mcachew``."""
    return list(map(str, inputs()))
@mcachew(depends_on=_cachew_depends_on, logger=logger)
def history() -> Iterator[Visit]:
    """Yield visits produced by ``read_and_merge`` over all input databases."""
    databases = inputs()
    yield from read_and_merge(databases)
def stats() -> Stats:
    """Return the result of ``my.core.stat`` applied to ``history``."""
    from my.core import stat

    export_summary = stat(history)
    return {**export_summary}

View file

@ -129,3 +129,9 @@ class fbmessenger:
class twitter: class twitter:
class talon: class talon:
export_path: Paths export_path: Paths
# browser history config; both export_path values default to an empty string here
class browser:
# read by my.browser.export
class export:
export_path: Paths = ''
# read by my.browser.active_browser
class active_browser:
export_path: Paths = ''

View file

@ -84,6 +84,7 @@ commands =
commands = commands =
pip install -e .[testing,optional] pip install -e .[testing,optional]
hpi module install my.browser.export
hpi module install my.orgmode hpi module install my.orgmode
hpi module install my.endomondo hpi module install my.endomondo
hpi module install my.github.ghexport hpi module install my.github.ghexport
@ -103,6 +104,7 @@ commands =
# todo fuck. -p my.github isn't checking the subpackages?? wtf... # todo fuck. -p my.github isn't checking the subpackages?? wtf...
# guess it wants .pyi file?? # guess it wants .pyi file??
python3 -m mypy --install-types --non-interactive \ python3 -m mypy --install-types --non-interactive \
-p my.browser \
-p my.endomondo \ -p my.endomondo \
-p my.github.ghexport \ -p my.github.ghexport \
-p my.hypothesis \ -p my.hypothesis \