instagram.gdpr: use ZipPath instead of adhoc zipfile methods
this allows using the module regardless of whether the GDPR archive is packed or unpacked
This commit is contained in:
parent
7c0f304f94
commit
706ec03a3f
2 changed files with 28 additions and 19 deletions
|
@ -164,6 +164,15 @@ class ZipPath(ZipPathBase):
|
|||
assert self.root == other.root, (self.root, other.root)
|
||||
return Path(self.at).relative_to(Path(other.at))
|
||||
|
||||
@property
|
||||
def parts(self) -> Sequence[str]:
|
||||
# messy, but might be ok..
|
||||
return Path(self.filename).parts + Path(self.at).parts
|
||||
|
||||
@property
|
||||
def stem(self) -> str:
|
||||
return Path(self.at).stem
|
||||
|
||||
@property # type: ignore[misc]
|
||||
def __class__(self):
|
||||
return Path
|
||||
|
|
|
@ -56,14 +56,25 @@ def _decode(s: str) -> str:
|
|||
|
||||
import json
|
||||
from typing import Union
|
||||
from ..core.kompress import kopen
|
||||
from ..core.error import Res
|
||||
from ..core.structure import match_structure
|
||||
def _entities() -> Iterator[Res[Union[User, _Message]]]:
|
||||
last = max(inputs())
|
||||
from ..core.kompress import ZipPath
|
||||
last = ZipPath(max(inputs()))
|
||||
# TODO make sure it works with a plain (unpacked) directory as well
|
||||
# ideally get_files should return the right thing, and we won't have to force ZipPath/match_structure here
|
||||
# e.g. possible options are:
|
||||
# - if packed things are detected, just return ZipPath
|
||||
# - if packed things are detected, possibly return match_structure_wrapper
|
||||
# it might be a bit tricky because it's a context manager -- who will recycle it?
|
||||
# - if unpacked things are detected, just return the dir as it is
|
||||
# (possibly detect them via match_structure? e.g. what if we have a bunch of unpacked dirs)
|
||||
#
|
||||
# I guess the goal for core.structure module was to pass it to other functions that expect unpacked structure
|
||||
# https://github.com/karlicoss/HPI/pull/175
|
||||
# whereas here I don't need it..
|
||||
# so for now will just implement this adhoc thing and think about properly fixing later
|
||||
|
||||
with kopen(last, 'account_information/personal_information.json') as fo:
|
||||
j = json.load(fo)
|
||||
j = json.loads((last / 'account_information/personal_information.json').read_text())
|
||||
[profile] = j['profile_user']
|
||||
pdata = profile['string_map_data']
|
||||
username = pdata['Username']['value']
|
||||
|
@ -78,26 +89,15 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
|
|||
)
|
||||
yield self_user
|
||||
|
||||
# TODO maybe move it to kompress/match_structure?
|
||||
# would be nice to support it without unpacking
|
||||
# I guess the goal for core.structure module was to pass it to other functions that expect unpacked structure
|
||||
# https://github.com/karlicoss/HPI/pull/175
|
||||
# whereas here I don't need it..
|
||||
# so for now will just implement this adhoc thing and think about properly fixing later
|
||||
|
||||
from zipfile import ZipFile
|
||||
z = ZipFile(last)
|
||||
files = [Path(p) for p in z.namelist() if Path(p).match('messages/inbox/*/message_*.json')]
|
||||
files = list(last.rglob('messages/inbox/*/message_*.json'))
|
||||
assert len(files) > 0, last
|
||||
|
||||
buckets = bucket(files, key=lambda p: p.parts[2])
|
||||
buckets = bucket(files, key=lambda p: p.parts[-2])
|
||||
file_map = {k: list(buckets[k]) for k in buckets}
|
||||
|
||||
for fname, ffiles in file_map.items():
|
||||
# sort by file number (.../message_<number>.json)
|
||||
for ffile in sorted(ffiles, key=lambda p: int(p.stem.split('_')[-1])):
|
||||
with kopen(last, str(ffile)) as fo:
|
||||
j = json.load(fo)
|
||||
j = json.loads(ffile.read_text())
|
||||
|
||||
id_len = 10
|
||||
# NOTE: no match in android db/api responses?
|
||||
|
|
Loading…
Add table
Reference in a new issue