instagram.gdpr: use ZipPath instead of adhoc zipfile methods

This allows the module to be used regardless of whether the GDPR archive is packed or unpacked.
This commit is contained in:
Dima Gerasimov 2022-04-15 11:59:26 +01:00 committed by karlicoss
parent 7c0f304f94
commit 706ec03a3f
2 changed files with 28 additions and 19 deletions

View file

@ -164,6 +164,15 @@ class ZipPath(ZipPathBase):
assert self.root == other.root, (self.root, other.root) assert self.root == other.root, (self.root, other.root)
return Path(self.at).relative_to(Path(other.at)) return Path(self.at).relative_to(Path(other.at))
@property
def parts(self) -> Sequence[str]:
    """Return the components of ``self.filename`` followed by those of ``self.at``."""
    # messy, but might be ok..
    outer_parts = Path(self.filename).parts
    inner_parts = Path(self.at).parts
    return outer_parts + inner_parts
@property
def stem(self) -> str:
    """Return the final component of ``self.at`` without its last suffix."""
    inner_path = Path(self.at)
    return inner_path.stem
@property # type: ignore[misc] @property # type: ignore[misc]
def __class__(self): def __class__(self):
return Path return Path

View file

@ -56,14 +56,25 @@ def _decode(s: str) -> str:
import json import json
from typing import Union from typing import Union
from ..core.kompress import kopen
from ..core.error import Res from ..core.error import Res
from ..core.structure import match_structure
def _entities() -> Iterator[Res[Union[User, _Message]]]: def _entities() -> Iterator[Res[Union[User, _Message]]]:
last = max(inputs()) from ..core.kompress import ZipPath
last = ZipPath(max(inputs()))
# TODO make sure it works both with a packed archive and with a plain directory
# ideally get_files should return the right thing, and we won't have to force ZipPath/match_structure here
# e.g. possible options are:
# - if packed things are detected, just return ZipPath
# - if packed things are detected, possibly return match_structure_wrapper
# it might be a bit tricky because it's a context manager -- who will recycle it?
# - if unpacked things are detected, just return the dir as it is
# (possibly detect them via match_structure? e.g. what if we have a bunch of unpacked dirs)
#
# I guess the goal for core.structure module was to pass it to other functions that expect unpacked structure
# https://github.com/karlicoss/HPI/pull/175
# whereas here I don't need it..
# so for now will just implement this adhoc thing and think about properly fixing later
with kopen(last, 'account_information/personal_information.json') as fo: j = json.loads((last / 'account_information/personal_information.json').read_text())
j = json.load(fo)
[profile] = j['profile_user'] [profile] = j['profile_user']
pdata = profile['string_map_data'] pdata = profile['string_map_data']
username = pdata['Username']['value'] username = pdata['Username']['value']
@ -78,26 +89,15 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
) )
yield self_user yield self_user
# TODO maybe move it to kompress/match_structure? files = list(last.rglob('messages/inbox/*/message_*.json'))
# would be nice to support it without unpacking
# I guess the goal for core.structure module was to pass it to other functions that expect unpacked structure
# https://github.com/karlicoss/HPI/pull/175
# whereas here I don't need it..
# so for now will just implement this adhoc thing and think about properly fixing later
from zipfile import ZipFile
z = ZipFile(last)
files = [Path(p) for p in z.namelist() if Path(p).match('messages/inbox/*/message_*.json')]
assert len(files) > 0, last assert len(files) > 0, last
buckets = bucket(files, key=lambda p: p.parts[2]) buckets = bucket(files, key=lambda p: p.parts[-2])
file_map = {k: list(buckets[k]) for k in buckets} file_map = {k: list(buckets[k]) for k in buckets}
for fname, ffiles in file_map.items(): for fname, ffiles in file_map.items():
# sort by file number (.../message_<number>.json)
for ffile in sorted(ffiles, key=lambda p: int(p.stem.split('_')[-1])): for ffile in sorted(ffiles, key=lambda p: int(p.stem.split('_')[-1])):
with kopen(last, str(ffile)) as fo: j = json.loads(ffile.read_text())
j = json.load(fo)
id_len = 10 id_len = 10
# NOTE: no match in android db/api responses? # NOTE: no match in android db/api responses?