instagram.gdpr: use ZipPath instead of adhoc zipfile methods

This makes the module agnostic to whether the gdpr archive is packed or unpacked.
commit 706ec03a3f
parent 7c0f304f94

2 changed files with 28 additions and 19 deletions
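The gist of the change: once the input is wrapped in ZipPath (which lives in core.kompress, per the import below), the rest of the instagram.gdpr module can use the ordinary pathlib-style API (`/`, read_text, rglob) without caring whether the export is a zip archive or an unpacked directory. A minimal sketch of the idea — the `personal_info` helper and the suffix check are illustrative, not part of the commit:

from pathlib import Path

from my.core.kompress import ZipPath

def personal_info(export: Path) -> str:
    # hypothetical helper: `export` may point at a .zip archive or at an
    # unpacked directory; after the optional ZipPath wrap, both respond
    # to the same `/` and read_text calls
    root = ZipPath(export) if export.suffix == '.zip' else export
    return (root / 'account_information/personal_information.json').read_text()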
@@ -164,6 +164,15 @@ class ZipPath(ZipPathBase):
         assert self.root == other.root, (self.root, other.root)
         return Path(self.at).relative_to(Path(other.at))

+    @property
+    def parts(self) -> Sequence[str]:
+        # messy, but might be ok..
+        return Path(self.filename).parts + Path(self.at).parts
+
+    @property
+    def stem(self) -> str:
+        return Path(self.at).stem
+
     @property  # type: ignore[misc]
     def __class__(self):
         return Path
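The new `parts` property glues together the parts of the zip file's own path (`self.filename`) and of the member path inside the archive (`self.at`); `stem` only needs the inner path. A worked example of what they compute, using plain pathlib (the archive location and member path are made up); note also that the pre-existing `__class__` property returning Path is what lets `isinstance(p, Path)` checks keep passing for ZipPath objects:

from pathlib import Path

zip_file = '/backups/export.zip'                # hypothetical archive path (self.filename)
at = 'messages/inbox/alice_123/message_1.json'  # member path inside the archive (self.at)

# what ZipPath.parts returns: archive path parts + member path parts
assert Path(zip_file).parts + Path(at).parts == (
    '/', 'backups', 'export.zip',
    'messages', 'inbox', 'alice_123', 'message_1.json',
)
# what ZipPath.stem returns: the member's filename without its suffix
assert Path(at).stem == 'message_1'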
@@ -56,14 +56,25 @@ def _decode(s: str) -> str:

 import json
 from typing import Union
-from ..core.kompress import kopen
 from ..core.error import Res
-from ..core.structure import match_structure
 def _entities() -> Iterator[Res[Union[User, _Message]]]:
-    last = max(inputs())
+    from ..core.kompress import ZipPath
+    last = ZipPath(max(inputs()))
+    # TODO make sure it works both with plain directory
+    # ideally get_files should return the right thing, and we won't have to force ZipPath/match_structure here
+    # e.g. possible options are:
+    # - if packed things are detected, just return ZipPath
+    # - if packed things are detected, possibly return match_structure_wrapper
+    #   it might be a bit tricky because it's a context manager -- who will recycle it?
+    # - if unpacked things are detected, just return the dir as it is
+    #   (possibly detect them via match_structure? e.g. what if we have a bunch of unpacked dirs)
+    #
+    # I guess the goal for core.structure module was to pass it to other functions that expect unpacked structure
+    # https://github.com/karlicoss/HPI/pull/175
+    # whereas here I don't need it..
+    # so for now will just implement this adhoc thing and think about properly fixing later

-    with kopen(last, 'account_information/personal_information.json') as fo:
-        j = json.load(fo)
+    j = json.loads((last / 'account_information/personal_information.json').read_text())
     [profile] = j['profile_user']
     pdata = profile['string_map_data']
     username = pdata['Username']['value']
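Replacing the kopen file handle with `(last / ...).read_text()` works because ZipPath subclasses ZipPathBase (see the first hunk), which appears to be the stdlib zipfile.Path; that class supports `/` and read_text just like pathlib. A self-contained demo of the same access pattern against a throwaway archive (file contents fabricated for the demo):

import json
import zipfile
from pathlib import Path
from tempfile import TemporaryDirectory

with TemporaryDirectory() as td:
    zpath = Path(td) / 'export.zip'
    # fabricate a tiny archive shaped like the gdpr export
    with zipfile.ZipFile(zpath, 'w') as zf:
        zf.writestr(
            'account_information/personal_information.json',
            json.dumps({'profile_user': [{'string_map_data': {'Username': {'value': 'alice'}}}]}),
        )
    last = zipfile.Path(zpath)  # stands in for HPI's ZipPath here
    j = json.loads((last / 'account_information/personal_information.json').read_text())
    [profile] = j['profile_user']
    assert profile['string_map_data']['Username']['value'] == 'alice'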
@@ -78,26 +89,15 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
     )
     yield self_user

-    # TODO maybe move it to kompress/match_structure?
-    # would be nice to support it without unpacking
-    # I guess the goal for core.structure module was to pass it to other functions that expect unpacked structure
-    # https://github.com/karlicoss/HPI/pull/175
-    # whereas here I don't need it..
-    # so for now will just implement this adhoc thing and think about properly fixing later
-
-    from zipfile import ZipFile
-    z = ZipFile(last)
-    files = [Path(p) for p in z.namelist() if Path(p).match('messages/inbox/*/message_*.json')]
+    files = list(last.rglob('messages/inbox/*/message_*.json'))
     assert len(files) > 0, last

-    buckets = bucket(files, key=lambda p: p.parts[2])
+    buckets = bucket(files, key=lambda p: p.parts[-2])
     file_map = {k: list(buckets[k]) for k in buckets}

     for fname, ffiles in file_map.items():
-        # sort by file number (.../message_<number>.json)
         for ffile in sorted(ffiles, key=lambda p: int(p.stem.split('_')[-1])):
-            with kopen(last, str(ffile)) as fo:
-                j = json.load(fo)
+            j = json.loads(ffile.read_text())

             id_len = 10
             # NOTE: no match in android db/api responses?
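The index change from `parts[2]` to `parts[-2]` follows from the new `parts` semantics: the old zip-relative paths (`messages/inbox/<thread>/message_<number>.json`) had the thread directory at index 2, while ZipPath paths are now prefixed with the archive's own components, so only indexing from the end stays stable. A sketch of the grouping and numeric ordering (file names invented; `bucket` is more_itertools.bucket, which the module already uses):

from pathlib import Path
from more_itertools import bucket

files = [
    Path('/backups/export.zip/messages/inbox/alice_123/message_2.json'),
    Path('/backups/export.zip/messages/inbox/alice_123/message_10.json'),
    Path('/backups/export.zip/messages/inbox/bob_456/message_1.json'),
]
# group by the per-thread directory, i.e. the second-to-last path component
buckets = bucket(files, key=lambda p: p.parts[-2])
file_map = {k: list(buckets[k]) for k in buckets}
for thread, ffiles in file_map.items():
    # numeric sort on the message_<number> suffix: message_10 must come
    # after message_2, which lexicographic order would get wrong
    ordered = sorted(ffiles, key=lambda p: int(p.stem.split('_')[-1]))
    print(thread, [p.name for p in ordered])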