twitter: prefer archive data over twidump for tweets

also add a script to check twitter data
This commit is contained in:
Dima Gerasimov 2022-05-30 21:40:50 +01:00 committed by karlicoss
parent bb4c77612b
commit 946daf40d0
3 changed files with 91 additions and 1 deletions

86
misc/check-twitter.sh Executable file
View file

@ -0,0 +1,86 @@
#!/bin/bash
# just a hacky script to check twitter module behaviour w.r.t. merging and normalising data
# this checks against orger output for @karlicoss data
# abort on the first failing command (-e) and on any use of an unset variable (-u)
set -eu
# path of the file to check (orger output); when the argument is missing, fail
# with a usage hint instead of bash's bare "unbound variable" error
FILE="${1:?usage: $0 <orger-output-file>}"
# Check that the regex in $1 matches exactly one line of "$FILE".
# Prints "FAILED! <pattern>" to stdout otherwise. The script is not aborted, so
# all failing checks are reported in a single run.
#   $1 - ripgrep regex that is expected to match exactly one line
function check() {
    # keep the pattern local so it doesn't leak into the global scope
    local pattern="$1"
    # 'rg --count' prints the number of matching lines, and nothing at all when
    # there are none; an empty result fails the comparison just like "0" or "2" would.
    # -e / -- make sure neither the pattern nor the path can be parsed as an rg flag.
    if [[ "$(rg --count -e "$pattern" -- "$FILE")" != "1" ]]; then
        echo "FAILED! $pattern"
    fi
}
# only in old twitter archive data + test mentions
check '2010-03-24 Wed 10:02.*@GDRussia подлагивает'
# check that old twitter archive data replaces &lt/&gt
check '2011-05-12 Thu 17:51.*set ><'
# this would probs be from twint or something?
check '2013-06-01 Sat 18:48.*<inputfile'
# https://twitter.com/karlicoss/status/363703394201894912
# the quoted acc was suspended and the tweet is only present in archives?
check '2013-08-03 Sat 16:50.*удивительно, как в одном человеке'
# similar
# https://twitter.com/karlicoss/status/712186968382291968
check '2016-03-22 Tue 07:59.*Очень хорошо'
# RTs are missing from twint
# https://twitter.com/karlicoss/status/925968541458759681
check '2017-11-02 Thu 06:11.*RT @dabeaz: A short esoteric Python'
# twint stopped updating at this point
# https://twitter.com/karlicoss/status/1321488603499954177
check '2020-10-28 Wed 16:26.*@jborichevskiy I feel like for me'
# https://twitter.com/karlicoss/status/808769414984331267
# archive doesn't expand links in 'text' by default, check we're doing that in HPI
# NOTE: hmm twint adds an extra whitespace here before the link?
check '2016-12-13 Tue 20:23.*TIL:.*pypi.python.org/pypi/coloredlogs'
# https://twitter.com/karlicoss/status/472151454044917761
# archive isn't expanding images by default
check '2014-05-29 Thu 23:04.*Выколол сингулярность.*pic.twitter.com/M6XRN1n7KW'
# https://twitter.com/karlicoss/status/565648186816335873
# for some reason missing from twint??
check '2015-02-11 Wed 23:06.*separation confirmed'
# mentions were missing from twint at some point, check they are still present..
# https://twitter.com/karlicoss/status/1228225797283966976
check '2020-02-14 Fri 07:53.*thomas536.*looks like a very cool blog'
# just a random timestamp check. RT then reply shortly after -- good check.
# https://twitter.com/karlicoss/status/341512959694082049
check '2013-06-03 Mon 11:13.*RT @osenin'
# https://twitter.com/karlicoss/status/341513515749736448
check '2013-06-03 Mon 11:15.*@osenin'
# def was tweeted at 00:00 MSK, so a good timezone check
# id 550396141914058752
check '2014-12-31 Wed 21:00.*2015 заебал'
# for some reason is gone, and wasn't in twidump/twint
# https://twitter.com/karlicoss/status/1393312193945513985
check '2021-05-14 Fri 21:08.*RT @SNunoPerez: Me explaining Rage.*'
# make sure there is a single occurrence (hence, correct tzs)
check 'A short esoteric Python'
# https://twitter.com/karlicoss/status/1499174823272099842
check 'It would be a really good time for countries'
# https://twitter.com/karlicoss/status/1530303537476947968
check 'so there is clearly a pattern'
# TODO check likes as well

View file

@ -35,13 +35,15 @@ def _likes_archive() -> Iterator[Res[Tweet]]:
def tweets() -> Iterator[Res[Tweet]]:
# for tweets, archive data is higher quality
yield from merge_tweets(
_tweets_twint(),
_tweets_archive(),
_tweets_twint(),
)
def likes() -> Iterator[Res[Tweet]]:
# for likes, archive data barely has anything so twint is preferred
yield from merge_tweets(
_likes_twint(),
_likes_archive(),

View file

@ -124,3 +124,5 @@ def likes() -> Iterator[Res[Tweet]]:
elif isinstance(x, _IsFavorire):
yield x.tweet
# TODO maybe should combine all public iterators into a stats()