HPI/misc/check-twitter.sh
Kian-Meng Ang d2ef23fcb4 docs: fix typos
found via `codespell -L copie,datas,pres,fo,tooks,noo,ue,ket,frop`
2023-03-27 03:02:35 +01:00

105 lines
3.7 KiB
Bash
Executable file

#!/bin/bash
# Hacky script to check the twitter module's behaviour w.r.t. merging and normalising data.
# The checks are written against orger output for @karlicoss data.
#
# Usage: check-twitter.sh FILE
#   FILE - the file that is searched by the checks in this script

# -e: stop on an unhandled command failure; -u: treat unset variables as errors.
set -eu

# Abort with a usage hint when no (or an empty) FILE argument is given, rather
# than with the bare "unbound variable" error that `set -u` would produce.
FILE="${1:?usage: $0 FILE}"
#######################################
# Assert that exactly one line of "$FILE" matches the given regex.
# Globals:   FILE (read) - path handed to ripgrep as the file to search
# Arguments: $1 - regular expression, passed to `rg` as the pattern
# Outputs:   writes "FAILED! <regex>" to stdout when the count reported by
#            `rg --count` is anything other than exactly "1"
# Returns:   0 even when the check fails, so that under `set -e` one failed
#            check does not prevent the remaining ones from running
#######################################
check() {
  local pattern="$1"
  local count
  # `--` ends option parsing so that a regex starting with '-' is not taken
  # for an rg flag. rg exits non-zero when nothing matches; `|| true` keeps
  # that from tripping `set -e` and leaves the decision to the comparison below.
  count=$(rg --count -- "$pattern" "$FILE") || true
  if [[ "$count" != "1" ]]; then
    printf '%s\n' "FAILED! $pattern"
  fi
}
# Regexes handed to check(), one call per entry, in the order listed here.
expectations=(
  # present only in the old twitter archive data; also tests mentions
  '2010-03-24 Wed 10:02.*@GDRussia подлагивает'

  # old twitter archive data should have &lt/&gt replaced
  '2011-05-12 Thu 17:51.*set ><'

  # this one probably comes from twint or something?
  '2013-06-01 Sat 18:48.*<inputfile'

  # https://twitter.com/karlicoss/status/363703394201894912
  # the quoted account was suspended and the tweet is only present in archives?
  '2013-08-03 Sat 16:50.*удивительно, как в одном человеке'

  # similar case
  # https://twitter.com/karlicoss/status/712186968382291968
  '2016-03-22 Tue 07:59.*Очень хорошо'

  # RTs are missing from twint
  # https://twitter.com/karlicoss/status/925968541458759681
  '2017-11-02 Thu 06:11.*RT @dabeaz: A short esoteric Python'

  # twint stopped updating at this point
  # https://twitter.com/karlicoss/status/1321488603499954177
  '2020-10-28 Wed 16:26.*@jborichevskiy I feel like for me'

  # https://twitter.com/karlicoss/status/808769414984331267
  # the archive doesn't expand links in 'text' by default, check that HPI does
  # NOTE: hmm, twint adds an extra whitespace here before the link?
  '2016-12-13 Tue 20:23.*TIL:.*pypi.python.org/pypi/coloredlogs'

  # https://twitter.com/karlicoss/status/472151454044917761
  # the archive isn't expanding images by default
  '2014-05-29 Thu 23:04.*Выколол сингулярность.*pic.twitter.com/M6XRN1n7KW'

  # https://twitter.com/karlicoss/status/565648186816335873
  # for some reason missing from twint??
  '2015-02-11 Wed 23:06.*separation confirmed'

  # mentions were missing from twint at some point, make sure they are still present
  # https://twitter.com/karlicoss/status/1228225797283966976
  '2020-02-14 Fri 07:53.*thomas536.*looks like a very cool blog'

  # random timestamp check: an RT followed by a reply shortly after -- good check
  # https://twitter.com/karlicoss/status/341512959694082049
  '2013-06-03 Mon 11:13.*RT @osenin'
  # https://twitter.com/karlicoss/status/341513515749736448
  '2013-06-03 Mon 11:15.*@osenin'

  # definitely tweeted at 00:00 MSK, so a good timezone check
  # id 550396141914058752
  '2014-12-31 Wed 21:00.*2015 заебал'

  # gone for some reason, and wasn't in twidump/twint
  # https://twitter.com/karlicoss/status/1393312193945513985
  '2021-05-14 Fri 21:08.*RT @SNunoPerez: Me explaining Rage.*'

  # must occur only once (hence, timezones are correct)
  'A short esoteric Python'

  # https://twitter.com/karlicoss/status/1499174823272099842
  'It would be a really good time for countries'
  # https://twitter.com/karlicoss/status/1530303537476947968
  'so there is clearly a pattern'

  # https://twitter.com/karlicoss/status/1488942357303238673
  # URL expansion check for Talon
  '2022-02-02 Wed 18:28.*You are in luck!.*https://deepmind.com/blog/article/Competitive-programming-with-AlphaCode'

  # https://twitter.com/karlicoss/status/349168455964033024
  # link which is only in twidump
  '2013-06-24 Mon 14:13.*RT @gorod095: Нашел недавно в букинист'

  # some older statuses, useful to test that all input data is properly detected
  '2010-04-01 Thu 11:34'
  '2010-06-28 Mon 23:42'

  # https://twitter.com/karlicoss/status/22916704915
  # weird one: it just disappeared for no reason between 2021-12-22 and 2022-03-15,
  # and the account isn't suspended etc. maybe it was temporarily private or something?
  '2010-09-03 Fri 20:11.*Джобс'
)

for expected in "${expectations[@]}"; do
  check "$expected"
done

# TODO check likes as well