From ef120bc6435de77a255590ed90661599df2f7812 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 30 May 2022 23:20:35 +0100 Subject: [PATCH] twitter.talon: expand URLs --- misc/check-twitter.sh | 6 ++++++ my/twitter/talon.py | 18 +++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/misc/check-twitter.sh b/misc/check-twitter.sh index f5f26ce..d4cf830 100755 --- a/misc/check-twitter.sh +++ b/misc/check-twitter.sh @@ -83,4 +83,10 @@ check 'It would be a really good time for countries' # https://twitter.com/karlicoss/status/1530303537476947968 check 'so there is clearly a pattern' + +# https://twitter.com/karlicoss/status/1488942357303238673 +# check URL expansion for Talon +check '2022-02-02 Wed 18:28.*You are in luck!.*https://deepmind.com/blog/article/Competitive-programming-with-AlphaCode' + + # TODO check likes as well diff --git a/my/twitter/talon.py b/my/twitter/talon.py index f540d14..175a3fe 100644 --- a/my/twitter/talon.py +++ b/my/twitter/talon.py @@ -5,6 +5,7 @@ from __future__ import annotations from dataclasses import dataclass from datetime import datetime +import re from typing import Iterator, Sequence, Optional, Dict import pytz @@ -98,12 +99,27 @@ def _parse_tweet(row) -> Tweet: # and it's created here, so looks like it's properly parsed from the api # https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79 created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc) + text = row['text'] + + # try expanding URLs.. sadly there are no positions in the db + urls = row['other_url'].split() + if len(urls) > 0: + ellipsis = '...' + # might have something collapsed + # e.g. deepmind.com/blog/article/Comp... + # NOTE: need one character of lookahead to split on ellipsis.. 
hence ?= + for short in re.findall(r'(?:^|\s)([\S]+)' + re.escape(ellipsis) + r'(?=\s|$)', text): + for full in urls: + if short in full: + text = text.replace(short + ellipsis, full) + break + # return Tweet( id_str=str(row['tweet_id']), created_at=created_at, screen_name=row['screen_name'], - text=row['text'], + text=text, # todo hmm text sometimes is trimmed with ellipsis? at least urls urls=tuple(u for u in row['other_url'].split(' ') if len(u.strip()) > 0), )