From ef120bc6435de77a255590ed90661599df2f7812 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov <karlicoss@gmail.com>
Date: Mon, 30 May 2022 23:20:35 +0100
Subject: [PATCH] twitter.talon: expand URLs

---
 misc/check-twitter.sh |  6 ++++++
 my/twitter/talon.py   | 18 +++++++++++++++++-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/misc/check-twitter.sh b/misc/check-twitter.sh
index f5f26ce..d4cf830 100755
--- a/misc/check-twitter.sh
+++ b/misc/check-twitter.sh
@@ -83,4 +83,10 @@ check 'It would be a really good time for countries'
 # https://twitter.com/karlicoss/status/1530303537476947968
 check 'so there is clearly a pattern'
 
+
+# https://twitter.com/karlicoss/status/1488942357303238673
+# check URL expansion for Talon
+check '2022-02-02 Wed 18:28.*You are in luck!.*https://deepmind.com/blog/article/Competitive-programming-with-AlphaCode'
+
+
 # TODO check likes as well
diff --git a/my/twitter/talon.py b/my/twitter/talon.py
index f540d14..175a3fe 100644
--- a/my/twitter/talon.py
+++ b/my/twitter/talon.py
@@ -5,6 +5,7 @@ from __future__ import annotations
 
 from dataclasses import dataclass
 from datetime import datetime
+import re
 from typing import Iterator, Sequence, Optional, Dict
 
 import pytz
@@ -98,12 +99,27 @@ def _parse_tweet(row) -> Tweet:
     # and it's created here, so looks like it's properly parsed from the api
     # https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79
     created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc)
+    text = row['text']
+
+    # try expanding URLs.. sadly there are no positions in the db
+    urls = row['other_url'].split()
+    if len(urls) > 0:
+        ellipsis = '...'
+        # might have something collapsed
+        # e.g. deepmind.com/blog/article/Comp...
+        # NOTE: need one character of lookahead to split on ellipsis.. hence ?=
+        for short in re.findall(r'(?:^|\s)([\S]+)' + re.escape(ellipsis) + r'(?=\s|$)', text):
+            for full in urls:
+                if short in full:
+                    text = text.replace(short + ellipsis, full)
+                    break
+    #
 
     return Tweet(
         id_str=str(row['tweet_id']),
         created_at=created_at,
         screen_name=row['screen_name'],
-        text=row['text'],
+        text=text,
         # todo hmm text sometimes is trimmed with ellipsis? at least urls
         urls=tuple(u for u in row['other_url'].split(' ') if len(u.strip()) > 0),
     )