twitter.talon: expand URLs

2022-05-30 23:20:35 +01:00 · 2022-05-30 23:20:35 +01:00 · ef120bc643
commit ef120bc643
parent 946daf40d0
2 changed files with 23 additions and 1 deletions
--- a/misc/check-twitter.sh
+++ b/misc/check-twitter.sh
@ -83,4 +83,10 @@ check 'It would be a really good time for countries'
 # https://twitter.com/karlicoss/status/1530303537476947968
 check 'so there is clearly a pattern'
 # https://twitter.com/karlicoss/status/1488942357303238673
 # check URL expansion for Talon
 check '2022-02-02 Wed 18:28.*You are in luck!.*https://deepmind.com/blog/article/Competitive-programming-with-AlphaCode'
 # TODO check likes as well
--- a/my/twitter/talon.py
+++ b/my/twitter/talon.py
@ -5,6 +5,7 @@ from __future__ import annotations
 from dataclasses import dataclass
 from datetime import datetime
 import re
 from typing import Iterator, Sequence, Optional, Dict
 import pytz
@ -98,12 +99,27 @@ def _parse_tweet(row) -> Tweet:
    # and it's created here, so looks like it's properly parsed from the api
    # https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79
    created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc)
    text = row['text']
    # try expanding URLs.. sadly there are no positions in the db
    urls = row['other_url'].split()
    if len(urls) > 0:
        ellipsis = '...'
        # might have something collapsed
        # e.g. deepmind.com/blog/article/Comp...
        # NOTE: need one character of lookahead to split on ellipsis.. hence ?=
        for short in re.findall(r'(?:^|\s)([\S]+)' + re.escape(ellipsis) + r'(?=\s|$)', text):
            for full in urls:
                if short in full:
                    text = text.replace(short + ellipsis, full)
                    break
    #
    return Tweet(
        id_str=str(row['tweet_id']),
        created_at=created_at,
        screen_name=row['screen_name'],
-        text=row['text'],
+        text=text,
        # todo hmm text sometimes is trimmed with ellipsis? at least urls
        urls=tuple(u for u in row['other_url'].split(' ') if len(u.strip()) > 0),
    )