twitter.talon: expand URLs

This commit is contained in:
Dima Gerasimov 2022-05-30 23:20:35 +01:00 committed by karlicoss
parent 946daf40d0
commit ef120bc643
2 changed files with 23 additions and 1 deletions

View file

@ -83,4 +83,10 @@ check 'It would be a really good time for countries'
# https://twitter.com/karlicoss/status/1530303537476947968 # https://twitter.com/karlicoss/status/1530303537476947968
check 'so there is clearly a pattern' check 'so there is clearly a pattern'
# https://twitter.com/karlicoss/status/1488942357303238673
# check URL expansion for Talon
check '2022-02-02 Wed 18:28.*You are in luck!.*https://deepmind.com/blog/article/Competitive-programming-with-AlphaCode'
# TODO check likes as well # TODO check likes as well

View file

@ -5,6 +5,7 @@ from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime from datetime import datetime
import re
from typing import Iterator, Sequence, Optional, Dict from typing import Iterator, Sequence, Optional, Dict
import pytz import pytz
@ -98,12 +99,27 @@ def _parse_tweet(row) -> Tweet:
# and it's created here, so looks like it's properly parsed from the api # and it's created here, so looks like it's properly parsed from the api
# https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79 # https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79
created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc) created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc)
text = row['text']
# try expanding URLs.. sadly there are no positions in the db
urls = row['other_url'].split()
if len(urls) > 0:
ellipsis = '...'
# might have something collapsed
# e.g. deepmind.com/blog/article/Comp...
# NOTE: need one character of lookahead to split on ellipsis.. hence ?=
for short in re.findall(r'(?:^|\s)([\S]+)' + re.escape(ellipsis) + r'(?=\s|$)', text):
for full in urls:
if short in full:
text = text.replace(short + ellipsis, full)
break
#
return Tweet( return Tweet(
id_str=str(row['tweet_id']), id_str=str(row['tweet_id']),
created_at=created_at, created_at=created_at,
screen_name=row['screen_name'], screen_name=row['screen_name'],
text=row['text'], text=text,
# todo hmm text sometimes is trimmed with ellipsis? at least urls # todo hmm text sometimes is trimmed with ellipsis? at least urls
urls=tuple(u for u in row['other_url'].split(' ') if len(u.strip()) > 0), urls=tuple(u for u in row['other_url'].split(' ') if len(u.strip()) > 0),
) )