twitter.talon: expand URLs
This commit is contained in:
parent
946daf40d0
commit
ef120bc643
2 changed files with 23 additions and 1 deletions
|
@ -83,4 +83,10 @@ check 'It would be a really good time for countries'
|
||||||
# https://twitter.com/karlicoss/status/1530303537476947968
|
# https://twitter.com/karlicoss/status/1530303537476947968
|
||||||
check 'so there is clearly a pattern'
|
check 'so there is clearly a pattern'
|
||||||
|
|
||||||
|
|
||||||
|
# https://twitter.com/karlicoss/status/1488942357303238673
|
||||||
|
# check URL expansion for Talon
|
||||||
|
check '2022-02-02 Wed 18:28.*You are in luck!.*https://deepmind.com/blog/article/Competitive-programming-with-AlphaCode'
|
||||||
|
|
||||||
|
|
||||||
# TODO check likes as well
|
# TODO check likes as well
|
||||||
|
|
|
@ -5,6 +5,7 @@ from __future__ import annotations
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
import re
|
||||||
from typing import Iterator, Sequence, Optional, Dict
|
from typing import Iterator, Sequence, Optional, Dict
|
||||||
|
|
||||||
import pytz
|
import pytz
|
||||||
|
@ -98,12 +99,27 @@ def _parse_tweet(row) -> Tweet:
|
||||||
# and it's created here, so looks like it's properly parsed from the api
|
# and it's created here, so looks like it's properly parsed from the api
|
||||||
# https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79
|
# https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79
|
||||||
created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc)
|
created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc)
|
||||||
|
text = row['text']
|
||||||
|
|
||||||
|
# try expanding URLs.. sadly there are no positions in the db
|
||||||
|
urls = row['other_url'].split()
|
||||||
|
if len(urls) > 0:
|
||||||
|
ellipsis = '...'
|
||||||
|
# might have something collapsed
|
||||||
|
# e.g. deepmind.com/blog/article/Comp...
|
||||||
|
# NOTE: need one character of lookahead to split on ellipsis.. hence ?=
|
||||||
|
for short in re.findall(r'(?:^|\s)([\S]+)' + re.escape(ellipsis) + r'(?=\s|$)', text):
|
||||||
|
for full in urls:
|
||||||
|
if short in full:
|
||||||
|
text = text.replace(short + ellipsis, full)
|
||||||
|
break
|
||||||
|
#
|
||||||
|
|
||||||
return Tweet(
|
return Tweet(
|
||||||
id_str=str(row['tweet_id']),
|
id_str=str(row['tweet_id']),
|
||||||
created_at=created_at,
|
created_at=created_at,
|
||||||
screen_name=row['screen_name'],
|
screen_name=row['screen_name'],
|
||||||
text=row['text'],
|
text=text,
|
||||||
# todo hmm text sometimes is trimmed with ellipsis? at least urls
|
# todo hmm text sometimes is trimmed with ellipsis? at least urls
|
||||||
urls=tuple(u for u in row['other_url'].split(' ') if len(u.strip()) > 0),
|
urls=tuple(u for u in row['other_url'].split(' ') if len(u.strip()) > 0),
|
||||||
)
|
)
|
||||||
|
|
Loading…
Add table
Reference in a new issue