keep both splitting types

This commit is contained in:
Eshan Ramesh 2020-05-19 16:18:08 -04:00
parent aeafcd36c9
commit 3375accf33

View file

@@ -26,7 +26,17 @@ RESET_COLOR = colorama.Fore.RESET
# Based on Segtok by Florian Leitner # Based on Segtok by Florian Leitner
# https://github.com/fnl/segtok # https://github.com/fnl/segtok
SENTENCE_SPLITTER = re.compile("\n") SENTENCE_SPLITTER = re.compile(
r"""
( # A sentence ends at one of two sequences:
[.!?\u203C\u203D\u2047\u2048\u2049\u3002\uFE52\uFE57\uFF01\uFF0E\uFF1F\uFF61] # Either, a sequence starting with a sentence terminal,
[\'\u2019\"\u201D]? # an optional right quote,
[\]\)]* # optional closing brackets and
\s+ # a sequence of required spaces.
)""",
re.VERBOSE,
)
SENTENCE_SPLITTER_ONLY_NEWLINE = re.compile("\n")
class UserAbort(Exception): class UserAbort(Exception):
@@ -252,7 +262,9 @@ def slugify(string):
def split_title(text): def split_title(text):
"""Splits the first sentence off from a text.""" """Splits the first sentence off from a text."""
punkt = SENTENCE_SPLITTER.search(text.strip()) sep = SENTENCE_SPLITTER_ONLY_NEWLINE.search(text.strip())
if not punkt: if not sep:
sep = SENTENCE_SPLITTER.search(text)
if not sep:
return "",text return "",text
return text[: punkt.end()].strip(), text[punkt.end() :].strip() return text[: sep.end()].strip(), text[sep.end() :].strip()