Improve quote detection by finding start and end quotation marks #380

Open · wants to merge 11 commits into develop
9 changes: 4 additions & 5 deletions .readthedocs.yml → .readthedocs.yaml
@@ -4,14 +4,11 @@
version: 2

python:
version: 3.9
install:
- method: pip
path: .
- method: pip
path: .
extra_requirements:
- dev
- docs
system_packages: true

sphinx:
@@ -23,4 +20,6 @@ formats:
- pdf

build:
image: latest
os: ubuntu-22.04
tools:
python: "3.9"
50 changes: 50 additions & 0 deletions src/textacy/constants.py
@@ -21,6 +21,50 @@
OBJ_DEPS: set[str] = {"attr", "dobj", "dative", "oprd"}
AUX_DEPS: set[str] = {"aux", "auxpass", "neg"}

QUOTATION_MARK_PAIRS: set[tuple[int, int]] = {
# Ordinals (code points) of token.is_quote characters, paired up as (start, end).
# Some of these pairs come from oddly formatted newspaper uploads, so expect some noise.
# source characters:
# either = "\"\'"
# start = "“‘```“‘«‹「『„‚"
# end = "”’’’’”’»›」』”’"
(34, 34), # " "
(39, 39), # ' '
(96, 8217), # ` ’
(171, 187), # « »
(8216, 8217), # ‘ ’
(8218, 8217), # ‚ ’
(8220, 8221), # “ ”
(8222, 8221), # „ ”
(8249, 8250), # ‹ ›
(12300, 12301), # 「 」
(12302, 12303), # 『 』
(8220, 34), # “ "
(8216, 34), # ‘ "
(96, 34), # ` "
(171, 34), # « "
(8249, 34), # ‹ "
(12300, 34), # 「 "
(12302, 34), # 『 "
(8222, 34), # „ "
(8218, 34), # ‚ "
(34, 8221), # " ”
(34, 8217), # " ’
(34, 10), # " \n
(39, 10), # ' \n
(96, 10), # ` \n
(171, 10), # « \n
(8216, 10), # ‘ \n
(8218, 10), # ‚ \n
(8220, 10), # “ \n
(8249, 10), # ‹ \n
(12300, 10), # 「 \n
(12302, 10), # 『 \n
}
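
For intuition, here is a quick sketch of how these ordinal pairs get consulted; the helper below is hypothetical, only `QUOTATION_MARK_PAIRS` comes from this diff:

```python
from textacy.constants import QUOTATION_MARK_PAIRS

# Hypothetical helper: is this (opening, closing) combination acceptable?
def is_valid_pair(start_char: str, end_char: str) -> bool:
    return (ord(start_char), ord(end_char)) in QUOTATION_MARK_PAIRS

print(is_valid_pair("“", "”"))   # True: curly double quotes
print(is_valid_pair("„", "”"))   # True: low-9 opener, as in German-style text
print(is_valid_pair("“", "\n"))  # True: a linebreak can close a dangling quote
print(is_valid_pair("”", "“"))   # False: wrong order
```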

REPORTING_VERBS: dict[str, set[str]] = {
"en": {
"according",
@@ -196,3 +240,9 @@
)

RE_ALNUM: Pattern = re.compile(r"[^\W_]+")

# regexes for quote detection prep
ALL_QUOTES = "‹「`»」‘\"„›”‚’'』『«“"
DOUBLE_QUOTES = '‹「」»"„『”‚』›«“'
ANY_DOUBLE_QUOTE_REGEX = r"[{}]".format(DOUBLE_QUOTES)
DOUBLE_QUOTES_NOSPACE_REGEX = r"(?<=\S)([{}])(?=\S)".format(DOUBLE_QUOTES)
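
A rough illustration of what these two patterns match (assumed usage; the sample strings are invented):

```python
import re

from textacy.constants import ANY_DOUBLE_QUOTE_REGEX, DOUBLE_QUOTES_NOSPACE_REGEX

# A double quote glued between two non-space characters gets flagged...
text = 'She said:"Hello there."'
match = re.search(DOUBLE_QUOTES_NOSPACE_REGEX, text)
print(match.start())  # position of the glued quote, so a space can be inserted

# ...while ANY_DOUBLE_QUOTE_REGEX matches every double-quote variant, which is
# handy for the parity counts used by the prep functions in triples.py below.
print(len(re.findall(ANY_DOUBLE_QUOTE_REGEX, '“mixed” and "straight"')))  # 4
```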
242 changes: 175 additions & 67 deletions src/textacy/extract/triples.py
@@ -9,12 +9,12 @@

import collections
from operator import attrgetter
from typing import Iterable, Mapping, Optional, Pattern
from typing import Iterable, Literal, Mapping, Optional, Pattern

from cytoolz import itertoolz
from spacy.symbols import (
AUX,
VERB,
PUNCT,
agent,
attr,
aux,
@@ -30,6 +30,7 @@
xcomp,
)
from spacy.tokens import Doc, Span, Token
import re

from .. import constants, types, utils
from . import matches
@@ -202,13 +203,14 @@ def semistructured_statements(
)


def direct_quotations(doc: Doc) -> Iterable[DQTriple]:
def direct_quotations(doc: Doc, min_quote_length: int = 4) -> Iterable[DQTriple]:
"""
Extract direct quotations with an attributable speaker from a document
using simple rules and patterns. Does not extract indirect or mixed quotations!

Args:
doc
min_quote_length - minimum number of words a quotation's content must contain; shorter candidates are skipped.

Yields:
Next direct quotation in ``doc`` as a (speaker, cue, content) triple.
@@ -217,85 +219,88 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]:
Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic
Tagging of Reported Speech in Newspaper Articles".
"""
# TODO: train a model to do this instead, maybe similar to entity recognition
try:
_reporting_verbs = constants.REPORTING_VERBS[doc.lang_]
except KeyError:
raise ValueError(
f"direct quotation extraction is not implemented for lang='{doc.lang_}', "
f"only {sorted(constants.REPORTING_VERBS.keys())}"
)
qtok_idxs = [tok.i for tok in doc if tok.is_quote]
if len(qtok_idxs) % 2 != 0:
raise ValueError(
f"{len(qtok_idxs)} quotation marks found, indicating an unclosed quotation; "
"given the limitations of this method, it's safest to bail out "
"rather than guess which quotation is unclosed"
)
qtok_pair_idxs = list(itertoolz.partition(2, qtok_idxs))
for qtok_start_idx, qtok_end_idx in qtok_pair_idxs:
content = doc[qtok_start_idx : qtok_end_idx + 1]
# pair up quotation-like characters based on acceptable start/end combos;
# linebreak tokens may close an otherwise-unclosed quotation (see constants for more info)
qtoks = [tok for tok in doc if tok.is_quote or re.match(r"\n", tok.text)]
qtok_idx_pairs = [(-1, -1)]
for n, q in enumerate(qtoks):
if (
# a start quote hugs the token that follows it (no trailing whitespace)...
not bool(q.whitespace_)
# ...is not already claimed as an end quote...
and q.i not in [q_[1] for q_ in qtok_idx_pairs]
# ...and starts after the previous pair ends
and q.i > qtok_idx_pairs[-1][1]
):
for q_ in qtoks[n + 1 :]:
if (ord(q.text), ord(q_.text)) in constants.QUOTATION_MARK_PAIRS:
qtok_idx_pairs.append((q.i, q_.i))
break
qtok_idx_pairs = qtok_idx_pairs[1:]  # drop the (-1, -1) sentinel

def filter_quote_tokens(tok: Token) -> bool:
# True if tok falls inside any detected quotation span
return any(qts_idx <= tok.i <= qte_idx for qts_idx, qte_idx in qtok_idx_pairs)

for qtok_start_idx, qtok_end_idx in qtok_idx_pairs:
content = doc[qtok_start_idx:qtok_end_idx]
cue = None
speaker = None
# filter quotations by content

if (
# quotation content should have at least min_quote_length words
len(content) < 4
len(content.text.split()) < min_quote_length
# filter out titles of books and such, if possible
or all(
tok.is_title
for tok in content
# if tok.pos in {NOUN, PROPN}
if not (tok.is_punct or tok.is_stop)
)
# TODO: require closing punctuation before the quotation mark?
# content[-2].is_punct is False
or all(tok.is_title for tok in content if not (tok.is_punct or tok.is_stop))
):
continue
# get window of adjacent/overlapping sentences
window_sents = (
sent
for sent in doc.sents
# these boundary cases are a subtle bit of work...
if (
(sent.start < qtok_start_idx and sent.end >= qtok_start_idx - 1)
or (sent.start <= qtok_end_idx + 1 and sent.end > qtok_end_idx)
)
)
# get candidate cue verbs in window
cue_cands = [
tok
for sent in window_sents
for tok in sent
if (
tok.pos == VERB

# look for a cue in progressively wider sentence windows around the quote
for window_sents in [
windower(content, "overlap"),
windower(content, "linebreaks"),
]:
# get candidate cue verbs in window
cue_candidates = [
tok
for sent in window_sents
for tok in sent
if tok.pos == VERB
and tok.lemma_ in _reporting_verbs
# cue verbs must occur *outside* any quotation content
and not any(
qts_idx <= tok.i <= qte_idx for qts_idx, qte_idx in qtok_pair_idxs
)
and not filter_quote_tokens(tok)
]
cue_candidates = sorted(
cue_candidates,
key=lambda cc: min(
abs(cc.i - qtok_start_idx), abs(cc.i - qtok_end_idx)
),
)
]
# sort candidates by proximity to quote content
cue_cands = sorted(
cue_cands,
key=lambda cc: min(abs(cc.i - qtok_start_idx), abs(cc.i - qtok_end_idx)),
)
for cue_cand in cue_cands:
if cue is not None:
break
for speaker_cand in cue_cand.children:
if speaker_cand.dep in _ACTIVE_SUBJ_DEPS:
cue = expand_verb(cue_cand)
speaker = expand_noun(speaker_cand)
for cue_cand in cue_candidates:
if cue is not None:
break
if content and cue and speaker:
yield DQTriple(
speaker=sorted(speaker, key=attrgetter("i")),
cue=sorted(cue, key=attrgetter("i")),
content=content,
)
speaker_cands = [
speaker_cand
for speaker_cand in cue_cand.children
# speaker candidates must lie outside the quotation content
if speaker_cand.pos != PUNCT
and not filter_quote_tokens(speaker_cand)
and (
(speaker_cand.i >= qtok_end_idx)
or (speaker_cand.i <= qtok_start_idx)
)
]
for speaker_cand in speaker_cands:
if speaker_cand.dep in _ACTIVE_SUBJ_DEPS:
cue = expand_verb(cue_cand)
speaker = expand_noun(speaker_cand)
break
if content and cue and speaker:
yield DQTriple(
speaker=sorted(speaker, key=attrgetter("i")),
cue=sorted(cue, key=attrgetter("i")),
content=doc[qtok_start_idx : qtok_end_idx + 1],
)
break
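
A minimal usage sketch for the updated extractor, assuming an installed English pipeline such as `en_core_web_sm` (the sample sentence and outputs are illustrative, not tested):

```python
import spacy
from textacy.extract.triples import direct_quotations

nlp = spacy.load("en_core_web_sm")
doc = nlp('"Today we took back our city," the mayor told reporters.')
for speaker, cue, content in direct_quotations(doc):
    print("speaker:", speaker)  # e.g. [mayor] -- a list of speaker tokens
    print("cue:", cue)          # e.g. [told] -- a list of cue-verb tokens
    print("content:", content)  # the quoted Span, quotation marks included
```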


def expand_noun(tok: Token) -> list[Token]:
@@ -305,7 +310,6 @@ def expand_noun(tok: Token) -> list[Token]:
child
for tc in tok_and_conjuncts
for child in tc.children
# TODO: why doesn't compound import from spacy.symbols?
if child.dep_ == "compound"
]
return tok_and_conjuncts + compounds
@@ -317,3 +321,107 @@ def expand_verb(tok: Token) -> list[Token]:
child for child in tok.children if child.dep in _VERB_MODIFIER_DEPS
]
return [tok] + verb_modifiers


def windower(quote: Span, method: Literal["overlap", "linebreaks"]) -> Iterable[Span]:
"""
Finds the range of sentences in which to look for quote attribution.

Two methods:
- "overlap": any sentences that overlap with the quote span
- "linebreaks": overlapping sentences +/- one sentence, trimmed at the first linebreak after the quote

Input:
quote (Span) - quote to be attributed
method (str) - how the sentence range is determined

Output:
sents (list) - list of sentences
"""
if method == "overlap":
return [
sent
for sent in quote.doc.sents
if (sent.start < quote.start < sent.end)
or (sent.start < quote.end < sent.end)
]
else:
sent_indexes = [
n
for n, s in enumerate(quote.doc.sents)
if (s.start <= quote.start <= s.end) or (s.start <= quote.end <= s.end)
]

i_sent = sent_indexes[0] - 1 if sent_indexes[0] > 0 else 0
j_sent = sent_indexes[-1] + 2
sents = list(quote.doc.sents)[i_sent:j_sent]
if method == "linebreaks":
linebreaks = (
[0]
+ [tok.i for tok in quote.doc if re.match(r"\n", tok.text)]
+ [quote.doc[-1].i]
)
linebreak_limits = [
lb for lb in linebreaks if sents[0].start < lb <= quote.end + 1
]
if linebreak_limits:
return [s for s in sents if s.end <= max(linebreak_limits)]
return sents
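
For intuition, a hypothetical call showing the two modes (sentence boundaries and token indices depend on the model, so the spans here are illustrative):

```python
import spacy
from textacy.extract.triples import windower

nlp = spacy.load("en_core_web_sm")
doc = nlp('"We will appeal," the lawyer said. The ruling stands.\nOther news follows.')
quote = doc[0:6]  # roughly the quoted span

# only sentences that overlap the quote itself
print([s.text for s in windower(quote, "overlap")])
# overlapping sentences +/- one, trimmed at linebreaks after the quote
print([s.text for s in windower(quote, "linebreaks")])
```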


def prep_text_for_quote_detection(t: str, fix_plural_possessives: bool = True) -> str:
"""
Sorts out some common issues that trip up the quote detector. Works best one paragraph at a time -- use prep_document_for_quote_detection for a whole document.

- replaces consecutive apostrophes with a double quote (no idea why this happens, but it does)
- adds a space before or after double quotes that are missing one
- if enabled, fixes plural possessives by appending an "x", because the hanging apostrophe can trigger quote detection
- adds a closing double quote to paragraphs that continue a quotation and thus conventionally don't end with one

Input:
t (str) - text to be prepped, preferably one paragraph
fix_plural_possessives (bool) - enables the plural-possessive fix

Output:
t (str) - text prepped for quote detection
"""
if not t:
return t

t = t.replace("''", '"')
if fix_plural_possessives:
t = re.sub(r"(.{3,8}s\')(\s)", r"\1x\2", t)
# insert a space on the correct side of any double quote glued between non-space characters
while match := re.search(constants.DOUBLE_QUOTES_NOSPACE_REGEX, t):
if (
len(re.findall(constants.ANY_DOUBLE_QUOTE_REGEX, t[: match.start()])) % 2
!= 0
):
replacer = '" '
else:
replacer = ' "'
t = t[: match.start()] + replacer + t[match.end() :]
# a paragraph that opens a quotation but never closes it is treated as a
# quote continuation, so it gets a synthetic closing mark for pairing
if (
not (t[0] == "'" and t[-1] == "'")
and t[0] in constants.ALL_QUOTES
and len(re.findall(constants.ANY_DOUBLE_QUOTE_REGEX, t[1:])) % 2 == 0
):
t += '"'
return t.strip()
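
A hedged before/after sketch of the prep (the input string is invented to exercise each fix):

```python
from textacy.extract.triples import prep_text_for_quote_detection

raw = "''We never agreed to this,'' the tenants' lawyer said.\"It's over\""
print(prep_text_for_quote_detection(raw))
# the doubled apostrophes become real double quotes, the glued opening
# quote before "It's" gains a space, and "tenants'" gains an "x" so its
# hanging apostrophe no longer looks like a quotation mark
```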


def prep_document_for_quote_detection(t: str, para_char: str = "\n") -> str:
"""
Splits text into paragraphs (on para_char), runs prep_text_for_quote_detection on each paragraph, then reassembles them with para_char.

Input:
t (str) - document to prep for quote detection
para_char (str) - paragraph boundary in t

Output:
document prepped for quote detection
"""
return para_char.join(
[prep_text_for_quote_detection(para) for para in t.split(para_char) if para]
)
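
Putting it together, a sketch of the intended end-to-end flow (assumes an English model; the text is invented, with a continuation paragraph that deliberately lacks a closing mark):

```python
import spacy
from textacy.extract.triples import (
    direct_quotations,
    prep_document_for_quote_detection,
)

nlp = spacy.load("en_core_web_sm")
raw = (
    '"This is a process, and we are committed to it," the senator said.\n'
    '"We will not stop here.'  # continuation quote, no closing mark
)
doc = nlp(prep_document_for_quote_detection(raw))
for triple in direct_quotations(doc):
    print(triple)
```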