Improve quote detection by finding start and end quotation marks #380

Open · wants to merge 11 commits into develop
9 changes: 4 additions & 5 deletions .readthedocs.yml → .readthedocs.yaml
@@ -4,14 +4,11 @@
version: 2

python:
version: 3.9
install:
- method: pip
path: .
- method: pip
path: .
extra_requirements:
- dev
- docs
system_packages: true

sphinx:
@@ -23,4 +20,6 @@ formats:
- pdf

build:
image: latest
os: ubuntu-22.04
tools:
python: "3.9"
50 changes: 50 additions & 0 deletions src/textacy/constants.py
@@ -21,6 +21,50 @@
OBJ_DEPS: set[str] = {"attr", "dobj", "dative", "oprd"}
AUX_DEPS: set[str] = {"aux", "auxpass", "neg"}

QUOTATION_MARK_PAIRS: set[tuple[int, int]] = {
# Ordinals (code points) of token.is_quote characters, paired up as (start, end).
# Some of these pairs come from oddly formatted newspaper uploads, so expect some noise.
# source characters:
# either = "\"\'"
# start = "“‘```“‘«‹「『„‚"
# end = "”’’’’”’»›」』”’"
(34, 34), # " "
(39, 39), # ' '
(96, 8217), # ` ’
(171, 187), # « »
(8216, 8217), # ‘ ’
(8218, 8217), # ‚ ’
(8220, 8221), # “ ”
(8222, 8221), # „ ”
(8249, 8250), # ‹ ›
(12300, 12301), # 「 」
(12302, 12303), # 『 』
(8220, 34), # “ "
(8216, 34), # ‘ "
(96, 34), # ` "
(171, 34), # « "
(8249, 34), # ‹ "
(12300, 34), # 「 "
(12302, 34), # 『 "
(8222, 34), # „ "
(8218, 34), # ‚ "
(34, 8221), # " ”
(34, 8217), # " ’
(34, 10), # " \n
(39, 10), # ' \n
(96, 10), # ` \n
(171, 10), # « \n
(8216, 10), # ‘ \n
(8218, 10), # ‚ \n
(8220, 10), # “ \n
(8249, 10), # ‹ \n
(12300, 10), # 「 \n
(12302, 10), # 『 \n
}
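
For intuition, here is a quick sketch of how these ordinal pairs get consulted; the helper below is hypothetical, only `QUOTATION_MARK_PAIRS` comes from this diff:

```python
from textacy.constants import QUOTATION_MARK_PAIRS

# Hypothetical helper: is this (opening, closing) combination acceptable?
def is_valid_pair(start_char: str, end_char: str) -> bool:
    return (ord(start_char), ord(end_char)) in QUOTATION_MARK_PAIRS

print(is_valid_pair("“", "”"))   # True: curly double quotes
print(is_valid_pair("„", "”"))   # True: low-9 opener, as in German-style text
print(is_valid_pair("“", "\n"))  # True: a linebreak can close a dangling quote
print(is_valid_pair("”", "“"))   # False: wrong order
```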

REPORTING_VERBS: dict[str, set[str]] = {
"en": {
"according",
@@ -196,3 +240,9 @@
)

RE_ALNUM: Pattern = re.compile(r"[^\W_]+")

# regexes for quote detection prep
ALL_QUOTES = "‹「`»」‘\"„›”‚’'』『«“"
DOUBLE_QUOTES = '‹「」»"„『”‚』›«“'
ANY_DOUBLE_QUOTE_REGEX = r"[{}]".format(DOUBLE_QUOTES)
DOUBLE_QUOTES_NOSPACE_REGEX = r"(?<=\S)([{}])(?=\S)".format(DOUBLE_QUOTES)
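
A rough illustration of what these two patterns match (assumed usage; the sample strings are invented):

```python
import re

from textacy.constants import ANY_DOUBLE_QUOTE_REGEX, DOUBLE_QUOTES_NOSPACE_REGEX

# A double quote glued between two non-space characters gets flagged...
text = 'She said:"Hello there."'
match = re.search(DOUBLE_QUOTES_NOSPACE_REGEX, text)
print(match.start())  # position of the glued quote, so a space can be inserted

# ...while ANY_DOUBLE_QUOTE_REGEX matches every double-quote variant, which is
# handy for the parity counts used by the prep functions in triples.py below.
print(len(re.findall(ANY_DOUBLE_QUOTE_REGEX, '“mixed” and "straight"')))  # 4
```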
242 changes: 175 additions & 67 deletions src/textacy/extract/triples.py
@@ -9,12 +9,12 @@

import collections
from operator import attrgetter
from typing import Iterable, Mapping, Optional, Pattern
from typing import Iterable, Literal, Mapping, Optional, Pattern

from cytoolz import itertoolz
from spacy.symbols import (
AUX,
VERB,
PUNCT,
agent,
attr,
aux,
@@ -30,6 +30,7 @@
xcomp,
)
from spacy.tokens import Doc, Span, Token
import re

from .. import constants, types, utils
from . import matches
@@ -202,13 +203,14 @@ def semistructured_statements(
)


def direct_quotations(doc: Doc) -> Iterable[DQTriple]:
def direct_quotations(doc: Doc, min_quote_length: int = 4) -> Iterable[DQTriple]:
"""
Extract direct quotations with an attributable speaker from a document
using simple rules and patterns. Does not extract indirect or mixed quotations!

Args:
doc
min_quote_length - minimum number of words a quotation's content must contain; shorter candidates are skipped.

Yields:
Next direct quotation in ``doc`` as a (speaker, cue, content) triple.
@@ -217,85 +219,88 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]:
Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic
Tagging of Reported Speech in Newspaper Articles".
"""
# TODO: train a model to do this instead, maybe similar to entity recognition
try:
_reporting_verbs = constants.REPORTING_VERBS[doc.lang_]
except KeyError:
raise ValueError(
f"direct quotation extraction is not implemented for lang='{doc.lang_}', "
f"only {sorted(constants.REPORTING_VERBS.keys())}"
)
qtok_idxs = [tok.i for tok in doc if tok.is_quote]
if len(qtok_idxs) % 2 != 0:
raise ValueError(
f"{len(qtok_idxs)} quotation marks found, indicating an unclosed quotation; "
"given the limitations of this method, it's safest to bail out "
"rather than guess which quotation is unclosed"
)
qtok_pair_idxs = list(itertoolz.partition(2, qtok_idxs))
for qtok_start_idx, qtok_end_idx in qtok_pair_idxs:
content = doc[qtok_start_idx : qtok_end_idx + 1]
# pair up quotation-like characters based on acceptable start/end combos;
# linebreak tokens may close an otherwise-unclosed quotation (see constants for more info)
qtoks = [tok for tok in doc if tok.is_quote or re.match(r"\n", tok.text)]
qtok_idx_pairs = [(-1, -1)]
for n, q in enumerate(qtoks):
if (
# a start quote hugs the token that follows it (no trailing whitespace)...
not bool(q.whitespace_)
# ...is not already claimed as an end quote...
and q.i not in [q_[1] for q_ in qtok_idx_pairs]
# ...and starts after the previous pair ends
and q.i > qtok_idx_pairs[-1][1]
):
for q_ in qtoks[n + 1 :]:
if (ord(q.text), ord(q_.text)) in constants.QUOTATION_MARK_PAIRS:
qtok_idx_pairs.append((q.i, q_.i))
break
qtok_idx_pairs = qtok_idx_pairs[1:]  # drop the (-1, -1) sentinel

def filter_quote_tokens(tok: Token) -> bool:
# True if tok falls inside any detected quotation span
return any(qts_idx <= tok.i <= qte_idx for qts_idx, qte_idx in qtok_idx_pairs)

for qtok_start_idx, qtok_end_idx in qtok_idx_pairs:
content = doc[qtok_start_idx:qtok_end_idx]
cue = None
speaker = None
# filter quotations by content

if (
# quotation content should have at least min_quote_length words
len(content) < 4
len(content.text.split()) < min_quote_length
# filter out titles of books and such, if possible
or all(
tok.is_title
for tok in content
# if tok.pos in {NOUN, PROPN}
if not (tok.is_punct or tok.is_stop)
)
# TODO: require closing punctuation before the quotation mark?
# content[-2].is_punct is False
or all(tok.is_title for tok in content if not (tok.is_punct or tok.is_stop))
):
continue
# get window of adjacent/overlapping sentences
window_sents = (
sent
for sent in doc.sents
# these boundary cases are a subtle bit of work...
if (
(sent.start < qtok_start_idx and sent.end >= qtok_start_idx - 1)
or (sent.start <= qtok_end_idx + 1 and sent.end > qtok_end_idx)
)
)
# get candidate cue verbs in window
cue_cands = [
tok
for sent in window_sents
for tok in sent
if (
tok.pos == VERB

# look for a cue in progressively wider sentence windows around the quote
for window_sents in [
windower(content, "overlap"),
windower(content, "linebreaks"),
]:
# get candidate cue verbs in window
cue_candidates = [
tok
for sent in window_sents
for tok in sent
if tok.pos == VERB
and tok.lemma_ in _reporting_verbs
# cue verbs must occur *outside* any quotation content
and not any(
qts_idx <= tok.i <= qte_idx for qts_idx, qte_idx in qtok_pair_idxs
)
and not filter_quote_tokens(tok)
]
cue_candidates = sorted(
cue_candidates,
key=lambda cc: min(
abs(cc.i - qtok_start_idx), abs(cc.i - qtok_end_idx)
),
)
]
# sort candidates by proximity to quote content
cue_cands = sorted(
cue_cands,
key=lambda cc: min(abs(cc.i - qtok_start_idx), abs(cc.i - qtok_end_idx)),
)
for cue_cand in cue_cands:
if cue is not None:
break
for speaker_cand in cue_cand.children:
if speaker_cand.dep in _ACTIVE_SUBJ_DEPS:
cue = expand_verb(cue_cand)
speaker = expand_noun(speaker_cand)
for cue_cand in cue_candidates:
if cue is not None:
break
if content and cue and speaker:
yield DQTriple(
speaker=sorted(speaker, key=attrgetter("i")),
cue=sorted(cue, key=attrgetter("i")),
content=content,
)
speaker_cands = [
speaker_cand
for speaker_cand in cue_cand.children
# speaker candidates must lie outside the quotation content
if speaker_cand.pos != PUNCT
and not filter_quote_tokens(speaker_cand)
and (
(speaker_cand.i >= qtok_end_idx)
or (speaker_cand.i <= qtok_start_idx)
)
]
for speaker_cand in speaker_cands:
if speaker_cand.dep in _ACTIVE_SUBJ_DEPS:
cue = expand_verb(cue_cand)
speaker = expand_noun(speaker_cand)
break
if content and cue and speaker:
yield DQTriple(
speaker=sorted(speaker, key=attrgetter("i")),
cue=sorted(cue, key=attrgetter("i")),
content=doc[qtok_start_idx : qtok_end_idx + 1],
)
break
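
A minimal usage sketch for the updated extractor, assuming an installed English pipeline such as `en_core_web_sm` (the sample sentence and outputs are illustrative, not tested):

```python
import spacy
from textacy.extract.triples import direct_quotations

nlp = spacy.load("en_core_web_sm")
doc = nlp('"Today we took back our city," the mayor told reporters.')
for speaker, cue, content in direct_quotations(doc):
    print("speaker:", speaker)  # e.g. [mayor] -- a list of speaker tokens
    print("cue:", cue)          # e.g. [told] -- a list of cue-verb tokens
    print("content:", content)  # the quoted Span, quotation marks included
```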


def expand_noun(tok: Token) -> list[Token]:
@@ -305,7 +310,6 @@ def expand_noun(tok: Token) -> list[Token]:
child
for tc in tok_and_conjuncts
for child in tc.children
# TODO: why doesn't compound import from spacy.symbols?
if child.dep_ == "compound"
]
return tok_and_conjuncts + compounds
@@ -317,3 +321,107 @@ def expand_verb(tok: Token) -> list[Token]:
child for child in tok.children if child.dep in _VERB_MODIFIER_DEPS
]
return [tok] + verb_modifiers


def windower(quote: Span, method: Literal["overlap", "linebreaks"]) -> Iterable[Span]:
"""
Finds the range of sentences in which to look for quote attribution.

Two methods:
- "overlap": any sentences that overlap with the quote span
- "linebreaks": overlapping sentences +/- one sentence, trimmed at the first linebreak after the quote

Input:
quote (Span) - quote to be attributed
method (str) - how the sentence range is determined

Output:
sents (list) - list of sentences
"""
if method == "overlap":
return [
sent
for sent in quote.doc.sents
if (sent.start < quote.start < sent.end)
or (sent.start < quote.end < sent.end)
]
else:
sent_indexes = [
n
for n, s in enumerate(quote.doc.sents)
if (s.start <= quote.start <= s.end) or (s.start <= quote.end <= s.end)
]

i_sent = sent_indexes[0] - 1 if sent_indexes[0] > 0 else 0
j_sent = sent_indexes[-1] + 2
sents = list(quote.doc.sents)[i_sent:j_sent]
if method == "linebreaks":
linebreaks = (
[0]
+ [tok.i for tok in quote.doc if re.match(r"\n", tok.text)]
+ [quote.doc[-1].i]
)
linebreak_limits = [
lb for lb in linebreaks if sents[0].start < lb <= quote.end + 1
]
if linebreak_limits:
return [s for s in sents if s.end <= max(linebreak_limits)]
return sents
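
For intuition, a hypothetical call showing the two modes (sentence boundaries and token indices depend on the model, so the spans here are illustrative):

```python
import spacy
from textacy.extract.triples import windower

nlp = spacy.load("en_core_web_sm")
doc = nlp('"We will appeal," the lawyer said. The ruling stands.\nOther news follows.')
quote = doc[0:6]  # roughly the quoted span

# only sentences that overlap the quote itself
print([s.text for s in windower(quote, "overlap")])
# overlapping sentences +/- one, trimmed at linebreaks after the quote
print([s.text for s in windower(quote, "linebreaks")])
```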


def prep_text_for_quote_detection(t: str, fix_plural_possessives: bool = True) -> str:
"""
Sorts out some common issues that trip up the quote detector. Works best one paragraph at a time -- use prep_document_for_quote_detection for a whole document.

- replaces consecutive apostrophes with a double quote (no idea why this happens, but it does)
- adds a space before or after double quotes that are missing one
- if enabled, fixes plural possessives by appending an "x", because the hanging apostrophe can trigger quote detection
- adds a closing double quote to paragraphs that continue a quotation and thus conventionally don't end with one

Input:
t (str) - text to be prepped, preferably one paragraph
fix_plural_possessives (bool) - enables the plural-possessive fix

Output:
t (str) - text prepped for quote detection
"""
if not t:
return t

t = t.replace("''", '"')
if fix_plural_possessives:
t = re.sub(r"(.{3,8}s\')(\s)", r"\1x\2", t)
# insert a space on the correct side of any double quote glued between non-space characters
while match := re.search(constants.DOUBLE_QUOTES_NOSPACE_REGEX, t):
if (
len(re.findall(constants.ANY_DOUBLE_QUOTE_REGEX, t[: match.start()])) % 2
!= 0
):
replacer = '" '
else:
replacer = ' "'
t = t[: match.start()] + replacer + t[match.end() :]
# a paragraph that opens a quotation but never closes it is treated as a
# quote continuation, so it gets a synthetic closing mark for pairing
if (
not (t[0] == "'" and t[-1] == "'")
and t[0] in constants.ALL_QUOTES
and len(re.findall(constants.ANY_DOUBLE_QUOTE_REGEX, t[1:])) % 2 == 0
):
t += '"'
return t.strip()
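
A hedged before/after sketch of the prep (the input string is invented to exercise each fix):

```python
from textacy.extract.triples import prep_text_for_quote_detection

raw = "''We never agreed to this,'' the tenants' lawyer said.\"It's over\""
print(prep_text_for_quote_detection(raw))
# the doubled apostrophes become real double quotes, the glued opening
# quote before "It's" gains a space, and "tenants'" gains an "x" so its
# hanging apostrophe no longer looks like a quotation mark
```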


def prep_document_for_quote_detection(t: str, para_char: str = "\n") -> str:
"""
Splits text into paragraphs (on para_char), runs prep_text_for_quote_detection on each paragraph, then reassembles them with para_char.

Input:
t (str) - document to prep for quote detection
para_char (str) - paragraph boundary in t

Output:
document prepped for quote detection
"""
return para_char.join(
[prep_text_for_quote_detection(para) for para in t.split(para_char) if para]
)
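
Putting it together, a sketch of the intended end-to-end flow (assumes an English model; the text is invented, with a continuation paragraph that deliberately lacks a closing mark):

```python
import spacy
from textacy.extract.triples import (
    direct_quotations,
    prep_document_for_quote_detection,
)

nlp = spacy.load("en_core_web_sm")
raw = (
    '"This is a process, and we are committed to it," the senator said.\n'
    '"We will not stop here.'  # continuation quote, no closing mark
)
doc = nlp(prep_document_for_quote_detection(raw))
for triple in direct_quotations(doc):
    print(triple)
```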