Skip to content

Commit f428a39

Browse files
committed
Use new text annotation format
See https://github.com/nk2028/tshet-uinh-data/wiki/%E6%A0%A1%E5%8B%98%E7%AC%A6%E8%99%9F%E6%A0%BC%E5%BC%8F for the format specification. The fields 反切 and 字頭 in 廣韻.csv, as well as all source data tables, now use this standardized format.
1 parent 525af4f commit f428a39

File tree

6 files changed

+26000
-25939
lines changed

6 files changed

+26000
-25939
lines changed

build.py

Lines changed: 95 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from collections.abc import Iterable
12
import csv
23
import dataclasses
34
from dataclasses import dataclass
@@ -89,14 +90,83 @@ def load_patches() -> dict[tuple[str, str], Patch]:
8990
return patches
9091

9192

93+
# Number of operands that follow each ideographic description character (IDC).
_IDC_OPERAND_COUNT = {
    **dict.fromkeys('⿰⿱⿴⿵⿶⿷⿸⿹⿺⿻⿼⿽㇯', 2),
    **dict.fromkeys('⿲⿳', 3),
    **dict.fromkeys('⿾⿿〾', 1),
}


def split_head_with_ids(s: str) -> tuple[str, str]:
    """Split the leading unit off ``s`` and return ``(unit, remainder)``.

    The unit is the first character of ``s``. If that character is an
    ideographic description character, the unit also includes its operands
    (two, three or one of them, depending on the character); each operand is
    itself a unit, so nested sequences are consumed whole.

    Raises:
        ValueError: if ``s`` is empty, or if ``s`` ends before a description
            character has received all of its operands.
    """
    if not s:
        raise ValueError('empty string')
    head = s[0]
    num_parts = _IDC_OPERAND_COUNT.get(head)
    if num_parts is None:
        # Ordinary character: it is a unit on its own.
        return head, s[1:]
    pieces = [head]
    rest = s[1:]
    for _ in range(num_parts):
        if not rest:
            # Report the truncated sequence itself rather than letting the
            # recursive call complain about an "empty string".
            raise ValueError(
                f'incomplete ideographic description sequence: {s!r}'
            )
        part, rest = split_head_with_ids(rest)
        pieces.append(part)
    return ''.join(pieces), rest
127+
128+
129+
def iter_chars_with_ids(s: str) -> Iterable[str]:
    """Yield ``s`` one unit at a time, in order.

    Each unit is whatever ``split_head_with_ids`` splits off the front of the
    remaining text: a single character, or a description character together
    with its operands. Nothing is yielded for an empty string. Because this
    is a generator, an error raised by ``split_head_with_ids`` surfaces while
    iterating, not when the function is called.
    """
    remaining = s
    while remaining:
        unit, remaining = split_head_with_ids(remaining)
        yield unit
133+
134+
135+
# NOTE Only handles simple annotations for now.
136+
def remove_annotations(original: str) -> str:
    """Return ``original`` with its annotation markup resolved to plain text.

    Backticks are dropped first. The text is then split into units by
    ``iter_chars_with_ids`` and scanned left to right:

    * ``[`` and ``]`` are dropped; the text between them is kept.
    * ``{`` up to the next ``}`` is dropped, brackets and content alike.
    * For ``〈`` up to the next ``〉``, the brackets are dropped, the enclosed
      units are kept, and the same number of units immediately before ``〈``
      are dropped (e.g. ``'掩光〈也〉'`` becomes ``'掩也'``).

    Raises:
        ValueError: if a ``{`` or ``〈`` has no closing bracket, or if ``〈…〉``
            encloses more units than precede it.
        AssertionError: if the units a ``〈…〉`` would drop were already
            dropped by an earlier annotation.
    """
    chars = list(iter_chars_with_ids(original.replace('`', '')))
    n = len(chars)
    removable = [False] * n

    def closing_index(closer: str, opener_at: int) -> int:
        # Same lookup as list.index, but with an error that names the
        # offending bracket and the text it came from.
        try:
            return chars.index(closer, opener_at + 1)
        except ValueError:
            raise ValueError(
                f'unmatched "{chars[opener_at]}" in annotated text: "{original}"'
            ) from None

    i = 0
    while i < n:
        ch = chars[i]
        if ch in ('[', ']'):
            removable[i] = True
            i += 1
        elif ch == '{':
            j = closing_index('}', i)
            removable[i : j + 1] = [True] * (j + 1 - i)
            i = j + 1
        elif ch == '〈':
            j = closing_index('〉', i)
            k = j - i - 1  # number of units enclosed by the brackets
            if k > i:
                # Without this check, i - k is negative and the slice below
                # wraps around: it selects nothing and the assignment
                # *inserts* flags, silently corrupting the result.
                raise ValueError(
                    f'"〈…〉" encloses {k} unit(s) but only {i} precede it: "{original}"'
                )
            assert not any(removable[i - k : i]), (
                f'"〈…〉" overlaps an earlier annotation: "{original}"'
            )
            removable[i - k : i] = [True] * k
            removable[i] = removable[j] = True
            i = j + 1
        else:
            i += 1
    return ''.join(ch for ch, rm in zip(chars, removable) if not rm)
161+
162+
92163
@dataclass
93164
class 廣韻Row:
94165
小韻號: str
95166
小韻字號: str
96167
韻目原貌: str
97168
音韻地位: str
98169
反切: str
99-
字頭原貌: str
100170
字頭: str
101171
字頭說明: str
102172
釋義: str
@@ -124,7 +194,7 @@ def main():
124194
poem_小韻內字序 = 字序_data[字序_key].poem_小韻內字序
125195
if not poem_小韻內字序:
126196
poem_反切 = poem_data[(原書小韻號, '1')]['廣韻反切(覈校後)']
127-
含原貌字頭 = ''
197+
字頭 = ''
128198
釋義 = ''
129199
釋義參照 = ''
130200
else:
@@ -139,7 +209,7 @@ def main():
139209
字頭覈校說明,
140210
poem_反切,
141211
字頭原貌,
142-
含原貌字頭,
212+
字頭,
143213
釋義,
144214
釋義補充,
145215
韻目原貌,
@@ -156,7 +226,7 @@ def main():
156226
)
157227
)
158228
if 字頭覈校說明 == '校':
159-
含原貌字頭 = f'[{字頭原貌}/{含原貌字頭}]'
229+
字頭 = f'{字頭原貌}{字頭}'
160230
if not 釋義:
161231
釋義參照 = '下'
162232
elif 釋義補充:
@@ -167,22 +237,22 @@ def main():
167237
# 修正
168238
字頭說明 = ''
169239
if (patch := patches.get(字序_key)) is not None:
170-
assert patch.原字頭 == 含原貌字頭, (
171-
f'patching 小韻 #{原書小韻號}/{小韻字號} 字 "{patch.原字頭}", but the actual 字 is "{含原貌字頭}"'
240+
assert patch.原字頭 == 字頭, (
241+
f'patching 小韻 #{原書小韻號}/{小韻字號} 字 "{patch.原字頭}", but the actual 字 is "{字頭}"'
172242
)
173243
patch_coverage.add(字序_key)
174244
assert patch.校正字頭, (
175245
f'patching 小韻 #{原書小韻號}/{小韻字號} 字 "{patch.原字頭}", but 校正字頭 is missing'
176246
)
177-
if patch.校正字頭.startswith('['):
178-
assert re.fullmatch(r'\[.+/.+\]', patch.校正字頭), (
179-
f'invalid 校正字頭: "{patch.校正字頭}"'
180-
)
247+
# TODO Stricter format check
248+
assert re.fullmatch(
249+
r'{.+}|[.+]|.+〈.+〉|[^{}[]〈〉]+', patch.校正字頭
250+
), f'invalid 校正字頭: "{patch.校正字頭}"'
181251
if '~' in patch.校正字頭:
182-
assert not 含原貌字頭.startswith('['), (
183-
f'cannot use "~" in 校正字頭 when 字頭 contains correction: "{含原貌字頭}"'
252+
assert 字頭 and 字頭[-1] not in tuple('}]〉'), (
253+
f'cannot use "~" in 校正字頭 when 字頭 contains correction or is empty: "{字頭}"'
184254
)
185-
含原貌字頭 = patch.校正字頭.replace('~', 含原貌字頭)
255+
字頭 = patch.校正字頭.replace('~', 字頭)
186256

187257
# 字頭說明 is an added field, thus it does not have an original value
188258
字頭說明 = patch.字頭說明
@@ -191,32 +261,29 @@ def main():
191261
assert patch.原釋義 == 釋義, (
192262
f'patching 釋義 on 小韻 #{原書小韻號}/{小韻字號} 字 "{patch.原字頭}", but the actual 釋義 is "{釋義}"'
193263
)
194-
corrected = re.sub(r'\[.+?/(?:-|(.+?))\]|[{}]', r'\1', patch.校正釋義)
195-
釋義 = corrected
264+
釋義 = remove_annotations(patch.校正釋義)
196265
if patch.校正釋義參照 or patch.原釋義參照:
197266
assert patch.原釋義參照 == 釋義參照, (
198267
f'patching 釋義參照 on 小韻 #{原書小韻號}/{小韻字號} 字 "{patch.原字頭}", but the actual 釋義參照 is "{釋義參照}"'
199268
)
200269
釋義參照 = patch.校正釋義參照
201-
elif 字序_data[字序_key].sbgy_字.endswith('/-]'):
202-
assert not 含原貌字頭.startswith('[')
203-
含原貌字頭 = f'[{含原貌字頭}/-]'
270+
elif 字序_data[字序_key].sbgy_字.endswith(''):
271+
assert 字頭[-1:] not in tuple('}]〉')
272+
字頭 = f'{字頭}'
204273

205274
字_check = 字序_data[字序_key].
206-
assert 含原貌字頭 == 字_check, (
207-
f'字頭 mismatch between 字序表 and (patched) 廣韻 data: "{字_check}" != "{含原貌字頭}" (小韻 {原書小韻號}/{小韻字號})'
275+
assert 字頭 == 字_check, (
276+
f'字頭 mismatch between 字序表 and (patched) 廣韻 data: "{字_check}" != "{字頭}" (小韻 {原書小韻號}/{小韻字號})'
208277
)
209-
if 含原貌字頭.startswith('['):
210-
字頭原貌, 字頭 = 含原貌字頭[1:-1].split('/')
211-
字頭 = '' if 字頭 == '-' else 字頭
212-
字頭原貌 = '' if 字頭原貌 == '-' else 字頭原貌
278+
if 字頭[-1] in ('}', ']'):
279+
字頭或原貌 = 字頭[1:-1]
280+
elif 字頭[-1] == '〉':
281+
字頭或原貌 = 字頭[字頭.index('〈') + 1 : -1]
213282
else:
214-
字頭 = 含原貌字頭
215-
字頭原貌 = ''
283+
字頭或原貌 = 字頭
216284

217285
# 小韻號
218286
# NOTE 字頭 & 細分轄字 in 小韻表.tsv does not contain 字頭原貌 (yet)
219-
字頭或原貌 = 字頭 or 字頭原貌
220287
if 原書小韻號 in 細分號_by_原書小韻:
221288
for 細分 in 細分號_by_原書小韻[原書小韻號]:
222289
小韻號 = 原書小韻號 + 細分
@@ -246,7 +313,7 @@ def main():
246313

247314
# 釋義中反切
248315
if 小韻字號 == '1' and 反切:
249-
反切原貌 = re.sub(r'\[.\]|<.>|⦉.⦊|\(.\)|⦅.⦆', '', 反切)
316+
反切原貌 = re.sub(r'[.]|〈.〉|〘.〙|(.)|⦅.⦆', '', 反切)
250317
if 反切原貌 != poem_反切:
251318
assert 釋義.count(poem_反切 + '切') == 1, (
252319
f'釋義 not containing {反切}切 exactly once: {釋義}'
@@ -259,7 +326,6 @@ def main():
259326
韻目原貌,
260327
音韻地位,
261328
反切,
262-
字頭原貌,
263329
字頭,
264330
字頭說明,
265331
釋義,

check.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
)
1212
PATTERN_反切 = re.compile(
1313
r"""(?x)(
14-
\[.\] | # 脫字
15-
. ( <.> | ⦉.⦊ | \(.\) | ⦅.⦆ )* # 原貌及校正
14+
[.] | # 脫字
15+
. ( 〈.〉 | 〘.〙 | (.) | ⦅.⦆ )* # 原貌及校正
1616
){2}"""
1717
)
1818
PATTERN_IDC = re.compile(r'[\u2ff0-\u2fff\u303e\u31ef]')
@@ -29,7 +29,7 @@ def contains_ascii(s: str):
2929
with open('韻書/廣韻.csv') as f:
3030
assert (
3131
next(f).rstrip('\n')
32-
== '小韻號,小韻字號,韻目原貌,音韻地位,反切,字頭原貌,字頭,字頭說明,釋義,釋義參照'
32+
== '小韻號,小韻字號,韻目原貌,音韻地位,反切,字頭,字頭說明,釋義,釋義參照'
3333
)
3434
for line in f:
3535
(
@@ -38,7 +38,6 @@ def contains_ascii(s: str):
3838
韻目原貌,
3939
音韻地位描述,
4040
反切,
41-
字頭原貌,
4241
字頭,
4342
字頭說明,
4443
釋義,
@@ -50,13 +49,9 @@ def contains_ascii(s: str):
5049
f'invalid 小韻字號: {小韻字號}'
5150
)
5251
assert len(韻目原貌) == 1, f'invalid 韻目原𩩕: {韻目原貌}'
53-
assert 字頭原貌 != 字頭, f'字頭原貌 same as 字頭: {字頭}'
54-
for field, in (('字頭原貌', 字頭原貌), ('字頭', 字頭)):
55-
if not :
56-
continue
57-
assert != '-' and (len() == 1 or PATTERN_IDC.match()), (
58-
f'invalid {field}: {}'
59-
)
52+
assert re.fullmatch(r'{.+}|[.+]|.+〈.+〉|[^{}[]〈〉]+', 字頭), (
53+
f'invalid 字頭: {字頭}'
54+
)
6055

6156
assert PATTERN_描述.fullmatch(音韻地位描述) is not None, (
6257
f'invalid 音韻地位: {音韻地位描述}'

src/patches.csv

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,38 @@
11
原書小韻號,小韻字號,原字頭,校正字頭,原釋義,校正釋義,原釋義參照,校正釋義參照,字頭說明,備注
22
11,2,,融,,上同,,上,,左下為「𢆉」形,poem表因未入U而缺
33
130,4,襹,~,襹毛羽衣皃,𧞬襹毛羽衣皃,,,,poem表承「宋本廣韻データ」用 PUA 字元 U+EE42,當為 U+277AC「𧞬」
4-
141,1,𤿎,[~/𢻹],,,,,,《形聲考》校
4+
141,1,𤿎,~〈𢻹〉,,,,,,《形聲考》校
55
177,6,,𢊿,,上同,,上,,poem表作「⿸广⿳𥫗⺫攵」,因未入U而缺
6-
213,12,婔,[~/-],,,,,「婓」(滂三C微平)之或體,周祖謨校
7-
231,13,㶛,[~/-],,,,,本紐重出當刪,周祖謨校
8-
318,1,攜,~,提也離也又姓出何氏姓苑戶圭切二十四三,提也離也又姓出何氏姓苑{戶圭}切二十[四/三],,,,poem表釋義誤錄
6+
213,12,婔,{~},,,,,「婓」(滂三C微平)之或體,周祖謨校
7+
231,13,㶛,{~},,,,,本紐重出當刪,周祖謨校
8+
318,1,攜,~,提也離也又姓出何氏姓苑戶圭切二十四三,提也離也又姓出何氏姓苑`戶圭`切二十四〈三〉,,,,poem表釋義誤錄
99
511,3,𩁚,𩁢,,,,,,poem表兩字形顛倒
1010
511,4,𩁢,𩁚,,,,,,poem表兩字形顛倒
11-
597,1,𤜼,[~/-],,,,,「犳」(章開三陽入)之訛字,無效小韻
12-
646,1,𡰝,[~/𡰖],,,,,,《形聲考》校
11+
597,1,𤜼,{~},,,,,「犳」(章開三陽入)之訛字,無效小韻
12+
646,1,𡰝,~〈𡰖〉,,,,,,《形聲考》校
1313
949,9,䔖,~,,,,上,,poem表遺漏釋義補充
14-
961,1a1,,,,女字,,,,周祖謨補
15-
1380,2,皷,[~/鼓],說文曰郭也春分之音萬物郭皮甲而出故謂之皷周禮六皷靁皷靈皷路皷鼖皷鼛皷晉皷亦作鼔,說文曰郭也春分之音萬物郭皮甲而出故謂之鼓周禮六鼓靁鼓靈鼓路鼓鼖鼓鼛鼓晉鼓亦作𡔷,,,,周祖謨校;poem表此二字認同有誤
14+
961,1a1,,[嬹],,女字,,,,周祖謨補
15+
1380,2,皷,~〈鼓〉,說文曰郭也春分之音萬物郭皮甲而出故謂之皷周禮六皷靁皷靈皷路皷鼖皷鼛皷晉皷亦作鼔,說文曰郭也春分之音萬物郭皮甲而出故謂之鼓周禮六鼓靁鼓靈鼓路鼓鼖鼓鼛鼓晉鼓亦作𡔷,,,,周祖謨校;poem表此二字認同有誤
1616
1380,3,鼓,鼔,說文曰擊皷也,說文曰擊鼓也,,,,poem表此二字認同有誤
1717
1619,2,𤣗,~,𤣗,,,下,,poem表釋義誤錄
1818
1883,2,,⿱𱡘正,,俗,,上,,poem表因未入U而缺
19-
1929,11,𦱙,[~/莥],蔨實亦作𦶆,蔨實亦作[莥/𦱙],,,,周祖謨校
20-
2021,1,㶒,[~/-],,,,,「㴸」之音,「㶒」(書開三侵上)之字,無效小韻
21-
2046,17,𣄉,[~/𣃳],掩光又於葉切,[掩光/掩也]又於葉切,,,,周祖謨校;《形聲考》云蓋為「掩」之變
22-
2369,1,計,~,籌計說文會也筭也又姓後漢有計子古詣切十二,籌計說文會也筭也又姓後漢有計子勳{古詣切}十二,,,,poem表承「宋本廣韻データ」用 PUA 字元 U+ECE0,當為 U+52F3「勳」
19+
1929,11,𦱙,~〈莥〉,蔨實亦作𦶆,蔨實亦作莥〈𦱙〉,,,,周祖謨校
20+
2021,1,㶒,{~},,,,,「㴸」之音,「㶒」(書開三侵上)之字,無效小韻
21+
2046,17,𣄉,~〈𣃳〉,掩光又於葉切,掩光〈也〉又於葉切,,,,周祖謨校;《形聲考》云蓋為「掩」之變
22+
2369,1,計,~,籌計說文會也筭也又姓後漢有計子古詣切十二,籌計說文會也筭也又姓後漢有計子勳`古詣`切十二,,,,poem表承「宋本廣韻データ」用 PUA 字元 U+ECE0,當為 U+52F3「勳」
2323
2533,2,概,槩,,,,,,取消poem表「部件換位」調整,與整理反切之用字相統一
24-
2988,1,盛,~,多也長也又姓後漢西羌傳有北海太守盛苞其先姓奭避元帝諱改姓盛承正切又音成三,多也長也又姓後漢西羌傳有北[海/地]太守盛苞其先姓奭避元帝諱改姓盛{承正}切又音成三,,,,周祖謨校
25-
2991,1,𣢝,[~/欦],,,,,,《形聲考》校
26-
3113,1,馾,~,冠幘一曰馬步近前丁紺切三,[冠幘一曰馬步近前/馬睡皃]{丁紺}切[三/四],,,,周祖謨校
27-
3113,1a1,,,,冠幘近前,,,,周祖謨補
24+
2988,1,盛,~,多也長也又姓後漢西羌傳有北海太守盛苞其先姓奭避元帝諱改姓盛承正切又音成三,多也長也又姓後漢西羌傳有北海〈地〉太守盛苞其先姓奭避元帝諱改姓盛`承正`切又音成三,,,,周祖謨校
25+
2991,1,𣢝,~〈欦〉,,,,,,《形聲考》校
26+
3113,1,馾,~,冠幘一曰馬步近前丁紺切三,{冠幘一曰馬步近前}[馬睡皃]`丁紺`切三〈四〉,,,,周祖謨校
27+
3113,1a1,,[帎],,冠幘近前,,,,周祖謨補
2828
3276,15,,⿰隺犬,,至也高也,,,,poem表因未入U而缺
2929
3291,7,,⿱芖雨,,俗,,上,,poem表作「⿱共雨」,因未入U而缺
3030
3292,2,鷝,鵯,,,,,,poem表誤校;此實為偽字,保持原樣即可
31-
3373,1,𣅝,[~/-],,,,,「突」之音,「𠬛」(明一魂入)之訛字,無效小韻
32-
3389,4,紇,~,絲下也又孔子父名又虜複姓三氏北齊開府紇奚永樂又有紇干氏紇骨氏又虜三字姓後魏有賊師紇豆陵伊利又胡結切,絲下也又孔子父名又虜複姓三氏北齊開府紇奚永樂又有紇[干/于]氏紇骨氏又虜三字姓後魏有賊師紇豆陵伊利又胡結切,,,,周祖謨校;poem表承「宋本廣韻データ」誤錄
33-
3390,1,搰,~,掘地也戶骨切于十,掘地也{戶骨}切[十一/十],,,,周祖謨校;poem表承「宋本廣韻データ」誤錄
31+
3373,1,𣅝,{~},,,,,「突」之音,「𠬛」(明一魂入)之訛字,無效小韻
32+
3389,4,紇,~,絲下也又孔子父名又虜複姓三氏北齊開府紇奚永樂又有紇干氏紇骨氏又虜三字姓後魏有賊師紇豆陵伊利又胡結切,絲下也又孔子父名又虜複姓三氏北齊開府紇奚永樂又有紇干〈于〉氏紇骨氏又虜三字姓後魏有賊師紇豆陵伊利又胡結切,,,,周祖謨校;poem表承「宋本廣韻データ」誤錄
33+
3390,1,搰,~,掘地也戶骨切于十,掘地也`戶骨`切十{一},,,,周祖謨校;poem表承「宋本廣韻データ」誤錄
3434
3390,4,𦗣,~,耳黧,耳聲,,,,周祖謨校;《形聲考》云此字當為戶骨⦅兀⦆<瓦>切為「𦖍」(下瓦反)字之訛
35-
3390,5,𦖼,[~/-],,,,,,周祖謨校
36-
3829,5,𣄉,[~/𣃳],掩光名掩也,[掩光名/掩也],,,,周祖謨校;《形聲考》云蓋為「掩」之變
37-
3829,6,厭,~,厭伏亦惡夢又於琰切六,厭伏亦惡夢又於琰切[六/-],,,,周祖謨校;poem表誤錄
38-
3830,6,貼,~,以物之質錢,以物[之/-]質錢,,,,周祖謨校;poem表誤錄
35+
3390,5,𦖼,{~},,,,,,周祖謨校
36+
3829,5,𣄉,~〈𣃳〉,掩光名掩也,掩{光名}[也],,,,周祖謨校;《形聲考》云蓋為「掩」之變
37+
3829,6,厭,~,厭伏亦惡夢又於琰切六,厭伏亦惡夢又於琰切{六},,,,周祖謨校;poem表誤錄
38+
3830,6,貼,~,以物之質錢,以物{之}質錢,,,,周祖謨校;poem表誤錄

0 commit comments

Comments
 (0)