"""Split dictionary entries (one per input line) into typed blocks and write a CSV.

Each non-empty line of the input file is treated as one entry.  The entry is
decomposed into headword blocks, sense-number markers, pronunciation markers,
quoted examples, several kinds of cross-reference markers, ordinary
definition sentences and a trailing fragment.  One CSV row is written per
entry, with one column per block type.
"""
import csv
import re

# Input and output paths.
input_path = r"E:\HDC\test0829a_merged.txt"
output_path = r"E:\HDC\test0829a_blocks_analysis3.csv"

# Block types, in CSV column order (the first CSV column is the raw entry).
BLOCK_TYPES = (
    '词头节块', '例证节块', '异写标示节块', '主条标示节块', '缩写标示节块',
    '异称标示节块', '简称标示节块', '同源标示节块', '详条标示节块',
    '他条标示节块', '异文标示节块', '义序标示节块', '读音标示节块',
    '畸零节块', '释文节块',
)
CSV_HEADERS = ['原词条', *BLOCK_TYPES]

# Separator used when several blocks of one type are merged into one cell.
_JOINER = ' || '

# Headword block: 【...】
_HEADWORD_RE = re.compile(r'【[^】]+】')

# Sense-number markers, level 1 to level 3.
# NOTE(review): the duplicated '(' / ')' inside the character classes below
# (and in _PRONUNCIATION_RE) may originally have been an ASCII + full-width
# pair -- confirm against the data; the patterns are kept exactly as found.
_SENSE_MARKER_RES = (
    re.compile(r'[㊀㊁㊂㊃㊄㊅㊆㊇㊈㊉]'),
    re.compile(r'[❶❷❸❹❺❻❼❽❾❿⓫⓬⓭⓮⓯⓰⓱⓲⓳⓴①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳]'),
    re.compile(r'[((][1-9][0-9]*[))]'),
)

# Pronunciation marker: a parenthesised group containing at least one Latin
# letter or tone-marked pinyin vowel.
_PRONUNCIATION_RE = re.compile(
    r'[((][^))]*[a-zA-Z\u00E0\u00E1\u01CE\u0101\u00E8\u00E9\u011B\u0113\u00EC\u00ED\u01D0\u012B\u00F2\u00F3\u01D2\u014D\u00F9\u00FA\u01D4\u016B\u01D6\u01D8\u01DA\u01DC\u00FC]+[^))]*[))]'
)

# Example block: text without a full stop, a colon, then a quoted passage.
_EXAMPLE_RE = re.compile(r'[^。]*?:["“][^"”]+["”]')

# Ordered rules for classifying one sentence (text between full stops).
# The whole sentence must match (fullmatch); the first matching rule wins.
_MARKER_RULES = (
    (re.compile(r'亦作["“][^"”]+["”](?:、["“][^"”]+["”])*'), '异写标示节块'),
    (re.compile(r'见["“][^"”]+["”]'), '主条标示节块'),
    (re.compile(r'亦省作["“][^"”]+["”](?:、["“][^"”]+["”])*'), '缩写标示节块'),
    (re.compile(r'(?:亦称|也称)["“][^"”]+["”](?:、["“][^"”]+["”])*'), '异称标示节块'),
    (re.compile(r'亦省称["“][^"”]+["”](?:、["“][^"”]+["”])*'), '简称标示节块'),
    (re.compile(r'同["“][^"”]+["”]'), '同源标示节块'),
    (re.compile(r'详["“][^"”]+["”](?:、["“][^"”]+["”])*'), '详条标示节块'),
    (re.compile(r'参见["“][^"”]+["”](?:、["“][^"”]+["”])*'), '他条标示节块'),
    (re.compile(r'.*一本作["“][^"”]+["”]'), '异文标示节块'),
)


def get_quote_pairs(line):
    """Return the (open_index, close_index) pairs of all quotes in *line*.

    Chinese curly quotes and ASCII double quotes are tracked separately:
    a closing '”' pairs with the nearest still-open '“', while an ASCII '"'
    closes the quote on top of the stack only if that is also an ASCII '"',
    and otherwise opens a new one.  Unmatched quotes produce no pair.
    Pairs are returned in the order their closing quote is encountered.
    """
    pairs = []
    stack = []  # entries are (quote_char, index) for quotes still open
    for i, ch in enumerate(line):
        if ch == '“':
            stack.append(('“', i))
        elif ch == '”':
            # Search backwards for the nearest open Chinese quote; any ASCII
            # quotes opened after it stay on the stack.
            for j in range(len(stack) - 1, -1, -1):
                if stack[j][0] == '“':
                    start = stack.pop(j)[1]
                    pairs.append((start, i))
                    break
        elif ch == '"':
            # ASCII quotes have no open/close distinction: strict LIFO.
            if stack and stack[-1][0] == '"':
                start = stack.pop()[1]
                pairs.append((start, i))
            else:
                stack.append(('"', i))
    return pairs


def is_inside_quotes(start, end, quote_pairs):
    """Return True if the span [start, end) lies within any pair in *quote_pairs*.

    A span counts as inside when ``start >= q_start`` and ``end <= q_end``
    for some ``(q_start, q_end)`` pair.
    """
    return any(
        start >= q_start and end <= q_end
        for q_start, q_end in quote_pairs
    )


def _merge_spans(spans):
    """Return *spans* sorted, with overlapping or touching spans merged."""
    merged = []
    for start, end in sorted(spans):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged


def _classify_sentence(sentence):
    """Return the block type of one sentence; '释文节块' if no marker rule matches."""
    for pattern, block_type in _MARKER_RULES:
        if pattern.fullmatch(sentence):
            return block_type
    return '释文节块'


def extract_blocks_from_line(line):
    """Extract every block from one dictionary entry.

    Args:
        line: the text of one entry.

    Returns:
        A dict with one key per name in ``BLOCK_TYPES``.  Each value is a
        string: the blocks of that type joined with ' || ', or '' when the
        entry has none.
    """
    collected = {block_type: [] for block_type in BLOCK_TYPES}
    spans = []  # exact positions of every structural marker found
    quote_pairs = get_quote_pairs(line)

    # 1. Headword blocks.
    for match in _HEADWORD_RE.finditer(line):
        collected['词头节块'].append(match.group())
        spans.append(match.span())

    # 2. Sense-number markers (three levels); markers inside quotes are
    #    ignored.  They are reported in order of position.
    sense_markers = []
    for pattern in _SENSE_MARKER_RES:
        for match in pattern.finditer(line):
            if not is_inside_quotes(match.start(), match.end(), quote_pairs):
                sense_markers.append((match.start(), match.group()))
                spans.append(match.span())
    sense_markers.sort(key=lambda item: item[0])
    collected['义序标示节块'] = [text for _, text in sense_markers]

    # 3. Pronunciation markers; those inside quotes are ignored.
    for match in _PRONUNCIATION_RE.finditer(line):
        if not is_inside_quotes(match.start(), match.end(), quote_pairs):
            collected['读音标示节块'].append(match.group())
            spans.append(match.span())

    # 4-5. The text between the recognised markers forms the segments.
    #      Slicing the gaps directly (instead of substituting a sentinel
    #      string and splitting on it) keeps an entry that happens to contain
    #      the sentinel text from being split in the wrong place.
    segments = []
    cursor = 0
    for start, end in _merge_spans(spans):
        segments.append(line[cursor:start])
        cursor = end
    segments.append(line[cursor:])
    segments = [piece.strip() for piece in segments if piece.strip()]

    # 6. Process each segment independently.
    for segment in segments:
        # 6.1 Example blocks.
        example_matches = list(_EXAMPLE_RE.finditer(segment))
        for match in example_matches:
            example = match.group().strip()
            if example:
                collected['例证节块'].append(example)

        # Blank the examples out with spaces so offsets stay valid and the
        # full stops inside quoted passages do not split sentences.
        masked = segment
        for match in reversed(example_matches):
            start, end = match.span()
            masked = masked[:start] + ' ' * (end - start) + masked[end:]

        # 6.2 Split what is left into sentences at the full stop.
        sentences = masked.split('。')

        # 6.3 A non-empty tail has no closing full stop: trailing fragment.
        if sentences and sentences[-1].strip():
            collected['畸零节块'].append(sentences[-1].strip())
            sentences = sentences[:-1]

        # 6.4 Classify each complete sentence, restoring its full stop.
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            collected[_classify_sentence(sentence)].append(sentence + '。')

    # 7. Merge blocks of the same type into a single cell.
    return {
        block_type: _JOINER.join(items)
        for block_type, items in collected.items()
    }


def main():
    """Read entries from ``input_path`` and write the block table to ``output_path``."""
    # 'utf-8-sig' drops a leading BOM if the file has one (otherwise it reads
    # exactly like 'utf-8'), so '\ufeff' never ends up in the first entry.
    with open(input_path, "r", encoding="utf-8-sig") as infile, \
            open(output_path, "w", encoding="utf-8", newline="", errors='replace') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(CSV_HEADERS)

        for line_num, line in enumerate(infile, 1):
            line = line.strip()
            if not line:
                continue
            try:
                blocks = extract_blocks_from_line(line)
                writer.writerow([line] + [blocks[header] for header in CSV_HEADERS[1:]])
            except Exception as e:
                # Per-entry boundary: report the failure and keep the entry in
                # the output with empty block columns instead of aborting.
                print(f"处理第 {line_num} 行时出错: {e}")
                writer.writerow([line] + [''] * (len(CSV_HEADERS) - 1))

    print("✅ 改进版节块提取完成,结果已保存至:", output_path)
    print("📋 提示:用WPS表格打开时请选择'UTF-8'编码")


if __name__ == "__main__":
    main()