# Source: PukiWiki page "青空文庫XHTMLファイルをライブラリを使って整形する"
# ("Formatting Aozora Bunko XHTML files using libraries")
#
# Package installation notes from the original page:
#   lxml           : XML parser
#       pip install lxml        /  sudo apt install python3-lxml
#   Beautiful Soup : extracts/parses data from HTML and XML files
#       pip install beautifulsoup4  /  sudo apt install python3-bs4
#
# axhtml2xml.py

import glob
import json
import re
import sys
from xml.dom.minidom import parseString
from xml.etree.ElementTree import Element, SubElement, tostring

from bs4 import BeautifulSoup

# JIS kuten-code -> character mapping used to replace Aozora "gaiji" images.
# NOTE: loaded at import time; a missing gaiji_map.json aborts the script.
with open("gaiji_map.json", "r", encoding="utf-8") as f:
    GAIJI_MAP = json.load(f)


def replace_ruby_tags(soup):
    """Temporarily rewrite <ruby> tags to the Aozora-style |base《reading》 text.

    The plain-text form survives BeautifulSoup's get_text() flattening in
    process_body(); restore_ruby_tags() later turns it into <r> markup.

    Returns the (mutated) soup for call chaining.
    """
    for ruby in soup.find_all("ruby"):
        rb_tag = ruby.find("rb")
        rt_tag = ruby.find("rt")
        rb = rb_tag.text if rb_tag else ""
        rt = rt_tag.text if rt_tag else ""
        ruby.replace_with(f"|{rb}《{rt}》")
    return soup


def process_body(soup):
    """Split the body text into sentences and build the <text> XML tree.

    Sentences that open with 「 are grouped under a <speech> element until a
    closing 」 is seen; every sentence becomes an <s> element.  The Aozora
    boilerplate divs (card / bibliographical_information / notation_notes)
    are removed from the flow; the latter two are appended as trailer
    elements carrying their text in a "text" attribute.

    Raises ValueError when the document has no <body>.
    """
    root = Element("text")
    body = soup.body
    if not body:
        raise ValueError("No <body> tag found in the input file.")

    # Extract (and drop from the text flow) the specific <div> blocks.
    bibliographical_info = None
    notation_notes = None
    for div in body.find_all("div"):
        if div.get("id") == "card":
            # Remove <div id="card"> entirely.
            div.decompose()
        elif div.get("class") and "bibliographical_information" in div["class"]:
            # Keep the text of <div class="bibliographical_information">.
            bibliographical_info = div.get_text(strip=True)
            div.decompose()
        elif div.get("class") and "notation_notes" in div["class"]:
            # Keep the text of <div class="notation_notes">.
            notation_notes = div.get_text(strip=True)
            div.decompose()

    # Flatten the remaining markup, then split into sentences after 。 or 」.
    text_content = body.get_text("\n", strip=True)
    sentences = re.split(r"(?<=。|」)", text_content)

    current_speech = None
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        if sentence.startswith("「"):
            # Start of a quoted-speech run.
            if current_speech is None:
                current_speech = SubElement(root, "speech")
            s_element = SubElement(current_speech, "s")
            s_element.text = sentence
            if sentence.endswith("。」") or sentence.endswith("」"):
                # Speech closed within the same sentence.
                current_speech = None
        elif current_speech is not None:
            # Continuation of an open speech run.
            s_element = SubElement(current_speech, "s")
            s_element.text = sentence
            if sentence.endswith("。」") or sentence.endswith("」"):
                # End of the speech run.
                current_speech = None
        else:
            # Ordinary narrative sentence.
            s_element = SubElement(root, "s")
            s_element.text = sentence

    # Append bibliographical_information and notation_notes at document end.
    if bibliographical_info:
        biblio_element = SubElement(root, "bibliographical_information")
        biblio_element.set("text", bibliographical_info)
    if notation_notes:
        notes_element = SubElement(root, "notation_notes")
        notes_element.set("text", notation_notes)
    return root


def restore_ruby_tags(root):
    """Convert the temporary |base《reading》 notation back into <r> markup.

    The raw markup is stored in element.text, so tostring() will escape the
    angle brackets; convert_file() undoes that escaping afterwards.
    """
    for element in root.iter():
        if element.text:
            # BUG FIX: the leading "|" must be escaped.  Unescaped, it is
            # regex alternation with the empty pattern, which matches at
            # every position and corrupts the whole text with empty
            # <r rt=""></r> substitutions.
            element.text = re.sub(
                r"\|(.+?)《(.+?)》", r'<r rt="\2">\1</r>', element.text
            )
            # Drop stray newlines left over from get_text("\n", ...).
            element.text = re.sub(r"\n+", "", element.text)


def replace_gaiji_tags_simple(soup):
    """Replace <img class="gaiji"> tags with the character they depict.

    Returns the (mutated) soup for call chaining.
    """
    for img in soup.find_all("img", class_="gaiji"):
        src_path = img.get("src", "")
        # Extract the JIS kuten position from src (e.g. "2-01-79").
        match = re.search(r"(\d{1,2}-\d{2}-\d{2})", src_path)
        if match:
            jis_code = match.group(1)
            # Look up the corresponding character in the mapping.
            character = GAIJI_MAP.get(jis_code, "")
            if character:
                # Swap the <img> tag for the character itself.
                img.replace_with(character)
    return soup


def fix_split_quotation_marks(xml_string):
    """Merge a closing 。」 or 。) that was split into its own <s> element."""
    # Fold <s>」</s> (or 」/)) back onto the end of the preceding <s>.
    return re.sub(r"</s>\s*<s>([。」)])</s>", r"\1</s>", xml_string)


def convert_file(input_file, output_file):
    """Convert one Aozora XHTML file to the project's custom XML format.

    The output_file argument is kept for backward compatibility, but the
    path actually written is derived from the input name plus the document
    title.  That real path is returned so callers can report it accurately.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # Document title -> sampleID and output-file suffix.
    title = soup.title.string.strip() if soup.title else "Unknown Title"
    base_output_file = re.sub(r"\.x?html?$", "", input_file)
    output_file = f"{base_output_file}_{title}.xml"

    soup = replace_gaiji_tags_simple(soup)  # gaiji <img> -> characters
    soup = replace_ruby_tags(soup)          # <ruby> -> |base《reading》
    root = process_body(soup)               # build the XML tree
    root.attrib["sampleID"] = title
    restore_ruby_tags(root)                 # |base《reading》 -> <r> markup

    xml_string = tostring(root, encoding="unicode")
    pretty_xml = parseString(xml_string).toprettyxml(indent="  ")
    # Undo the escaping of the <r> markup injected by restore_ruby_tags().
    # (The wiki copy of this page had decoded these entities into no-ops;
    # restored per the original "resolve entity references" intent.)
    pretty_xml = (
        pretty_xml.replace("&lt;", "<").replace("&gt;", ">").replace("&quot;", '"')
    )
    # Repair 。」 / 。) fragments split off by the sentence splitter.
    fixed_xml = fix_split_quotation_marks(pretty_xml)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(fixed_xml)
    return output_file


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python convert.py <input_xhtml_files>")
        sys.exit(1)
    file_pattern = sys.argv[1]
    files = glob.glob(file_pattern)
    if not files:
        print(f"No files matched the pattern: {file_pattern}")
        sys.exit(1)
    for input_file in files:
        output_file = re.sub(r"\.x?html?$", ".xml", input_file)
        try:
            # convert_file() derives its own title-suffixed path; report the
            # file that was actually written, not the pre-computed guess.
            written = convert_file(input_file, output_file)
            print(f"Converted file saved to {written or output_file}")
        except Exception as e:
            print(f"Error processing {input_file}: {e}")