# Source: PukiWiki page "青空文庫XHTMLファイルをライブラリを使って整形する"
# ("Formatting Aozora Bunko XHTML files using libraries")
#
# Package installation notes from the original page:
#   lxml           : XML parser
#       pip install lxml        /  sudo apt install python3-lxml
#   Beautiful Soup : extracts/parses data from HTML and XML files
#       pip install beautifulsoup4  /  sudo apt install python3-bs4
#
# axhtml2xml.py

import glob
import json
import re
import sys
from xml.dom.minidom import parseString
from xml.etree.ElementTree import Element, SubElement, tostring

from bs4 import BeautifulSoup

# JIS kuten-code -> character mapping used to replace Aozora "gaiji" images.
# NOTE: loaded at import time; a missing gaiji_map.json aborts the script.
with open("gaiji_map.json", "r", encoding="utf-8") as f:
    GAIJI_MAP = json.load(f)


def replace_ruby_tags(soup):
    """Temporarily rewrite <ruby> tags to the Aozora-style |base《reading》 text.

    The plain-text form survives BeautifulSoup's get_text() flattening in
    process_body(); restore_ruby_tags() later turns it into <r> markup.

    Returns the (mutated) soup for call chaining.
    """
    for ruby in soup.find_all("ruby"):
        rb_tag = ruby.find("rb")
        rt_tag = ruby.find("rt")
        rb = rb_tag.text if rb_tag else ""
        rt = rt_tag.text if rt_tag else ""
        ruby.replace_with(f"|{rb}《{rt}》")
    return soup


def process_body(soup):
    """Split the body text into sentences and build the <text> XML tree.

    Sentences that open with 「 are grouped under a <speech> element until a
    closing 」 is seen; every sentence becomes an <s> element.  The Aozora
    boilerplate divs (card / bibliographical_information / notation_notes)
    are removed from the flow; the latter two are appended as trailer
    elements carrying their text in a "text" attribute.

    Raises ValueError when the document has no <body>.
    """
    root = Element("text")
    body = soup.body
    if not body:
        raise ValueError("No <body> tag found in the input file.")

    # Extract (and drop from the text flow) the specific <div> blocks.
    bibliographical_info = None
    notation_notes = None
    for div in body.find_all("div"):
        if div.get("id") == "card":
            # Remove <div id="card"> entirely.
            div.decompose()
        elif div.get("class") and "bibliographical_information" in div["class"]:
            # Keep the text of <div class="bibliographical_information">.
            bibliographical_info = div.get_text(strip=True)
            div.decompose()
        elif div.get("class") and "notation_notes" in div["class"]:
            # Keep the text of <div class="notation_notes">.
            notation_notes = div.get_text(strip=True)
            div.decompose()

    # Flatten the remaining markup, then split into sentences after 。 or 」.
    text_content = body.get_text("\n", strip=True)
    sentences = re.split(r"(?<=。|」)", text_content)

    current_speech = None
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        if sentence.startswith("「"):
            # Start of a quoted-speech run.
            if current_speech is None:
                current_speech = SubElement(root, "speech")
            s_element = SubElement(current_speech, "s")
            s_element.text = sentence
            if sentence.endswith("。」") or sentence.endswith("」"):
                # Speech closed within the same sentence.
                current_speech = None
        elif current_speech is not None:
            # Continuation of an open speech run.
            s_element = SubElement(current_speech, "s")
            s_element.text = sentence
            if sentence.endswith("。」") or sentence.endswith("」"):
                # End of the speech run.
                current_speech = None
        else:
            # Ordinary narrative sentence.
            s_element = SubElement(root, "s")
            s_element.text = sentence

    # Append bibliographical_information and notation_notes at document end.
    if bibliographical_info:
        biblio_element = SubElement(root, "bibliographical_information")
        biblio_element.set("text", bibliographical_info)
    if notation_notes:
        notes_element = SubElement(root, "notation_notes")
        notes_element.set("text", notation_notes)
    return root


def restore_ruby_tags(root):
    """Convert the temporary |base《reading》 notation back into <r> markup.

    The raw markup is stored in element.text, so tostring() will escape the
    angle brackets; convert_file() undoes that escaping afterwards.
    """
    for element in root.iter():
        if element.text:
            # BUG FIX: the leading "|" must be escaped.  Unescaped, it is
            # regex alternation with the empty pattern, which matches at
            # every position and corrupts the whole text with empty
            # <r rt=""></r> substitutions.
            element.text = re.sub(
                r"\|(.+?)《(.+?)》", r'<r rt="\2">\1</r>', element.text
            )
            # Drop stray newlines left over from get_text("\n", ...).
            element.text = re.sub(r"\n+", "", element.text)


def replace_gaiji_tags_simple(soup):
    """Replace <img class="gaiji"> tags with the character they depict.

    Returns the (mutated) soup for call chaining.
    """
    for img in soup.find_all("img", class_="gaiji"):
        src_path = img.get("src", "")
        # Extract the JIS kuten position from src (e.g. "2-01-79").
        match = re.search(r"(\d{1,2}-\d{2}-\d{2})", src_path)
        if match:
            jis_code = match.group(1)
            # Look up the corresponding character in the mapping.
            character = GAIJI_MAP.get(jis_code, "")
            if character:
                # Swap the <img> tag for the character itself.
                img.replace_with(character)
    return soup


def fix_split_quotation_marks(xml_string):
    """Merge a closing 。」 or 。) that was split into its own <s> element."""
    # Fold <s>」</s> (or 」/)) back onto the end of the preceding <s>.
    return re.sub(r"</s>\s*<s>([。」)])</s>", r"\1</s>", xml_string)


def convert_file(input_file, output_file):
    """Convert one Aozora XHTML file to the project's custom XML format.

    The output_file argument is kept for backward compatibility, but the
    path actually written is derived from the input name plus the document
    title.  That real path is returned so callers can report it accurately.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # Document title -> sampleID and output-file suffix.
    title = soup.title.string.strip() if soup.title else "Unknown Title"
    base_output_file = re.sub(r"\.x?html?$", "", input_file)
    output_file = f"{base_output_file}_{title}.xml"

    soup = replace_gaiji_tags_simple(soup)  # gaiji <img> -> characters
    soup = replace_ruby_tags(soup)          # <ruby> -> |base《reading》
    root = process_body(soup)               # build the XML tree
    root.attrib["sampleID"] = title
    restore_ruby_tags(root)                 # |base《reading》 -> <r> markup

    xml_string = tostring(root, encoding="unicode")
    pretty_xml = parseString(xml_string).toprettyxml(indent="  ")
    # Undo the escaping of the <r> markup injected by restore_ruby_tags().
    # (The wiki copy of this page had decoded these entities into no-ops;
    # restored per the original "resolve entity references" intent.)
    pretty_xml = (
        pretty_xml.replace("&lt;", "<").replace("&gt;", ">").replace("&quot;", '"')
    )
    # Repair 。」 / 。) fragments split off by the sentence splitter.
    fixed_xml = fix_split_quotation_marks(pretty_xml)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(fixed_xml)
    return output_file


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python convert.py <input_xhtml_files>")
        sys.exit(1)
    file_pattern = sys.argv[1]
    files = glob.glob(file_pattern)
    if not files:
        print(f"No files matched the pattern: {file_pattern}")
        sys.exit(1)
    for input_file in files:
        output_file = re.sub(r"\.x?html?$", ".xml", input_file)
        try:
            # convert_file() derives its own title-suffixed path; report the
            # file that was actually written, not the pre-computed guess.
            written = convert_file(input_file, output_file)
            print(f"Converted file saved to {written or output_file}")
        except Exception as e:
            print(f"Error processing {input_file}: {e}")