授業資料/PythonでXML
をテンプレートにして作成
開始行:
**青空文庫XHTMLファイルをライブラリを使って整形する [#i5e...
***パッケージのインストール [#td55405d]
-lxml :XMLパーザ(parser)(XMLの構造を解釈してプログラムか...
// pip install lxml
sudo apt install python3-lxml
-Beautiful Soup :HTMLやXMLファイルからデータを抽出・解析...
// pip install beautifulsoup4
sudo apt install python3-bs4
//--mecabをpythonから使うモジュール
// pip install mecab
***axhtml2xml.py [#r757012b]
import re
import sys
import glob
from bs4 import BeautifulSoup
from xml.etree.ElementTree import Element, SubElement, t...
from xml.dom.minidom import parseString
import json
# Module-level load of the gaiji (out-of-JIS glyph) mapping: JIS kuten code
# string (e.g. "2-01-79") -> replacement character.
# NOTE(review): runs at import time; a missing gaiji_map.json aborts the script.
with open("gaiji_map.json", "r", encoding="utf-8") as f:
GAIJI_MAP = json.load(f)
def replace_ruby_tags(soup):
"""Temporarily rewrite <ruby> tags as Aozora-style |kanji《reading》 text."""
for ruby in soup.find_all("ruby"):
# NOTE(review): the two lines below are truncated in this dump; they
# appear to take the <rb>/<rt> text when present — confirm upstream.
rb = ruby.find("rb").text if ruby.find("rb") els...
rt = ruby.find("rt").text if ruby.find("rt") els...
# Replace the whole <ruby> element with the plain-text placeholder.
ruby.replace_with(f"|{rb}《{rt}》")
return soup
def process_body(soup):
"""Split the body text into sentences and build the output XML tree.

Returns a <text> root whose children are <s> sentence elements, with
consecutive quoted sentences grouped under a <speech> element.
NOTE(review): several lines are truncated in this dump (trailing "...").
"""
root = Element("text")
body = soup.body
if not body:
raise ValueError("No <body> tag found in the inp...
# Extract/strip the special <div> elements before flattening the text.
bibliographical_info = None
notation_notes = None
for div in body.find_all("div"):
if div.get("id") == "card":
# Drop the <div id="card"> block entirely.
div.decompose()
elif div.get("class") and "bibliographical_infor...
# Save the bibliographical_information text, then remove the div.
bibliographical_info = div.get_text(strip=Tr...
div.decompose()
elif div.get("class") and "notation_notes" in di...
# Save the notation_notes text, then remove the div.
notation_notes = div.get_text(strip=True)
div.decompose()
# Flatten the remaining markup to plain text, one node per line.
text_content = body.get_text("\n", strip=True)
# Split after each sentence-final 。 or closing 」 (lookbehind keeps the mark).
sentences = re.split(r"(?<=。|」)", text_content)
current_speech = None
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
if sentence.startswith("「"):
# Opening quote: start a <speech> group unless one is already open.
if current_speech is None:
current_speech = SubElement(root, "speec...
s_element = SubElement(current_speech, "s")
s_element.text = sentence
if sentence.endswith("。」") or sentence.end...
# Quote closed within this sentence: end the speech group.
current_speech = None
elif current_speech is not None:
# Inside an open quotation: keep appending to the speech group.
s_element = SubElement(current_speech, "s")
s_element.text = sentence
if sentence.endswith("。」") or sentence.end...
# Closing mark reached: end the speech group.
current_speech = None
else:
# Ordinary narrative sentence, attached directly to the root.
s_element = SubElement(root, "s")
s_element.text = sentence
# Attach bibliographical_information and notation_notes (if any) at the end.
if bibliographical_info:
biblio_element = SubElement(root, "bibliographic...
biblio_element.set("text", bibliographical_info)
if notation_notes:
notes_element = SubElement(root, "notation_notes")
notes_element.set("text", notation_notes)
return root
def restore_ruby_tags(root):
"""Convert the temporary |kanji《reading》 markers back to tag form (in place)."""
for element in root.iter():
if element.text:
# Restore the temporary ruby placeholders.
# NOTE(review): the replacement string is truncated in this dump.
# If the leading bar in the pattern is ASCII "|" it is an unescaped
# alternation (matches the empty string everywhere) — confirm it is
# the fullwidth | used by replace_ruby_tags.
element.text = re.sub(r"|(.+?)《(.+?)》", r...
# Strip newlines left over from get_text("\n") flattening.
element.text = re.sub(r"\n+", "", element.te...
def replace_gaiji_tags_simple(soup):
"""Replace Aozora <img class="gaiji"> tags with the mapped character."""
for img in soup.find_all("img", class_="gaiji"):
src_path = img.get("src", "")
# Extract the JIS kuten position from the src path (e.g. "2-01-79").
match = re.search(r"(\d{1,2}-\d{2}-\d{2})", src_...
if match:
jis_code = match.group(1)
# Look up the replacement character; "" means "not in the map".
character = GAIJI_MAP.get(jis_code, "")
if character:
# Swap the <img> tag for the plain character.
# NOTE(review): unmapped gaiji keep their <img> tag as-is.
img.replace_with(character)
return soup
def fix_split_quotation_marks(xml_string):
"""Merge a stray closing '。」' / '。)' that was split into its own <s>."""
# Re-join a lone closing-mark <s> element with the preceding <s> element.
# NOTE(review): the replacement string is truncated in this dump — confirm.
fixed_xml = re.sub(r"</s>\s*<s>([。」)])</s>", r"\1...
return fixed_xml
def convert_file(input_file, output_file):
"""Convert one Aozora Bunko XHTML file into the sentence-split XML format."""
with open(input_file, "r", encoding="utf-8") as f:
soup = BeautifulSoup(f, "html.parser")
# Take the document title (used for the sampleID and the output name).
title = soup.title.string.strip() if soup.title else...
# Rebuild the output name from the input name plus the title.
# NOTE(review): this overwrites the output_file argument passed by the caller.
base_output_file = re.sub(r"\.x?html?$", "", input_f...
output_file = f"{base_output_file}_{title}.xml"
# Replace <img> gaiji tags with their mapped characters.
soup = replace_gaiji_tags_simple(soup)
# Temporarily rewrite ruby tags as |kanji《reading》 text.
soup = replace_ruby_tags(soup)
# Build the XML tree from the body text.
root = process_body(soup)
# Record the title as the sampleID attribute on the root.
root.attrib["sampleID"] = title
# Turn the temporary ruby markers back into tag form.
restore_ruby_tags(root)
# Serialize and pretty-print the XML.
xml_string = tostring(root, encoding="unicode")
pretty_xml = parseString(xml_string).toprettyxml(ind...
# Resolve entity references.
# NOTE(review): as shown this replace is a no-op ("<" -> "<"); the wiki
# dump likely decoded "&lt;"/"&gt;" entities — confirm the original source.
pretty_xml = pretty_xml.replace("<", "<").replace...
# Re-join closing quotation marks that were split into their own <s>.
fixed_xml = fix_split_quotation_marks(pretty_xml)
with open(output_file, "w", encoding="utf-8") as f:
f.write(fixed_xml)
# Script entry point: expand the glob pattern and convert each matched file.
if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: python convert.py <input_xhtml_fil...
sys.exit(1)
file_pattern = sys.argv[1]
files = glob.glob(file_pattern)
if not files:
print(f"No files matched the pattern: {file_patt...
sys.exit(1)
for input_file in files:
# Default output name; convert_file re-derives the real one from the title.
output_file = re.sub(r"\.x?html?$", ".xml", inpu...
try:
convert_file(input_file, output_file)
print(f"Converted file saved to {output_file...
except Exception as e:
# Report the failure and continue with the remaining files.
print(f"Error processing {input_file}: {e}")
終了行:
**青空文庫XHTMLファイルをライブラリを使って整形する [#i5e...
***パッケージのインストール [#td55405d]
-lxml :XMLパーザ(parser)(XMLの構造を解釈してプログラムか...
// pip install lxml
sudo apt install python3-lxml
-Beautiful Soup :HTMLやXMLファイルからデータを抽出・解析...
// pip install beautifulsoup4
sudo apt install python3-bs4
//--mecabをpythonから使うモジュール
// pip install mecab
***axhtml2xml.py [#r757012b]
import re
import sys
import glob
from bs4 import BeautifulSoup
from xml.etree.ElementTree import Element, SubElement, t...
from xml.dom.minidom import parseString
import json
# Module-level load of the gaiji (out-of-JIS glyph) mapping: JIS kuten code
# string (e.g. "2-01-79") -> replacement character.
# NOTE(review): runs at import time; a missing gaiji_map.json aborts the script.
with open("gaiji_map.json", "r", encoding="utf-8") as f:
GAIJI_MAP = json.load(f)
def replace_ruby_tags(soup):
"""Temporarily rewrite <ruby> tags as Aozora-style |kanji《reading》 text."""
for ruby in soup.find_all("ruby"):
# NOTE(review): the two lines below are truncated in this dump; they
# appear to take the <rb>/<rt> text when present — confirm upstream.
rb = ruby.find("rb").text if ruby.find("rb") els...
rt = ruby.find("rt").text if ruby.find("rt") els...
# Replace the whole <ruby> element with the plain-text placeholder.
ruby.replace_with(f"|{rb}《{rt}》")
return soup
def process_body(soup):
"""Split the body text into sentences and build the output XML tree.

Returns a <text> root whose children are <s> sentence elements, with
consecutive quoted sentences grouped under a <speech> element.
NOTE(review): several lines are truncated in this dump (trailing "...").
"""
root = Element("text")
body = soup.body
if not body:
raise ValueError("No <body> tag found in the inp...
# Extract/strip the special <div> elements before flattening the text.
bibliographical_info = None
notation_notes = None
for div in body.find_all("div"):
if div.get("id") == "card":
# Drop the <div id="card"> block entirely.
div.decompose()
elif div.get("class") and "bibliographical_infor...
# Save the bibliographical_information text, then remove the div.
bibliographical_info = div.get_text(strip=Tr...
div.decompose()
elif div.get("class") and "notation_notes" in di...
# Save the notation_notes text, then remove the div.
notation_notes = div.get_text(strip=True)
div.decompose()
# Flatten the remaining markup to plain text, one node per line.
text_content = body.get_text("\n", strip=True)
# Split after each sentence-final 。 or closing 」 (lookbehind keeps the mark).
sentences = re.split(r"(?<=。|」)", text_content)
current_speech = None
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
if sentence.startswith("「"):
# Opening quote: start a <speech> group unless one is already open.
if current_speech is None:
current_speech = SubElement(root, "speec...
s_element = SubElement(current_speech, "s")
s_element.text = sentence
if sentence.endswith("。」") or sentence.end...
# Quote closed within this sentence: end the speech group.
current_speech = None
elif current_speech is not None:
# Inside an open quotation: keep appending to the speech group.
s_element = SubElement(current_speech, "s")
s_element.text = sentence
if sentence.endswith("。」") or sentence.end...
# Closing mark reached: end the speech group.
current_speech = None
else:
# Ordinary narrative sentence, attached directly to the root.
s_element = SubElement(root, "s")
s_element.text = sentence
# Attach bibliographical_information and notation_notes (if any) at the end.
if bibliographical_info:
biblio_element = SubElement(root, "bibliographic...
biblio_element.set("text", bibliographical_info)
if notation_notes:
notes_element = SubElement(root, "notation_notes")
notes_element.set("text", notation_notes)
return root
def restore_ruby_tags(root):
"""Convert the temporary |kanji《reading》 markers back to tag form (in place)."""
for element in root.iter():
if element.text:
# Restore the temporary ruby placeholders.
# NOTE(review): the replacement string is truncated in this dump.
# If the leading bar in the pattern is ASCII "|" it is an unescaped
# alternation (matches the empty string everywhere) — confirm it is
# the fullwidth | used by replace_ruby_tags.
element.text = re.sub(r"|(.+?)《(.+?)》", r...
# Strip newlines left over from get_text("\n") flattening.
element.text = re.sub(r"\n+", "", element.te...
def replace_gaiji_tags_simple(soup):
"""Replace Aozora <img class="gaiji"> tags with the mapped character."""
for img in soup.find_all("img", class_="gaiji"):
src_path = img.get("src", "")
# Extract the JIS kuten position from the src path (e.g. "2-01-79").
match = re.search(r"(\d{1,2}-\d{2}-\d{2})", src_...
if match:
jis_code = match.group(1)
# Look up the replacement character; "" means "not in the map".
character = GAIJI_MAP.get(jis_code, "")
if character:
# Swap the <img> tag for the plain character.
# NOTE(review): unmapped gaiji keep their <img> tag as-is.
img.replace_with(character)
return soup
def fix_split_quotation_marks(xml_string):
"""Merge a stray closing '。」' / '。)' that was split into its own <s>."""
# Re-join a lone closing-mark <s> element with the preceding <s> element.
# NOTE(review): the replacement string is truncated in this dump — confirm.
fixed_xml = re.sub(r"</s>\s*<s>([。」)])</s>", r"\1...
return fixed_xml
def convert_file(input_file, output_file):
"""Convert one Aozora Bunko XHTML file into the sentence-split XML format."""
with open(input_file, "r", encoding="utf-8") as f:
soup = BeautifulSoup(f, "html.parser")
# Take the document title (used for the sampleID and the output name).
title = soup.title.string.strip() if soup.title else...
# Rebuild the output name from the input name plus the title.
# NOTE(review): this overwrites the output_file argument passed by the caller.
base_output_file = re.sub(r"\.x?html?$", "", input_f...
output_file = f"{base_output_file}_{title}.xml"
# Replace <img> gaiji tags with their mapped characters.
soup = replace_gaiji_tags_simple(soup)
# Temporarily rewrite ruby tags as |kanji《reading》 text.
soup = replace_ruby_tags(soup)
# Build the XML tree from the body text.
root = process_body(soup)
# Record the title as the sampleID attribute on the root.
root.attrib["sampleID"] = title
# Turn the temporary ruby markers back into tag form.
restore_ruby_tags(root)
# Serialize and pretty-print the XML.
xml_string = tostring(root, encoding="unicode")
pretty_xml = parseString(xml_string).toprettyxml(ind...
# Resolve entity references.
# NOTE(review): as shown this replace is a no-op ("<" -> "<"); the wiki
# dump likely decoded "&lt;"/"&gt;" entities — confirm the original source.
pretty_xml = pretty_xml.replace("<", "<").replace...
# Re-join closing quotation marks that were split into their own <s>.
fixed_xml = fix_split_quotation_marks(pretty_xml)
with open(output_file, "w", encoding="utf-8") as f:
f.write(fixed_xml)
# Script entry point: expand the glob pattern and convert each matched file.
if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: python convert.py <input_xhtml_fil...
sys.exit(1)
file_pattern = sys.argv[1]
files = glob.glob(file_pattern)
if not files:
print(f"No files matched the pattern: {file_patt...
sys.exit(1)
for input_file in files:
# Default output name; convert_file re-derives the real one from the title.
output_file = re.sub(r"\.x?html?$", ".xml", inpu...
try:
convert_file(input_file, output_file)
print(f"Converted file saved to {output_file...
except Exception as e:
# Report the failure and continue with the remaining files.
print(f"Error processing {input_file}: {e}")
ページ名: