Description
This script extracts unique words from an EPUB file that match a specific regex pattern and saves them to a CSV file. It performs the following steps:
- Extract Text from EPUB:
- The script opens the EPUB file as a ZIP archive and iterates through its contents.
- It processes only `.html` and `.xhtml` files, extracting text using BeautifulSoup.
- A regex pattern is applied to identify words starting with “dé”/“de” or “ré”/“re”, capturing variations with accented characters.
- The extracted words are stored in a set to ensure uniqueness.
- Save to CSV:
- The unique words are sorted alphabetically and written to a CSV file with a header.
- Execution:
- The script runs with predefined file names (`your-file.epub` as input and `output.csv` as output).
- After execution, it prints the total count of unique words saved.
This script is useful for linguistic analysis, specifically identifying words with certain prefixes in an EPUB file.
Code
import csv
import re
import unicodedata
import zipfile

from bs4 import BeautifulSoup
# Words that begin with "dé"/"de" or "ré"/"re" and continue with at least one
# more letter. Meant to be used together with re.IGNORECASE.
# The trailing character class spells out the two accented-letter ranges on
# either side of "÷" (U+00F7), which a single à-ÿ range would wrongly accept
# as a letter, and adds the French ligature "œ", which lies outside that range,
# so that words such as "désœuvré" are matched instead of being dropped.
regex_pattern = r"\b(?:d[ée]|r[ée])[a-zA-Zà-öø-ÿœ]+\b"
def extract_epub_text(epub_path):
    """Collect the unique words in an EPUB that match ``regex_pattern``.

    Despite its name, this function does not return the book's text: it
    returns only the words matched by the module-level ``regex_pattern``
    (applied case-insensitively) in the archive's HTML content files.

    Args:
        epub_path: Path to the EPUB file, which is opened as a ZIP archive.

    Returns:
        A set of the distinct matched words, spelled as they appear in the
        text. Case is preserved, so "Défaire" and "défaire" are two entries.

    Raises:
        Nothing is caught here: errors from opening or reading the archive
        propagate to the caller.
    """
    html_suffixes = (".html", ".xhtml", ".htm")
    # Compile once instead of handing the pattern string to re for every file.
    matcher = re.compile(regex_pattern, re.IGNORECASE)
    unique_words = set()

    with zipfile.ZipFile(epub_path, "r") as epub:
        for file_name in epub.namelist():
            # Compare in lower case so members such as "CH01.HTML" are kept.
            if not file_name.lower().endswith(html_suffixes):
                continue
            with epub.open(file_name) as f:
                soup = BeautifulSoup(f.read(), "html.parser")
            # Compose accents first: text stored as "e" + a combining acute
            # accent would otherwise never match the precomposed "é" in the
            # pattern.
            text = unicodedata.normalize("NFC", soup.get_text())
            unique_words.update(matcher.findall(text))

    return unique_words
def _alphabetical_key(word):
    """Return a sort key that orders *word* by base letters, ignoring case and accents."""
    decomposed = unicodedata.normalize("NFD", word.casefold())
    base = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Ligatures have no decomposition, so expand them by hand.
    base = base.replace("œ", "oe").replace("æ", "ae")
    # The original spelling breaks ties so the order is deterministic.
    return base, word


def save_to_csv(words, output_file):
    """Write the given words to a one-column CSV file in alphabetical order.

    The file starts with a "Word" header row, followed by one row per word.
    Words are ordered by their base letters rather than by raw code point, so
    capitalised and accented words sit where a reader expects them (for
    example "réel" before "rester").

    Args:
        words: Iterable of word strings to write.
        output_file: Path of the CSV file to create or overwrite (UTF-8).
    """
    # newline='' lets the csv module control line endings itself.
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Word"])  # Header
        writer.writerows([word] for word in sorted(words, key=_alphabetical_key))
# Default input and output file names used when the script is run directly.
epub_file = "your-file.epub"
output_csv = "output.csv"


def main(epub_path=epub_file, csv_path=output_csv):
    """Extract the matching words from *epub_path* and save them to *csv_path*.

    Prints a one-line summary with the number of unique words saved.

    Args:
        epub_path: EPUB file to read; defaults to ``epub_file``.
        csv_path: CSV file to write; defaults to ``output_csv``.
    """
    found_words = extract_epub_text(epub_path)
    save_to_csv(found_words, csv_path)
    print(f"Extraction complete! {len(found_words)} unique words saved to {csv_path}.")


# Run only when executed as a script, so that importing this module to reuse
# its functions does not trigger an extraction.
if __name__ == "__main__":
    main()