|
- import os
- from docx import Document
- import win32com.client # 仅用于处理.doc文件(需Windows环境)
- import pythoncom
- import glob
def docx_to_txt(docx_path, txt_path):
    """Convert a .docx file to a UTF-8 plain-text file.

    The text of every entry in the document's ``paragraphs`` collection is
    joined with newlines and written to ``txt_path``.

    Args:
        docx_path: Path of the .docx file to read.
        txt_path: Path of the .txt file to create or overwrite.

    Returns:
        True when the text file was written, False when any exception
        occurred (the error is printed, not raised).
    """
    try:
        paragraphs = Document(docx_path).paragraphs
        content = '\n'.join(paragraph.text for paragraph in paragraphs)
        with open(txt_path, 'w', encoding='utf-8') as out_file:
            out_file.write(content)
    except Exception as err:
        # Best-effort conversion: report the failure and let the caller go on.
        print(f"处理 {docx_path} 时出错: {str(err)}")
        return False
    return True
def doc_to_txt(doc_path, txt_path):
    """Convert a legacy .doc file to a .txt file by automating Microsoft Word.

    Requires pywin32 and an installed copy of Word, so it only works on
    Windows.

    Args:
        doc_path: Path of the .doc file to read.
        txt_path: Path of the .txt file to create or overwrite.

    Returns:
        True when Word saved the text file, False when any exception
        occurred (the error is printed, not raised).
    """
    # Word runs as a separate process, so relative paths would not be
    # resolved against this script's working directory; pass absolute paths.
    abs_doc_path = os.path.abspath(doc_path)
    abs_txt_path = os.path.abspath(txt_path)
    try:
        pythoncom.CoInitialize()  # initialise COM for the current thread
        try:
            word = win32com.client.Dispatch("Word.Application")
            try:
                doc = word.Documents.Open(abs_doc_path)
                try:
                    # FileFormat=2 asks Word to save as plain text.
                    # NOTE(review): the output encoding is chosen by Word and
                    # is not set here - confirm it matches the UTF-8 used for
                    # .docx output.
                    doc.SaveAs(abs_txt_path, FileFormat=2)
                finally:
                    # Close the document even when saving failed.
                    doc.Close()
            finally:
                # Always shut Word down so a failed conversion does not
                # leave a hidden Word process running.
                word.Quit()
        finally:
            pythoncom.CoUninitialize()
        return True
    except Exception as e:
        # Best-effort conversion: report the failure and let the caller go on.
        print(f"处理 {doc_path} 时出错: {str(e)}")
        return False
def _iter_conversion_jobs(input_folder, output_folder, pattern):
    """Yield (source_path, txt_path) for each real document matching pattern."""
    # Escape the folder so characters such as "[" or "*" in its name are
    # matched literally instead of being treated as glob wildcards.
    search = os.path.join(glob.escape(input_folder), pattern)
    for source_path in glob.glob(search):
        file_name = os.path.basename(source_path)
        # Word keeps "~$..." owner/lock files next to documents that are
        # open; they match the pattern but are not convertible documents.
        if file_name.startswith("~$"):
            continue
        base_name = os.path.splitext(file_name)[0]
        yield source_path, os.path.join(output_folder, f"{base_name}.txt")


def convert_folder(input_folder, output_folder):
    """Convert every Word document directly inside ``input_folder`` to text.

    ``*.docx`` files are always processed; ``*.doc`` files are processed only
    when running on Windows (``os.name == 'nt'``). Each output file is named
    after its source with the extension replaced by ``.txt``. A line is
    printed for every successful conversion.

    NOTE: ``name.doc`` and ``name.docx`` both map to ``name.txt``; the one
    converted last overwrites the other.

    Args:
        input_folder: Folder containing the Word documents (not recursive).
        output_folder: Folder for the .txt files; created if missing.
    """
    # Make sure the output directory exists.
    os.makedirs(output_folder, exist_ok=True)

    # Handle .docx files.
    for docx_file, txt_file in _iter_conversion_jobs(
            input_folder, output_folder, "*.docx"):
        if docx_to_txt(docx_file, txt_file):
            print(f"转换成功: {docx_file} -> {txt_file}")

    # Handle .doc files (only effective on Windows).
    if os.name == 'nt':
        for doc_file, txt_file in _iter_conversion_jobs(
                input_folder, output_folder, "*.doc"):
            if doc_to_txt(doc_file, txt_file):
                print(f"转换成功: {doc_file} -> {txt_file}")
if __name__ == "__main__":
    import sys

    # Usage: script.py [input_dir] [output_dir]
    # Both arguments are optional; the defaults are the original locations.
    # Input folder path (holds the Word documents).
    input_dir = (
        sys.argv[1] if len(sys.argv) > 1
        else r"C:\Users\Administrator\Documents\001"
    )
    # Output folder path (receives the TXT files).
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "output_txt"

    convert_folder(input_dir, output_dir)

    # If .doc files exist but we are not on Windows, they were skipped:
    # tell the user what is needed to convert them.
    if os.name != 'nt' and glob.glob(os.path.join(input_dir, "*.doc")):
        print("\n发现.doc文件,请注意:")
        print("1. .doc转换需要Windows操作系统并安装Microsoft Word")
        print("2. 需要安装pywin32库:pip install pywin32")
        print("3. 非Windows用户建议手动将.doc文件另存为.docx格式")
复制代码
需要安装两个库:python-docx 和 pywin32(pip install python-docx pywin32)。
|
|