# 代码
# 读取docx中的文本代码示例
import docx
import os
import win32com.client as wc
import time
# Walk a folder tree and collect the Word documents in it.
def walkFile(file, file_type=""):
    """Recursively collect the Word documents found under a folder.

    Args:
        file: Path of the root folder to walk (despite the name, this is a
            directory, not a single file).
        file_type: ``"docx"`` to return only ``.docx`` files, ``"doc"`` to
            return only ``.doc`` files; any other value returns both.

    Returns:
        A list of paths, each built as ``os.path.join(root, name)``. When
        both kinds are requested the ``.doc`` paths come first, followed by
        the ``.docx`` paths.
    """
    docx_list = []
    doc_list = []
    # os.walk yields, for every folder it visits: the folder path, the names
    # of its sub-directories and the names of its files. Sub-directories need
    # no explicit handling here because os.walk descends into them itself.
    for root, _dirs, files in os.walk(file):
        for name in files:
            # Classify by the real extension of the file name. A substring
            # test on the joined path would also match things like
            # "report.docx.bak" or any file below a folder named "x.docs".
            ext = os.path.splitext(name)[1].lower()
            if ext == ".docx":
                docx_list.append(os.path.join(root, name))
            elif ext == ".doc":
                doc_list.append(os.path.join(root, name))
    if file_type == "docx":
        return docx_list
    if file_type == "doc":
        return doc_list
    return doc_list + docx_list
# Save a .doc file as a .docx file.
def DocToDocx(file_doc, file_docx):
    """Convert a legacy .doc file to .docx by automating Microsoft Word.

    Starts Word through COM, opens ``file_doc``, saves it to ``file_docx``
    and shuts Word down again. The document is closed and Word is quit even
    when opening or saving raises, so a failed conversion does not leave a
    Word process behind; the exception itself still propagates.

    Args:
        file_doc: Path of the source .doc file.
        file_docx: Path of the .docx file to write.

    Note:
        According to the original author, Word only finds the file when it
        is given a complete absolute path written with backslashes; relative
        paths and forward slashes did not work for them.
    """
    word = wc.Dispatch("Word.Application")
    try:
        word.Visible = 1  # run Word in the foreground (visible window)
        doc = word.Documents.Open(file_doc)
        try:
            # FileFormat 12 is wdFormatXMLDocument (.docx) in Word's
            # WdSaveFormat enumeration. The remaining positional arguments
            # are passed exactly as before; per the Word object model they
            # are LockComments, Password, AddToRecentFiles, WritePassword,
            # ReadOnlyRecommended, EmbedTrueTypeFonts,
            # SaveNativePictureFormat and SaveFormsData.
            doc.SaveAs(file_docx, 12, False, "", True, "", False, False, False, False)
        finally:
            doc.Close()
    finally:
        word.Quit()
# Read the paragraph texts out of a .docx document.
def Open_Docx(file_docx):
    """Return the paragraph texts of a .docx file as a list of strings.

    Paragraphs whose text is exactly one non-breaking space (U+00A0) are
    skipped; every other paragraph text is kept in document order.

    Any exception raised while opening or reading the document is caught
    and reported with ``print``; the texts collected up to that point are
    returned (an empty list if the document could not be opened at all).
    """
    collected = []
    try:
        document = docx.Document(file_docx)
        for paragraph in document.paragraphs:
            text = paragraph.text
            if text == "\xa0":
                continue
            collected.append(text)
    except Exception as err:
        print('未知错误 %s' % (err))
    return collected
# Save the collected text into a txt file.
def write_txt(path, file):
    """Append the strings in ``file`` to the text file at ``path``.

    Args:
        path: Destination file. It is opened in append mode with UTF-8
            encoding, so existing content is kept and new lines are added
            after it. Missing parent folders are created first.
        file: Iterable of strings to write (despite the name, this is the
            content, not a file object). A newline is added after each one.
    """
    # Writing into a folder that does not exist yet would raise
    # FileNotFoundError, so make sure the parent directory is there. The
    # dirname is empty for a bare file name, which needs no directory.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "a", encoding="utf-8") as f:
        f.writelines([data + "\n" for data in file])
if __name__ == "__main__":
    # Root folder that holds the Word documents to process.
    path = "D:\\BaiduNetdiskDownload\\文案系列大全"

    # Optional step (disabled): convert every .doc below `path` to .docx
    # through Word before extracting text.
    # file_doc = walkFile(path, "doc")
    # for doc in file_doc:
    #     print(doc + " 正在保存成docx")
    #     DocToDocx(doc.replace("\\",r"\\"), doc.replace("\\",r"\\") + "x")
    #     time.sleep(3)

    # Alternative (disabled): write each txt next to its source .docx.
    # file_docx = walkFile(path, "docx")
    # for fdocx in file_docx:
    #     print(fdocx + " 正在保存成txt")
    #     file = Open_Docx(fdocx)
    #     if file:
    #         write_txt(fdocx.replace("docx", "txt"), file)

    # Active step: dump the text of every .docx into <path>/txt/.
    # The output folder is derived from `path` instead of repeating the
    # drive path in a second literal (the old literal also contained the
    # invalid escape sequence "\B"), and it is created up front because
    # nothing else in the script creates it.
    out_dir = os.path.join(path, "txt")
    os.makedirs(out_dir, exist_ok=True)

    file_docx = walkFile(path, "docx")
    flen = len(file_docx)  # countdown number used as the file-name prefix
    for fdocx in file_docx:
        print(fdocx + " 正在保存成txt")
        file = Open_Docx(fdocx)
        if file:
            # Swap only the extension; replacing every "docx" substring
            # would also rewrite a "docx" that is part of the file name.
            stem = os.path.splitext(os.path.basename(fdocx))[0]
            write_txt(os.path.join(out_dir, str(flen) + "--" + stem + ".txt"), file)
        # NOTE(review): the original indentation was lost, so it is unclear
        # whether this decrement belonged inside the `if`; it is kept at
        # loop level so the prefix tracks the position in the list. Confirm.
        flen -= 1
# 最后于 2022-10-17 被 admin 编辑,原因: