Python将doc保存成docx和txt-技术-朕弟分享

Python将doc保存成docx和txt Python

admin 发布于 2022-10-17

代码

# 读取docx中的文本代码示例
import docx
import os
import win32com.client as wc
import time

# Walk a folder tree and collect the Word documents in it
def walkFile(file, file_type=""):
    """Recursively collect the .doc / .docx files found under a folder.

    A file is classified by the extension of its own file name only
    (compared case-sensitively). A ".doc" that merely appears elsewhere in
    the path -- a folder called "old.docx", a file called "a.doc.bak", a
    ".docm" template -- therefore does not produce a match.

    Args:
        file: Root folder to walk; it is handed to ``os.walk``.
        file_type: ``"docx"`` returns only the .docx paths, ``"doc"`` returns
            only the .doc paths; any other value returns the .doc paths
            followed by the .docx paths.

    Returns:
        A list of paths, each one the visited folder joined with the file
        name. The list is empty when nothing matches.
    """
    docx_list = []
    doc_list = []
    # os.walk yields (current folder, sub-folder names, file names).
    for root, _dirs, files in os.walk(file):
        for f in files:
            # Look at the real extension instead of searching the whole path.
            ext = os.path.splitext(f)[1]
            if ext == ".docx":
                docx_list.append(os.path.join(root, f))
            elif ext == ".doc":
                doc_list.append(os.path.join(root, f))
    if file_type == "docx":
        return docx_list
    elif file_type == "doc":
        return doc_list
    else:
        return doc_list + docx_list


# Save a .doc file as a .docx file
def DocToDocx(file_doc, file_docx):
    """Convert a legacy .doc file to .docx by driving Microsoft Word over COM.

    Word is started through ``wc.Dispatch("Word.Application")``, made visible,
    asked to open ``file_doc`` and to save it as ``file_docx`` with file
    format code 12. The document is closed and Word is quit even when opening
    or saving fails, so a failed conversion does not leave a Word process
    behind.

    Args:
        file_doc: Path of the source .doc file.
        file_docx: Path the converted .docx file is written to.

    Raises:
        Whatever the COM layer raises when Word cannot be started, the file
        cannot be opened, or the save fails; the error is re-raised after
        the cleanup has run.
    """
    # The original author's note: Word only finds the file when it is given a
    # full absolute path written with backslashes. abspath() resolves a
    # relative path and normalises the separators for the current platform.
    src_path = os.path.abspath(file_doc)
    dst_path = os.path.abspath(file_docx)

    word = wc.Dispatch("Word.Application")
    try:
        word.Visible = 1        # run in the foreground, window shown
        doc = word.Documents.Open(src_path)
        try:
            doc.SaveAs(dst_path, 12, False, "", True, "", False, False, False, False)
        finally:
            doc.Close()
    finally:
        word.Quit()


# Read the paragraph texts out of a .docx document
def Open_Docx(file_docx):
    """Return the paragraph texts of a .docx file as a list of strings.

    The file is opened with ``docx.Document`` and its paragraphs are visited
    in order. A paragraph whose text is exactly one non-breaking space
    (U+00A0) is left out; every other paragraph text is kept as is.

    Any exception raised while opening or reading is caught and reported with
    ``print``; the texts gathered up to that point are still returned, which
    means an unreadable file yields an empty list.

    Args:
        file_docx: Path of the .docx file to read.

    Returns:
        The list of collected paragraph texts.
    """
    collected = []
    try:
        document = docx.Document(file_docx)
        for paragraph in document.paragraphs:
            text = paragraph.text
            if text == "\xa0":
                # Skip paragraphs that hold only a non-breaking space.
                continue
            collected.append(text)
    except Exception as err:
        print('未知错误 %s' % (err))
    return collected


# Append lines of text to a .txt file
def write_txt(path, file):
    """Append every string in *file* to the text file at *path*, one per line.

    The file is opened in append mode with UTF-8 encoding, so calling this
    again with the same *path* adds to the existing content instead of
    replacing it. The folder that should contain *path* is created first when
    it does not exist yet, so writing into a fresh output folder does not
    fail with ``FileNotFoundError``.

    Args:
        path: Destination file path.
        file: Iterable of strings; a newline is added after each one.
    """
    parent = os.path.dirname(path)
    if parent:
        # A bare file name has no parent component; nothing to create then.
        os.makedirs(parent, exist_ok=True)
    with open(path, "a", encoding="utf-8") as f:
        f.writelines([data + "\n" for data in file])

if __name__ == "__main__":
    # Script entry point: export the text of every .docx under `path` into
    # numbered .txt files inside a "txt" sub-folder of `path`.
    path = "D:\\BaiduNetdiskDownload\\文案系列大全"

    # Step 1 (disabled): convert the legacy .doc files to .docx first.
    # Uncomment to run it once before the export below.
    # file_doc = walkFile(path, "doc")
    # for doc in file_doc:
    #     print(doc + " 正在保存成docx")
    #     DocToDocx(doc.replace("\\",r"\\"), doc.replace("\\",r"\\") + "x")
    #     time.sleep(3)
    #
    # An earlier (disabled) variant of step 2 wrote each .txt next to its
    # .docx; the loop below writes into the dedicated output folder instead.

    # Step 2: export the text of every .docx file.
    out_dir = os.path.join(path, "txt")
    # Create the output folder up front so the first write cannot fail on it.
    os.makedirs(out_dir, exist_ok=True)

    file_docx = walkFile(path, "docx")
    # Files are numbered counting down from the total; the counter also
    # advances for files that produced no text.
    flen = len(file_docx)
    for fdocx in file_docx:
        print(fdocx + " 正在保存成txt")
        file = Open_Docx(fdocx)
        if file:
            # Swap only the extension; replacing every "docx" in the name
            # would also mangle names that contain that text elsewhere.
            stem = os.path.splitext(os.path.basename(fdocx))[0]
            write_txt(os.path.join(out_dir, str(flen) + "--" + stem + ".txt"), file)
        flen -= 1

最后于 2022-10-17 被admin编辑，原因：