Convert doc, docx, pdf, image, xlsx

Convert doc, docx, pdf, image, xlsx

1. Convert doc, docx to pdf

import os
import comtypes.client

# Convert a Word document (.doc or .docx) to PDF by automating Microsoft Word.
wdFormatPDF = 17  # value passed as FileFormat to make Word write a PDF

# Absolute paths: they are handed over to the Word application, not opened here.
in_file = os.path.abspath('file.doc')  # source document
out_file = os.path.abspath('file.pdf')  # PDF to create

word = comtypes.client.CreateObject('Word.Application')
try:
    doc = word.Documents.Open(in_file)
    try:
        doc.SaveAs(out_file, FileFormat=wdFormatPDF)
    finally:
        doc.Close()
finally:
    # Always shut Word down; otherwise a failed Open/SaveAs would leave the
    # Word instance created above running.
    word.Quit()

2. Convert doc to docx

from win32com import client as wc
import os

# Convert a legacy .doc file to .docx by automating Microsoft Word.
wdFormatDocumentDefault = 16  # WdSaveFormat value used for the .docx output

w = wc.Dispatch('Word.Application')
file = 'C:\\Users\\Anna\\PycharmProjects\\pythonProject\\test.doc'  # source document
try:
    doc = w.Documents.Open(file)
    try:
        # New file = same path with "x" appended to the extension (.doc -> .docx).
        doc.SaveAs(file + "x", wdFormatDocumentDefault)
    finally:
        doc.Close()
finally:
    # Quit Word even when opening or saving failed, so no Word instance is
    # left behind.
    w.Quit()

3. Convert docx to pdf

from docx2pdf import convert

# Source Word document and the PDF that docx2pdf should produce from it.
docx_file, pdf_file = 'input.docx', 'output.pdf'

convert(docx_file, pdf_file)

4. Convert pdf to docx

from pdf2docx import parse

# Source PDF and the Word document that pdf2docx should produce from it.
pdf_file, docx_file = 'sample.pdf', 'sample.docx'

parse(pdf_file, docx_file)

5. Convert pdf to image

import fitz
import os

# Render every page of a PDF to a PNG file inside the "images" folder.
file_path = "example.pdf"  # source PDF

# The context manager closes the document once all pages are rendered.
with fitz.open(file_path) as doc:
    # Create the output folder once, before the loop, instead of re-checking
    # it for every page; exist_ok avoids the exists()/mkdir() race.
    os.makedirs("images", exist_ok=True)
    for i, page in enumerate(doc):
        pix = page.get_pixmap(dpi=200)  # rasterize the page at 200 dpi
        pix.save(f"images/page_{i}.png")  # one PNG per page, numbered from 0

6. Convert image to pdf

from PIL import Image
from glob import glob
import os
import re


def _natural_key(path):
    """Return a sort key that orders embedded numbers numerically.

    Plain string sorting puts "page_10.png" before "page_2.png", which would
    scramble the page order of any document with more than ten pages.
    ``re.split`` with a capturing group always yields text and digit chunks
    alternately, so the resulting lists compare element by element without
    mixing ``str`` and ``int``.
    """
    return [int(chunk) if chunk.isdigit() else chunk.lower()
            for chunk in re.split(r"(\d+)", path)]


files = glob("images/*.png")  # every PNG in the images folder
print(f"{files=}")
files = sorted(files, key=_natural_key)

iml = []
for img in files:
    # Close each source file as soon as its pixels have been copied.
    with Image.open(img) as imgs:
        # Convert to RGB: the PDF writer cannot save RGBA images.
        iml.append(imgs.convert('RGB'))

if not iml:
    # Fail with a clear message instead of an IndexError further down.
    raise SystemExit("No PNG files found in the images folder")

pdf = "output.pdf"
print(iml)
# The first image starts the PDF; the remaining ones are appended as pages.
image = iml.pop(0)
image.save(pdf, "PDF", resolution=100.0, save_all=True, append_images=iml)
os.system(pdf)  # hand the PDF's file name to the shell to open the result

7. Convert xlsx to pdf

from win32com import client
import os

# Export the active sheet of an Excel workbook to PDF by automating Excel.
file_location = 'C:\\Users\\Anna\\PycharmProjects\\pythonProject\\test.xlsx'  # source workbook
xlTypePDF = 0  # XlFixedFormatType value selecting PDF output

app = client.DispatchEx("Excel.Application")
try:
    # Keep Excel hidden and non-interactive while the script drives it.
    app.Interactive = False
    app.Visible = False

    # "Open" with a capital O: early-bound (makepy-generated) COM wrappers
    # are case-sensitive and do not expose a lowercase "open".
    workbook = app.Workbooks.Open(file_location)
    try:
        # Output path = workbook path without its extension.
        output = os.path.splitext(file_location)[0]
        workbook.ActiveSheet.ExportAsFixedFormat(xlTypePDF, output)
    finally:
        # SaveChanges=False: the workbook was only read, so never save it and
        # never let a hidden Excel wait on a "save changes?" prompt.
        workbook.Close(False)
finally:
    # Without Quit() the Excel instance started above keeps running.
    app.Quit()

8. Ставим защиту от редактирования pdf - только чтение - read only

from hashlib import md5

from PyPDF4 import PdfFileReader, PdfFileWriter
from PyPDF4.generic import NameObject, DictionaryObject, ArrayObject, \
    NumberObject, ByteStringObject
from PyPDF4.pdf import _alg33, _alg34, _alg35
from PyPDF4.utils import b_


def encrypt(writer_obj: PdfFileWriter, user_pwd, owner_pwd=None, use_128bit=True):
    """
    Set up PDF Standard-handler encryption on a writer with fixed permissions.

    The permission flags are hard-coded to ``-44`` (copying and printing
    allowed, nothing else).  The function stores the document ID, the
    encryption dictionary and the encryption key on ``writer_obj`` through
    its private ``_ID``, ``_encrypt`` and ``_encrypt_key`` attributes.

    :param PdfFileWriter writer_obj: The writer to configure; modified in place.
    :param str user_pwd: The "user password", which allows for opening
        and reading the PDF file with the restrictions provided.
    :param str owner_pwd: The "owner password", which allows for
        opening the PDF files without any restrictions.  By default,
        the owner password is the same as the user password.
    :param bool use_128bit: flag as to whether to use 128bit
        encryption.  When false, 40bit encryption will be used.  By default,
        this flag is on.
    """
    import random
    import time

    if owner_pwd is None:
        owner_pwd = user_pwd

    # V, revision and key length (in bytes) for the chosen key size.
    if use_128bit:
        V, rev, keylen = 2, 3, 128 // 8
    else:
        V, rev, keylen = 1, 2, 40 // 8

    # permit copy and printing only:
    P = -44
    O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))

    # Two 16-byte file identifiers derived from the current time and a random
    # number; the first one also feeds the key computation below.
    ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest())
    ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest())
    writer_obj._ID = ArrayObject((ID_1, ID_2))

    if rev == 2:
        U, key = _alg34(user_pwd, O, P, ID_1)
    else:
        # rev can only be 2 or 3 here (assigned just above).
        U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False)

    # Named encrypt_dict so it no longer shadows this function's own name.
    encrypt_dict = DictionaryObject()
    encrypt_dict[NameObject("/Filter")] = NameObject("/Standard")
    encrypt_dict[NameObject("/V")] = NumberObject(V)
    if V == 2:
        encrypt_dict[NameObject("/Length")] = NumberObject(keylen * 8)
    encrypt_dict[NameObject("/R")] = NumberObject(rev)
    encrypt_dict[NameObject("/O")] = ByteStringObject(O)
    encrypt_dict[NameObject("/U")] = ByteStringObject(U)
    encrypt_dict[NameObject("/P")] = NumberObject(P)
    writer_obj._encrypt = writer_obj._addObject(encrypt_dict)
    writer_obj._encrypt_key = key


# Copy every page of Input.pdf into a fresh writer.
writer = PdfFileWriter()
unmeta = PdfFileReader('Input.pdf')
writer.appendPagesFromReader(unmeta)

# Empty user password (no password needed to open the file); '123' is the
# owner password.
encrypt(writer, user_pwd='', owner_pwd='123')

# Write the encrypted copy to a separate file.
with open('Output.pdf', 'wb') as out_stream:
    writer.write(out_stream)

Исходный код подсмотрел на этом сайте.

9. Пишем программу для получения отсканированного pdf из word или pdf

a) Создадим форму. В нее пользователь будет перетаскивать файлы, которые нужно конвертировать.

from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QWidget, QVBoxLayout, QListWidget


class MainWindow(QMainWindow):
    """Main window that lists the local path of every file dropped onto it."""

    def __init__(self):
        """Build the window: a caption, the file list and a file counter."""
        super().__init__()
        self.setWindowTitle('Drag & Drop')

        # Allow this window to receive drop events.
        self.setAcceptDrops(True)

        self.list_files = QListWidget()
        self.label_total_files = QLabel()

        # Stack the caption, the list and the counter vertically.
        column = QVBoxLayout()
        for widget in (
            QLabel('Drag and drop the file:'),
            self.list_files,
            self.label_total_files,
        ):
            column.addWidget(widget)

        container = QWidget()
        container.setLayout(column)
        self.setCentralWidget(container)

        self._update_states()

    def _update_states(self):
        """Refresh the counter label with the current number of list items."""
        total = self.list_files.count()
        self.label_total_files.setText('Files: {}'.format(total))

    def dragEnterEvent(self, event):
        """Accept the drag only when it carries URLs."""
        if not event.mimeData().hasUrls():
            return
        event.acceptProposedAction()

    def dropEvent(self, event):
        """Append the local path of each dropped URL to the list."""
        dropped_paths = [url.toLocalFile() for url in event.mimeData().urls()]
        self.list_files.addItems(dropped_paths)

        self._update_states()

        return super().dropEvent(event)


if __name__ == '__main__':
    # Create the application object before any widget.
    application = QApplication([])

    window = MainWindow()
    window.resize(500, 350)
    window.show()

    # Hand control over to the Qt event loop.
    application.exec()

Исходный код подсмотрел на этом сайте.

b) Добавим в эту форму код для конвертирования с сохранением результатов в папку исходника.

from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QWidget, QVBoxLayout, QListWidget, QLineEdit

import fitz

from PIL import Image
from glob import glob

import pathlib
import shutil

import docx2pdf
import os
import comtypes.client

# Protection
from hashlib import md5

from PyPDF4 import PdfFileReader, PdfFileWriter
from PyPDF4.generic import NameObject, DictionaryObject, ArrayObject, \
    NumberObject, ByteStringObject
from PyPDF4.pdf import _alg33, _alg34, _alg35
from PyPDF4.utils import b_

class MainWindow(QMainWindow):
    """Drag-and-drop window that turns dropped Word/PDF files into "scanned" PDFs.

    For every dropped ``.doc``, ``.docx`` or ``.pdf`` file the window:

    1. produces ``<name>_1.pdf`` next to the source (Word export or plain copy),
    2. renders each page of that PDF to a bitmap,
    3. rebuilds ``<name>_1.pdf`` from the bitmaps only, and
    4. encrypts it with an empty user password and the owner password typed
       into the text box.

    Pages are rendered in memory and kept in page order.  No ``images``
    folder is created (and later deleted) next to the source file, so an
    existing folder of that name is never touched, and documents with more
    than ten pages keep their page order.
    """

    # Suffixes the converter can handle (matched exactly, as before).
    _SUPPORTED_SUFFIXES = ('.docx', '.doc', '.pdf')
    # Resolution used when rasterizing PDF pages.
    _RENDER_DPI = 200

    def __init__(self):
        """Build the window: password box, caption, file list and counter."""
        super().__init__()

        self.setWindowTitle('ScanWord 2.2')

        # Text box holding the owner password, placed at a fixed position.
        self.textbox = QLineEdit(self)
        self.textbox.move(301, 5)
        self.textbox.resize(190, 20)
        self.textbox.setText("enlib.ru")

        # Allow this window to receive drop events.
        self.setAcceptDrops(True)

        self.list_files = QListWidget()
        self.label_total_files = QLabel()

        main_layout = QVBoxLayout()
        main_layout.addWidget(QLabel('Перетащите файлы в окно ниже                           Пароль:'))
        main_layout.addWidget(self.list_files)
        main_layout.addWidget(self.label_total_files)

        central_widget = QWidget()
        central_widget.setLayout(main_layout)

        self.setCentralWidget(central_widget)

        self._update_states()

    def _update_states(self):
        """Refresh the counter label with the current number of list items."""
        self.label_total_files.setText('Files: {}'.format(self.list_files.count()))

    def dragEnterEvent(self, event):
        """Accept the drag only when it carries URLs."""
        mime = event.mimeData()

        if mime.hasUrls():
            event.acceptProposedAction()

    def dropEvent(self, event):
        """List every dropped file and convert the supported ones."""
        for url in event.mimeData().urls():
            file_name = url.toLocalFile()
            self.list_files.addItem(file_name)
            self._convert(file_name)

        self._update_states()

        return super().dropEvent(event)

    def _convert(self, file):
        """Create the protected, image-only ``<name>_1.pdf`` next to *file*.

        Files whose suffix is not in ``_SUPPORTED_SUFFIXES`` are skipped with
        a console message; previously they made ``fitz.open`` fail on a PDF
        that was never created.
        """
        source = pathlib.Path(file)
        suf = source.suffix
        if suf not in self._SUPPORTED_SUFFIXES:
            print(f"Skipping unsupported file: {file}")
            return

        file_pdf = str(source.with_name(source.stem + "_1" + ".pdf"))

        self._to_pdf(file, suf, file_pdf)
        pages = self._render_pages(file_pdf)
        self._images_to_pdf(pages, file_pdf)
        self._protect(file_pdf, self.textbox.text())

    @staticmethod
    def _to_pdf(file, suf, file_pdf):
        """Write *file* to *file_pdf* as PDF: Word export, or a copy for PDFs."""
        if suf == '.docx':
            docx2pdf.convert(file, file_pdf)
        elif suf == '.doc':
            wdFormatPDF = 17  # FileFormat value that makes Word write a PDF

            in_file = os.path.abspath(file)
            out_file = os.path.abspath(file_pdf)

            word = comtypes.client.CreateObject('Word.Application')
            try:
                doc = word.Documents.Open(in_file)
                try:
                    doc.SaveAs(out_file, FileFormat=wdFormatPDF)
                finally:
                    doc.Close()
            finally:
                # Quit Word even on failure so no instance is left running.
                word.Quit()
        else:  # '.pdf'
            shutil.copy(file, file_pdf)

    def _render_pages(self, file_pdf):
        """Return one RGB ``PIL.Image`` per page of *file_pdf*, in page order."""
        pages = []
        # The document is closed on exit, before file_pdf is overwritten.
        with fitz.open(file_pdf) as doc:
            for page in doc:
                pix = page.get_pixmap(dpi=self._RENDER_DPI)
                # get_pixmap() is called without alpha, so pix.samples is
                # assumed to hold packed RGB bytes that PIL can read directly
                # (verify against the PyMuPDF Pixmap docs).
                pages.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
        return pages

    @staticmethod
    def _images_to_pdf(pages, file_pdf):
        """Save *pages* as a multi-page, image-only PDF at *file_pdf*.

        :raises ValueError: if *pages* is empty.
        """
        if not pages:
            raise ValueError(f"No pages were rendered from {file_pdf}")
        first, *rest = pages
        # NOTE(review): pages are rendered at 200 dpi but saved with
        # resolution=100.0, which presumably makes the PDF pages twice the
        # original physical size. Kept as in the original; confirm intent.
        first.save(file_pdf, "PDF", resolution=100.0, save_all=True, append_images=rest)

    @classmethod
    def _protect(cls, file_pdf, owner_pwd):
        """Rewrite *file_pdf* in place, encrypted with *owner_pwd* as owner password."""
        unmeta = PdfFileReader(file_pdf)

        writer = PdfFileWriter()
        writer.appendPagesFromReader(unmeta)
        # Empty user password: no password is needed to open the file.
        cls._encrypt(writer, '', owner_pwd)

        with open(file_pdf, 'wb') as fp:
            writer.write(fp)

    @staticmethod
    def _encrypt(writer_obj: PdfFileWriter, user_pwd, owner_pwd=None, use_128bit=True):
        """
        Set up PDF Standard-handler encryption on a writer with fixed permissions.

        The permission flags are hard-coded to ``-44`` (copying and printing
        allowed, nothing else).  The result is stored on ``writer_obj``
        through its private ``_ID``, ``_encrypt`` and ``_encrypt_key``
        attributes.

        :param PdfFileWriter writer_obj: The writer to configure; modified in place.
        :param str user_pwd: The "user password", which allows for opening
            and reading the PDF file with the restrictions provided.
        :param str owner_pwd: The "owner password", which allows for
            opening the PDF files without any restrictions.  By default,
            the owner password is the same as the user password.
        :param bool use_128bit: flag as to whether to use 128bit
            encryption.  When false, 40bit encryption will be used.  By default,
            this flag is on.
        """
        import random
        import time

        if owner_pwd is None:
            owner_pwd = user_pwd

        # V, revision and key length (in bytes) for the chosen key size.
        if use_128bit:
            V, rev, keylen = 2, 3, 128 // 8
        else:
            V, rev, keylen = 1, 2, 40 // 8

        # permit copy and printing only:
        P = -44
        O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))

        # Two 16-byte file identifiers derived from the current time and a
        # random number; the first one also feeds the key computation below.
        ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest())
        ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest())
        writer_obj._ID = ArrayObject((ID_1, ID_2))

        if rev == 2:
            U, key = _alg34(user_pwd, O, P, ID_1)
        else:
            # rev can only be 2 or 3 here (assigned just above).
            U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False)

        encrypt_dict = DictionaryObject()
        encrypt_dict[NameObject("/Filter")] = NameObject("/Standard")
        encrypt_dict[NameObject("/V")] = NumberObject(V)
        if V == 2:
            encrypt_dict[NameObject("/Length")] = NumberObject(keylen * 8)
        encrypt_dict[NameObject("/R")] = NumberObject(rev)
        encrypt_dict[NameObject("/O")] = ByteStringObject(O)
        encrypt_dict[NameObject("/U")] = ByteStringObject(U)
        encrypt_dict[NameObject("/P")] = NumberObject(P)
        writer_obj._encrypt = writer_obj._addObject(encrypt_dict)
        writer_obj._encrypt_key = key


if __name__ == '__main__':
    # Create the application object before any widget.
    application = QApplication([])

    window = MainWindow()
    # Thin black top border and a light background for the main window.
    window.setStyleSheet("QMainWindow {border-top: 1px solid black; background-color:  #f2f2eb;}")
    window.resize(500, 350)
    window.show()

    # Hand control over to the Qt event loop.
    application.exec()

Для создания конечной программы .exe использовал эту статью.