Convert doc, docx, pdf, image
×

Convert doc, docx, pdf, image

1. Convert doc, docx to pdf (нужны Windows и установленный Microsoft Word; вероятно, так же можно сохранить в pdf и другие форматы, которые открывает Word)

import os
import comtypes.client

# Save-format code passed to Word's SaveAs to request PDF output.
wdFormatPDF = 17

in_file = os.path.abspath('file.doc')  # source document
out_file = os.path.abspath('file.pdf')  # resulting PDF

# Drive Word through COM. The nested try/finally blocks make sure the
# document is closed and the Word process is shut down even when opening or
# saving fails; otherwise a hidden Word instance would stay running.
word = comtypes.client.CreateObject('Word.Application')
try:
    doc = word.Documents.Open(in_file)
    try:
        doc.SaveAs(out_file, FileFormat=wdFormatPDF)
    finally:
        doc.Close()
finally:
    word.Quit()

Исходный код подсмотрел на этом сайте.

2. Convert docx to pdf

from docx2pdf import convert

# Word source and the PDF that docx2pdf should produce from it.
source_docx = 'input.docx'
target_pdf = 'output.pdf'

convert(source_docx, target_pdf)

3. Convert pdf to docx

from pdf2docx import parse

# PDF source and the Word document that pdf2docx should produce from it.
source_pdf = 'sample.pdf'
target_docx = 'sample.docx'

parse(source_pdf, target_docx)

Исходный код подсмотрел на этом сайте.

4. Convert pdf to image

import fitz
import os

file_path = "example.pdf"  # source PDF
output_dir = "images"  # folder that receives one PNG per page

doc = fitz.open(file_path)  # open the document
try:
    # Create the folder once up front instead of re-checking it for every page.
    os.makedirs(output_dir, exist_ok=True)
    for i, page in enumerate(doc):
        pix = page.get_pixmap(dpi=200)  # render the page to an image
        pix.save(f"{output_dir}/page_{i}.png")  # store it in the folder
finally:
    # Release the document even if rendering or saving a page fails.
    doc.close()

5. Convert image to pdf

from PIL import Image
from glob import glob
import os

import re


def _page_order(path):
    """Return a sort key that compares the digit runs in *path* as numbers."""
    # re.split with a capturing group alternates text, digits, text, ...,
    # so every odd index holds a digit run that can be turned into an int.
    parts = re.split(r"(\d+)", path)
    return [int(part) if index % 2 else part for index, part in enumerate(parts)]


files = glob("images/*.png")  # collect the pictures from the images folder
print(f"{files=}")
# Plain string sorting would put page_10.png ahead of page_2.png and scramble
# the page order of longer documents, so sort the numbers numerically.
files = sorted(files, key=_page_order)
if not files:
    # Without this guard the script would fail later with a bare IndexError.
    raise SystemExit("No PNG files found in the images folder")

iml = []
for img in files:
    # The context manager closes the source file once the RGB copy exists.
    with Image.open(img) as imgs:
        # Convert to RGB to avoid the "cannot save mode RGBA" error.
        iml.append(imgs.convert('RGB'))
pdf = "output.pdf"
print(iml)
# The first image starts the PDF; the remaining ones are appended as pages.
image, *remaining_pages = iml
image.save(pdf, "PDF", resolution=100.0, save_all=True, append_images=remaining_pages)
# NOTE(review): this runs the file name as a shell command, which presumably
# relies on the Windows file association to open the PDF - confirm.
os.system(pdf)

6. Ставим защиту от редактирования pdf - только чтение - read only

from hashlib import md5

from PyPDF4 import PdfFileReader, PdfFileWriter
from PyPDF4.generic import NameObject, DictionaryObject, ArrayObject, \
    NumberObject, ByteStringObject
from PyPDF4.pdf import _alg33, _alg34, _alg35
from PyPDF4.utils import b_


def encrypt(writer_obj: PdfFileWriter, user_pwd, owner_pwd=None, use_128bit=True,
            permissions=-44):
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    The function fills in ``writer_obj._ID``, ``writer_obj._encrypt`` and
    ``writer_obj._encrypt_key``; it returns nothing.

    :param PdfFileWriter writer_obj: The writer whose output should be
        encrypted.
    :param str user_pwd: The "user password", which allows for opening
        and reading the PDF file with the restrictions provided.
    :param str owner_pwd: The "owner password", which allows for
        opening the PDF files without any restrictions.  By default,
        the owner password is the same as the user password.
    :param bool use_128bit: flag as to whether to use 128bit
        encryption.  When false, 40bit encryption will be used.  By default,
        this flag is on.
    :param int permissions: Value stored in the ``/P`` entry of the
        encryption dictionary.  The default of -44 permits copy and
        printing only.
    """
    import random
    import time

    if owner_pwd is None:
        owner_pwd = user_pwd
    if use_128bit:
        V = 2
        rev = 3
        keylen = 128 // 8
    else:
        V = 1
        rev = 2
        keylen = 40 // 8
    P = permissions
    O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
    # The two document identifiers are derived from the current time and a
    # random number; ID_1 also feeds the user-key computation below.
    ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest())
    ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest())
    writer_obj._ID = ArrayObject((ID_1, ID_2))
    if rev == 2:
        U, key = _alg34(user_pwd, O, P, ID_1)
    else:
        assert rev == 3
        U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False)
    # Named encrypt_dict so the local does not shadow this function's name.
    encrypt_dict = DictionaryObject()
    encrypt_dict[NameObject("/Filter")] = NameObject("/Standard")
    encrypt_dict[NameObject("/V")] = NumberObject(V)
    if V == 2:
        encrypt_dict[NameObject("/Length")] = NumberObject(keylen * 8)
    encrypt_dict[NameObject("/R")] = NumberObject(rev)
    encrypt_dict[NameObject("/O")] = ByteStringObject(O)
    encrypt_dict[NameObject("/U")] = ByteStringObject(U)
    encrypt_dict[NameObject("/P")] = NumberObject(P)
    writer_obj._encrypt = writer_obj._addObject(encrypt_dict)
    writer_obj._encrypt_key = key


# Copy every page of the source PDF into a fresh writer, attach the
# encryption settings and store the result in a new file.
source_reader = PdfFileReader('Input.pdf')

protected_writer = PdfFileWriter()
protected_writer.appendPagesFromReader(source_reader)
# Empty user password, '123' as the owner password.
encrypt(protected_writer, '', '123')

with open('Output', 'wb') as out_stream:
    protected_writer.write(out_stream)

Исходный код подсмотрел на этом сайте.

 

7. Пишем программу для получения отсканированного pdf из word

a) Создадим форму. В нее пользователь будет перетаскивать файлы, которые нужно конвертировать.

from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QWidget, QVBoxLayout, QListWidget


class MainWindow(QMainWindow):
    """Main window that lists every file dropped onto it and shows the count."""

    def __init__(self):
        super().__init__()

        self.setWindowTitle('Drag & Drop')
        self.setAcceptDrops(True)  # opt in to receiving drops

        self.list_files = QListWidget()
        self.label_total_files = QLabel()

        # Stack the caption, the file list and the counter vertically.
        layout = QVBoxLayout()
        caption = QLabel('Drag and drop the file:')
        for widget in (caption, self.list_files, self.label_total_files):
            layout.addWidget(widget)

        container = QWidget()
        container.setLayout(layout)
        self.setCentralWidget(container)

        self._update_states()

    def _update_states(self):
        """Show the current number of list entries in the counter label."""
        total = self.list_files.count()
        self.label_total_files.setText('Files: {}'.format(total))

    def dragEnterEvent(self, event):
        """Allow the drop only when the dragged data carries URLs."""
        if not event.mimeData().hasUrls():
            return
        event.acceptProposedAction()

    def dropEvent(self, event):
        """Add the local path of each dropped URL to the list."""
        dropped_urls = event.mimeData().urls()
        for dropped in dropped_urls:
            self.list_files.addItem(dropped.toLocalFile())

        self._update_states()

        return super().dropEvent(event)


if __name__ == '__main__':
    # Create the Qt application, show the drop window and run the event loop.
    application = QApplication([])

    window = MainWindow()
    window.resize(500, 350)
    window.show()

    application.exec()

Исходный код подсмотрел на этом сайте.

b) Добавим в эту форму код для конвертирования с сохранением результатов в папку исходника.


from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QWidget, QVBoxLayout, QListWidget

import comtypes.client

import fitz

from PIL import Image
from glob import glob
import os

import pathlib

class MainWindow(QMainWindow):
    """Window that turns every dropped Word or PDF file into an image-only PDF.

    Each dropped file is added to the list, exported to PDF through Word when
    it is not already a PDF, rendered page by page to PNG files in an
    ``images`` folder next to the source, and rebuilt from those PNGs.
    """

    def __init__(self):
        super().__init__()

        self.setWindowTitle('Drag & Drop')

        # Allow drops on this window.
        self.setAcceptDrops(True)

        self.list_files = QListWidget()
        self.label_total_files = QLabel()

        main_layout = QVBoxLayout()
        main_layout.addWidget(QLabel('Drag and drop the file:'))
        main_layout.addWidget(self.list_files)
        main_layout.addWidget(self.label_total_files)

        central_widget = QWidget()
        central_widget.setLayout(main_layout)

        self.setCentralWidget(central_widget)

        self._update_states()

    def _update_states(self):
        """Show the current number of list entries in the counter label."""
        self.label_total_files.setText('Files: {}'.format(self.list_files.count()))

    def dragEnterEvent(self, event):
        """Allow the drop only when the dragged data carries URLs."""
        if event.mimeData().hasUrls():
            event.acceptProposedAction()

    def dropEvent(self, event):
        """List and convert every dropped file, then refresh the counter."""
        for url in event.mimeData().urls():
            file_name = url.toLocalFile()
            if not file_name:
                # toLocalFile() gave no path, so there is nothing to convert.
                continue
            self.list_files.addItem(file_name)
            self._make_scanned_pdf(file_name)

        self._update_states()

        return super().dropEvent(event)

    def _make_scanned_pdf(self, file_name):
        """Convert one dropped file and return the path of the PDF written.

        A Word source ``name.docx`` ends up as ``name.pdf``; a PDF source
        ``name.pdf`` ends up as ``name.pdf.pdf`` so the source is kept.
        Returns ``None`` when the source produced no pages.
        """
        source = pathlib.Path(file_name)
        images_directory = source.parent / "images"

        # Compare the extension case-insensitively: "X.PDF" is a PDF too and
        # must not be handed to Word (whose output path could collide with it).
        if source.suffix.lower() == '.pdf':
            intermediate_pdf = source
            pdf = source.with_name(source.name + ".pdf")
        else:
            intermediate_pdf = source.with_suffix(".pdf")
            self._word_to_pdf(str(source), str(intermediate_pdf))
            pdf = intermediate_pdf

        files = self._render_pages(str(intermediate_pdf), images_directory, source.stem)
        print(f"{files=}")
        if not files:
            # A document without pages leaves nothing to assemble.
            return None
        self._images_to_pdf(files, str(pdf))
        return str(pdf)

    @staticmethod
    def _word_to_pdf(in_path, out_path):
        """Export the document at *in_path* to a PDF at *out_path* via Word."""
        wdFormatPDF = 17  # save-format code passed to Word's SaveAs

        word = comtypes.client.CreateObject('Word.Application')
        try:
            doc = word.Documents.Open(os.path.abspath(in_path))
            try:
                doc.SaveAs(os.path.abspath(out_path), FileFormat=wdFormatPDF)
            finally:
                doc.Close()
        finally:
            # Always shut Word down, otherwise a failed conversion would
            # leave a hidden Word process running.
            word.Quit()

    @staticmethod
    def _render_pages(pdf_path, images_directory, stem):
        """Render each page of *pdf_path* to ``<stem>_<i>.png``; return the paths in page order."""
        os.makedirs(images_directory, exist_ok=True)
        written = []
        doc = fitz.open(pdf_path)
        try:
            for i, page in enumerate(doc):
                pix = page.get_pixmap(dpi=200)  # render the page to an image
                target = str(images_directory / f"{stem}_{i}.png")
                pix.save(target)
                # Remember the files in page order. Re-discovering them with
                # glob + string sort would put page 10 before page 2 and
                # could pick up PNGs left by other documents or earlier runs.
                written.append(target)
        finally:
            # Close before the caller possibly overwrites pdf_path.
            doc.close()
        return written

    @staticmethod
    def _images_to_pdf(image_paths, pdf_path):
        """Write the images at *image_paths* as consecutive pages of *pdf_path*."""
        pages = []
        for image_path in image_paths:
            with Image.open(image_path) as img:
                # Convert to RGB to avoid the "cannot save mode RGBA" error.
                pages.append(img.convert('RGB'))
        print(pages)
        first_page, *other_pages = pages
        # NOTE(review): pages are rendered at 200 dpi but saved with
        # resolution=100.0, so the PDF page size presumably doubles - confirm
        # that this is intended.
        first_page.save(pdf_path, "PDF", resolution=100.0, save_all=True,
                        append_images=other_pages)


if __name__ == '__main__':
    # Create the Qt application, show the drop window and run the event loop.
    application = QApplication([])

    window = MainWindow()
    window.resize(500, 350)
    window.show()

    application.exec()

c) Добавим защиту от редактирования: файл открывается без пароля, а пароль владельца для снятия ограничений — "123"


from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QWidget, QVBoxLayout, QListWidget

import comtypes.client

import fitz

from PIL import Image
from glob import glob
import os

import pathlib

# Protection
from hashlib import md5

from PyPDF4 import PdfFileReader, PdfFileWriter
from PyPDF4.generic import NameObject, DictionaryObject, ArrayObject, \
    NumberObject, ByteStringObject
from PyPDF4.pdf import _alg33, _alg34, _alg35
from PyPDF4.utils import b_


class MainWindow(QMainWindow):
    """Window that turns every dropped Word or PDF file into a protected image-only PDF.

    Each dropped file is added to the list, exported to PDF through Word when
    it is not already a PDF, rendered page by page to PNG files in an
    ``images`` folder next to the source, rebuilt from those PNGs, and finally
    rewritten with PDF Standard encryption (empty user password, owner
    password ``123``).
    """

    def __init__(self):
        super().__init__()

        self.setWindowTitle('Drag & Drop')

        # Allow drops on this window.
        self.setAcceptDrops(True)

        self.list_files = QListWidget()
        self.label_total_files = QLabel()

        main_layout = QVBoxLayout()
        main_layout.addWidget(QLabel('Drag and drop the file:'))
        main_layout.addWidget(self.list_files)
        main_layout.addWidget(self.label_total_files)

        central_widget = QWidget()
        central_widget.setLayout(main_layout)

        self.setCentralWidget(central_widget)

        self._update_states()

    def _update_states(self):
        """Show the current number of list entries in the counter label."""
        self.label_total_files.setText('Files: {}'.format(self.list_files.count()))

    def dragEnterEvent(self, event):
        """Allow the drop only when the dragged data carries URLs."""
        if event.mimeData().hasUrls():
            event.acceptProposedAction()

    def dropEvent(self, event):
        """List, convert and protect every dropped file, then refresh the counter."""
        for url in event.mimeData().urls():
            file_name = url.toLocalFile()
            if not file_name:
                # toLocalFile() gave no path, so there is nothing to convert.
                continue
            self.list_files.addItem(file_name)

            pdf = self._make_scanned_pdf(file_name)
            if pdf is not None:
                self._protect(pdf)

        self._update_states()

        return super().dropEvent(event)

    def _make_scanned_pdf(self, file_name):
        """Convert one dropped file and return the path of the PDF written.

        A Word source ``name.docx`` ends up as ``name.pdf``; a PDF source
        ``name.pdf`` ends up as ``name.pdf.pdf`` so the source is kept.
        Returns ``None`` when the source produced no pages.
        """
        source = pathlib.Path(file_name)
        images_directory = source.parent / "images"

        # Compare the extension case-insensitively: "X.PDF" is a PDF too and
        # must not be handed to Word (whose output path could collide with it).
        if source.suffix.lower() == '.pdf':
            intermediate_pdf = source
            pdf = source.with_name(source.name + ".pdf")
        else:
            intermediate_pdf = source.with_suffix(".pdf")
            self._word_to_pdf(str(source), str(intermediate_pdf))
            pdf = intermediate_pdf

        files = self._render_pages(str(intermediate_pdf), images_directory, source.stem)
        print(f"{files=}")
        if not files:
            # A document without pages leaves nothing to assemble.
            return None
        self._images_to_pdf(files, str(pdf))
        return str(pdf)

    @staticmethod
    def _word_to_pdf(in_path, out_path):
        """Export the document at *in_path* to a PDF at *out_path* via Word."""
        wdFormatPDF = 17  # save-format code passed to Word's SaveAs

        word = comtypes.client.CreateObject('Word.Application')
        try:
            doc = word.Documents.Open(os.path.abspath(in_path))
            try:
                doc.SaveAs(os.path.abspath(out_path), FileFormat=wdFormatPDF)
            finally:
                doc.Close()
        finally:
            # Always shut Word down, otherwise a failed conversion would
            # leave a hidden Word process running.
            word.Quit()

    @staticmethod
    def _render_pages(pdf_path, images_directory, stem):
        """Render each page of *pdf_path* to ``<stem>_<i>.png``; return the paths in page order."""
        os.makedirs(images_directory, exist_ok=True)
        written = []
        doc = fitz.open(pdf_path)
        try:
            for i, page in enumerate(doc):
                pix = page.get_pixmap(dpi=200)  # render the page to an image
                target = str(images_directory / f"{stem}_{i}.png")
                pix.save(target)
                # Remember the files in page order. Re-discovering them with
                # glob + string sort would put page 10 before page 2 and
                # could pick up PNGs left by other documents or earlier runs.
                written.append(target)
        finally:
            # Close before the caller possibly overwrites pdf_path.
            doc.close()
        return written

    @staticmethod
    def _images_to_pdf(image_paths, pdf_path):
        """Write the images at *image_paths* as consecutive pages of *pdf_path*."""
        pages = []
        for image_path in image_paths:
            with Image.open(image_path) as img:
                # Convert to RGB to avoid the "cannot save mode RGBA" error.
                pages.append(img.convert('RGB'))
        print(pages)
        first_page, *other_pages = pages
        # NOTE(review): pages are rendered at 200 dpi but saved with
        # resolution=100.0, so the PDF page size presumably doubles - confirm
        # that this is intended.
        first_page.save(pdf_path, "PDF", resolution=100.0, save_all=True,
                        append_images=other_pages)

    def _protect(self, pdf_path):
        """Rewrite *pdf_path* in place with editing restrictions applied."""
        tmp_path = pdf_path + ".tmp"
        with open(pdf_path, 'rb') as src:
            reader = PdfFileReader(src)
            writer = PdfFileWriter()
            writer.appendPagesFromReader(reader)
            # Empty user password, '123' as the owner password.
            self._encrypt(writer, '', '123')
            # Write to a sibling temp file first so the PDF being read is
            # never truncated while its pages are still being copied.
            with open(tmp_path, 'wb') as dst:
                writer.write(dst)
        os.replace(tmp_path, pdf_path)

    @staticmethod
    def _encrypt(writer_obj: PdfFileWriter, user_pwd, owner_pwd=None, use_128bit=True):
        """
        Encrypt this PDF file with the PDF Standard encryption handler.

        The function fills in ``writer_obj._ID``, ``writer_obj._encrypt`` and
        ``writer_obj._encrypt_key``; it returns nothing.

        :param PdfFileWriter writer_obj: The writer whose output should be
            encrypted.
        :param str user_pwd: The "user password", which allows for opening
            and reading the PDF file with the restrictions provided.
        :param str owner_pwd: The "owner password", which allows for
            opening the PDF files without any restrictions.  By default,
            the owner password is the same as the user password.
        :param bool use_128bit: flag as to whether to use 128bit
            encryption.  When false, 40bit encryption will be used.  By
            default, this flag is on.
        """
        import random
        import time

        if owner_pwd is None:
            owner_pwd = user_pwd
        if use_128bit:
            V = 2
            rev = 3
            keylen = 128 // 8
        else:
            V = 1
            rev = 2
            keylen = 40 // 8
        # permit copy and printing only:
        P = -44
        O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
        # The two document identifiers are derived from the current time and
        # a random number; ID_1 also feeds the user-key computation below.
        ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest())
        ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest())
        writer_obj._ID = ArrayObject((ID_1, ID_2))
        if rev == 2:
            U, key = _alg34(user_pwd, O, P, ID_1)
        else:
            assert rev == 3
            U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False)
        encrypt_dict = DictionaryObject()
        encrypt_dict[NameObject("/Filter")] = NameObject("/Standard")
        encrypt_dict[NameObject("/V")] = NumberObject(V)
        if V == 2:
            encrypt_dict[NameObject("/Length")] = NumberObject(keylen * 8)
        encrypt_dict[NameObject("/R")] = NumberObject(rev)
        encrypt_dict[NameObject("/O")] = ByteStringObject(O)
        encrypt_dict[NameObject("/U")] = ByteStringObject(U)
        encrypt_dict[NameObject("/P")] = NumberObject(P)
        writer_obj._encrypt = writer_obj._addObject(encrypt_dict)
        writer_obj._encrypt_key = key


if __name__ == '__main__':
    # Create the Qt application, show the drop window and run the event loop.
    application = QApplication([])

    window = MainWindow()
    window.resize(500, 350)
    window.show()

    application.exec()

При работе с файловыми путями пользовался материалами с этого сайта и с этого.

Для создания конечной программы .exe использовал эту статью.

Саму программу можно скачать здесь.