Convert doc, docx, pdf, image, xlsx
1. Convert doc, docx to pdf
import os
import comtypes.client
# Convert a Word document (.doc/.docx) to PDF by automating Microsoft Word
# through COM.
wdFormatPDF = 17  # FileFormat value passed to SaveAs to request PDF output
in_file = os.path.abspath('file.doc')  # source file
out_file = os.path.abspath('file.pdf')  # new file
word = comtypes.client.CreateObject('Word.Application')
try:
    doc = word.Documents.Open(in_file)
    try:
        doc.SaveAs(out_file, FileFormat=wdFormatPDF)
    finally:
        # Close the document even when saving failed.
        doc.Close()
finally:
    # Always shut Word down; otherwise a failed Open/SaveAs would leave the
    # Word instance created above running.
    word.Quit()
2. Convert doc to docx
from win32com import client as wc
import os
# Convert a legacy .doc file to .docx by automating Microsoft Word through COM.
wdFormatDocumentDefault = 16  # save-format code passed to SaveAs for the new file
file = 'C:\\Users\\Anna\\PycharmProjects\\pythonProject\\test.doc'  # source file
w = wc.Dispatch('Word.Application')
try:
    doc = w.Documents.Open(file)
    try:
        # New file = same path with "x" appended to the extension (.doc -> .docx)
        doc.SaveAs(file + "x", wdFormatDocumentDefault)
    finally:
        # Close the document even when saving failed.
        doc.Close()
finally:
    # Always quit Word so a failure above does not leave the instance running.
    w.Quit()
3. Convert docx to pdf
from docx2pdf import convert
# Source Word document and the PDF file to produce from it.
docx_file, pdf_file = 'input.docx', 'output.pdf'

# Hand both paths to docx2pdf's convert().
convert(docx_file, pdf_file)
4. Convert pdf to docx
from pdf2docx import parse
# Source PDF and the Word document to produce from it.
pdf_file, docx_file = 'sample.pdf', 'sample.docx'

# Hand both paths to pdf2docx's parse().
parse(pdf_file, docx_file)
5. Convert pdf to image
import fitz
import os
# Render every page of a PDF to a PNG file inside the "images" folder.
file_path = "example.pdf"  # source file
# The context manager closes the document once all pages are rendered.
with fitz.open(file_path) as doc:
    # Create the output folder once, instead of re-checking it for every page.
    os.makedirs("images", exist_ok=True)
    for i, page in enumerate(doc):
        pix = page.get_pixmap(dpi=200)  # render the page to an image
        pix.save(f"images/page_{i}.png")  # store it in the folder
6. Convert image to pdf
from PIL import Image
from glob import glob
import os
# Combine all PNG images from the "images" folder into one multi-page PDF.


def _page_order(path):
    """Return a sort key that orders "<prefix>_<N>.png" files by numeric N.

    Files whose name does not end in "_<number>" are placed after the
    numbered ones and ordered by path.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    number = stem.rpartition("_")[2]
    if number.isdecimal():
        return (0, int(number), path)
    return (1, 0, path)


files = glob("images/*.png")  # collect the images from the images folder
print(f"{files=}")
# Plain string sorting would place page_10.png before page_2.png and scramble
# the page order, so sort on the numeric page index instead.
files = sorted(files, key=_page_order)
iml = []
for img in files:
    # Close each image file as soon as its RGB copy has been made.
    with Image.open(img) as imgs:
        # Convert to RGB to avoid the "cannot save mode RGBA" error
        iml.append(imgs.convert('RGB'))
pdf = "output.pdf"
print(iml)
if not iml:
    # Previously this case failed with an IndexError on iml[0].
    raise SystemExit("No PNG images found in the images folder")
# The first image starts the PDF; the remaining ones are appended as pages.
image, *rest = iml
image.save(pdf, "PDF", resolution=100.0, save_all=True, append_images=rest)
os.system(pdf)
7. Convert xlsx to pdf
from win32com import client
import os
# Export the active sheet of an Excel workbook to PDF by automating Excel
# through COM.
xlTypePDF = 0  # export type passed to ExportAsFixedFormat
file_location = 'C:\\Users\\Anna\\PycharmProjects\\pythonProject\\test.xlsx'
app = client.DispatchEx("Excel.Application")
try:
    app.Interactive = False
    app.Visible = False
    workbook = app.Workbooks.Open(file_location)
    try:
        # Same path as the workbook, without the extension.
        output = os.path.splitext(file_location)[0]
        workbook.ActiveSheet.ExportAsFixedFormat(xlTypePDF, output)
    finally:
        # Nothing was edited, so close without saving; this also avoids a
        # save prompt that nobody could answer on the hidden instance.
        workbook.Close(False)
finally:
    # The Excel instance is invisible; without Quit() it would keep running
    # in the background after the script ends.
    app.Quit()
8. Ставим защиту от редактирования pdf - только чтение - read only
from hashlib import md5
from PyPDF4 import PdfFileReader, PdfFileWriter
from PyPDF4.generic import NameObject, DictionaryObject, ArrayObject, \
NumberObject, ByteStringObject
from PyPDF4.pdf import _alg33, _alg34, _alg35
from PyPDF4.utils import b_
def encrypt(writer_obj: PdfFileWriter, user_pwd, owner_pwd=None, use_128bit=True):
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    The document ID, the encryption dictionary and the encryption key are
    stored on ``writer_obj`` through its private attributes ``_ID``,
    ``_encrypt`` and ``_encrypt_key``. Nothing is returned.

    :param PdfFileWriter writer_obj: The writer to set up for encryption.
    :param str user_pwd: The "user password", which allows for opening
        and reading the PDF file with the restrictions provided.
    :param str owner_pwd: The "owner password", which allows for
        opening the PDF files without any restrictions.  By default,
        the owner password is the same as the user password.
    :param bool use_128bit: flag as to whether to use 128bit
        encryption.  When false, 40bit encryption will be used.  By default,
        this flag is on.
    """
    import random
    import time

    if owner_pwd is None:
        owner_pwd = user_pwd
    if use_128bit:
        V = 2
        rev = 3
        keylen = 128 // 8
    else:
        V = 1
        rev = 2
        keylen = 40 // 8
    # permit copy and printing only:
    P = -44
    O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
    # Two document identifiers derived from the current time and a random number.
    ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest())
    ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest())
    writer_obj._ID = ArrayObject((ID_1, ID_2))
    # rev can only be 2 or 3 here (set above from use_128bit).
    if rev == 2:
        U, key = _alg34(user_pwd, O, P, ID_1)
    else:
        U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False)
    # Build the encryption dictionary (named so it does not shadow this function).
    encrypt_dict = DictionaryObject()
    encrypt_dict[NameObject("/Filter")] = NameObject("/Standard")
    encrypt_dict[NameObject("/V")] = NumberObject(V)
    if V == 2:
        encrypt_dict[NameObject("/Length")] = NumberObject(keylen * 8)
    encrypt_dict[NameObject("/R")] = NumberObject(rev)
    encrypt_dict[NameObject("/O")] = ByteStringObject(O)
    encrypt_dict[NameObject("/U")] = ByteStringObject(U)
    encrypt_dict[NameObject("/P")] = NumberObject(P)
    writer_obj._encrypt = writer_obj._addObject(encrypt_dict)
    writer_obj._encrypt_key = key
# Copy every page of Input.pdf into a fresh writer.
reader = PdfFileReader('Input.pdf')
writer = PdfFileWriter()
writer.appendPagesFromReader(reader)

# Empty user password, '123' as the owner password.
encrypt(writer, '', '123')

# Write the protected copy to Output.pdf.
with open('Output.pdf', 'wb') as out_stream:
    writer.write(out_stream)
Исходный код подсмотрел на этом сайте.
9. Пишем программу для получения отсканированного PDF из Word или PDF
a) Создадим форму. В нее пользователь будет перетаскивать файлы, которые нужно конвертировать.
from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QWidget, QVBoxLayout, QListWidget
class MainWindow(QMainWindow):
    """Main window that lists the files dragged and dropped onto it."""

    def __init__(self):
        super().__init__()
        self.setWindowTitle('Drag & Drop')

        # Allow drops on this window
        self.setAcceptDrops(True)

        # List of dropped files and a label showing how many there are
        self.list_files = QListWidget()
        self.label_total_files = QLabel()

        # Stack the caption, the file list and the counter vertically
        layout = QVBoxLayout()
        for widget in (
            QLabel('Drag and drop the file:'),
            self.list_files,
            self.label_total_files,
        ):
            layout.addWidget(widget)

        container = QWidget()
        container.setLayout(layout)
        self.setCentralWidget(container)

        self._update_states()

    def _update_states(self):
        """Refresh the counter label with the current number of list entries."""
        total = self.list_files.count()
        self.label_total_files.setText(f'Files: {total}')

    def dragEnterEvent(self, event):
        """Accept the drag only when the dragged data carries URLs."""
        if event.mimeData().hasUrls():
            event.acceptProposedAction()

    def dropEvent(self, event):
        """Add the local path of every dropped URL to the list and refresh the counter."""
        dropped_urls = event.mimeData().urls()
        for dropped in dropped_urls:
            self.list_files.addItem(dropped.toLocalFile())
        self._update_states()
        return super().dropEvent(event)
if __name__ == '__main__':
    # Create the Qt application and the main window, size it to 500x350,
    # show it and run the event loop.
    application = QApplication([])
    window = MainWindow()
    window.resize(500, 350)
    window.show()
    application.exec()
Исходный код подсмотрел на этом сайте.
b) Добавим в эту форму код для конвертирования с сохранением результатов в папку исходника.
from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QWidget, QVBoxLayout, QListWidget, QLineEdit
import fitz
from PIL import Image
from glob import glob
import pathlib
import shutil
import docx2pdf
import os
import comtypes.client
# Protection
from hashlib import md5
from PyPDF4 import PdfFileReader, PdfFileWriter
from PyPDF4.generic import NameObject, DictionaryObject, ArrayObject, \
NumberObject, ByteStringObject
from PyPDF4.pdf import _alg33, _alg34, _alg35
from PyPDF4.utils import b_
class MainWindow(QMainWindow):
    """Drag-and-drop window that converts dropped files to protected PDFs.

    Every dropped .doc, .docx or .pdf file is converted to PDF, each page is
    rendered to an image, the images are reassembled into an image-only PDF
    and that PDF is encrypted with the owner password from the text box.
    The result is saved next to the source file as "<name>_1.pdf".
    """

    def __init__(self):
        super().__init__()
        self.setWindowTitle('ScanWord 2.2')
        # TextBox: its text is used as the owner password of the result
        self.textbox = QLineEdit(self)
        self.textbox.move(301, 5)
        self.textbox.resize(190, 20)
        self.textbox.setText("enlib.ru")
        # Allow drops on this window
        self.setAcceptDrops(True)
        self.list_files = QListWidget()
        self.label_total_files = QLabel()
        main_layout = QVBoxLayout()
        main_layout.addWidget(QLabel('Перетащите файлы в окно ниже Пароль:'))
        main_layout.addWidget(self.list_files)
        main_layout.addWidget(self.label_total_files)
        central_widget = QWidget()
        central_widget.setLayout(main_layout)
        self.setCentralWidget(central_widget)
        self._update_states()

    def _update_states(self):
        """Refresh the counter label with the current number of list entries."""
        self.label_total_files.setText('Files: {}'.format(self.list_files.count()))

    def dragEnterEvent(self, event):
        """Accept the drag only when the dragged data carries URLs."""
        mime = event.mimeData()
        if mime.hasUrls():
            event.acceptProposedAction()

    def dropEvent(self, event):
        """List every dropped file and convert it, then refresh the counter."""
        for url in event.mimeData().urls():
            file_name = url.toLocalFile()
            self.list_files.addItem(file_name)
            self._convert_file(file_name)
        self._update_states()
        return super().dropEvent(event)

    def _convert_file(self, file):
        """Convert one dropped file into a protected, image-only PDF.

        The output is written next to the source as "<name>_1.pdf". Files
        whose extension is not .docx, .doc or .pdf (compared
        case-insensitively) are skipped instead of failing later when the
        missing PDF is opened.
        """
        import tempfile

        source = pathlib.Path(file)
        suf = source.suffix.lower()
        if suf not in ('.docx', '.doc', '.pdf'):
            return
        file_pdf = str(source.with_name(source.stem + "_1" + ".pdf"))
        self._to_pdf(file, suf, file_pdf)
        # Render into a private temporary folder. The previous approach used
        # an "images" folder next to the source and removed it afterwards,
        # which also deleted a folder of that name the user already had.
        with tempfile.TemporaryDirectory() as directory:
            pages = self._render_pages(file_pdf, directory)
            self._images_to_pdf(pages, file_pdf)
        self._protect(file_pdf, self.textbox.text())

    @staticmethod
    def _to_pdf(file, suf, file_pdf):
        """Produce file_pdf from a .docx, .doc or .pdf source file."""
        if suf == '.docx':
            docx2pdf.convert(file, file_pdf)
        elif suf == '.doc':
            wdFormatPDF = 17  # FileFormat value passed to SaveAs for PDF output
            in_file = os.path.abspath(file)
            out_file = os.path.abspath(file_pdf)
            word = comtypes.client.CreateObject('Word.Application')
            try:
                doc = word.Documents.Open(in_file)
                try:
                    doc.SaveAs(out_file, FileFormat=wdFormatPDF)
                finally:
                    doc.Close()
            finally:
                # Quit Word even on failure so the instance is not left running.
                word.Quit()
        else:
            # Already a PDF: work on a copy so the source stays untouched.
            shutil.copy(file, file_pdf)

    @staticmethod
    def _render_pages(file_pdf, directory):
        """Render each page of file_pdf to a PNG in directory.

        Returns the PNG paths in page order. Keeping this list avoids
        globbing and string-sorting the files, which put page 10 before
        page 2.
        """
        pages = []
        # The context manager closes the document before file_pdf is rewritten.
        with fitz.open(file_pdf) as doc:
            for i, page in enumerate(doc):
                pix = page.get_pixmap(dpi=200)  # render page to an image
                target = os.path.join(directory, f"page_{i}.png")
                pix.save(target)
                pages.append(target)
        return pages

    @staticmethod
    def _images_to_pdf(pages, file_pdf):
        """Overwrite file_pdf with a PDF built from the page images, in order."""
        iml = []
        for img in pages:
            # Close each image file once its RGB copy is made, so the
            # temporary folder can be removed afterwards.
            with Image.open(img) as imgs:
                # Convert to RGB to prevent errors: cannot save mode RGBA
                iml.append(imgs.convert('RGB'))
        if not iml:
            raise ValueError("no pages were rendered from " + file_pdf)
        first, *rest = iml
        # NOTE(review): pages are rendered at 200 dpi but saved with
        # resolution=100.0, which presumably doubles the page dimensions in
        # the resulting PDF - confirm whether that is intended.
        first.save(file_pdf, "PDF", resolution=100.0, save_all=True, append_images=rest)

    def _protect(self, file_pdf, owner_pwd):
        """Rewrite file_pdf encrypted with an empty user password and owner_pwd."""
        unmeta = PdfFileReader(file_pdf)
        writer = PdfFileWriter()
        writer.appendPagesFromReader(unmeta)
        self._encrypt(writer, '', owner_pwd)
        with open(file_pdf, 'wb') as fp:
            writer.write(fp)

    @staticmethod
    def _encrypt(writer_obj: PdfFileWriter, user_pwd, owner_pwd=None, use_128bit=True):
        """
        Encrypt this PDF file with the PDF Standard encryption handler.

        The document ID, the encryption dictionary and the encryption key
        are stored on ``writer_obj`` through its private attributes ``_ID``,
        ``_encrypt`` and ``_encrypt_key``. Nothing is returned.

        :param PdfFileWriter writer_obj: The writer to set up for encryption.
        :param str user_pwd: The "user password", which allows for opening
            and reading the PDF file with the restrictions provided.
        :param str owner_pwd: The "owner password", which allows for
            opening the PDF files without any restrictions.  By default,
            the owner password is the same as the user password.
        :param bool use_128bit: flag as to whether to use 128bit
            encryption.  When false, 40bit encryption will be used.  By default,
            this flag is on.
        """
        import random
        import time

        if owner_pwd is None:
            owner_pwd = user_pwd
        if use_128bit:
            V = 2
            rev = 3
            keylen = 128 // 8
        else:
            V = 1
            rev = 2
            keylen = 40 // 8
        # permit copy and printing only:
        P = -44
        O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
        ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest())
        ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest())
        writer_obj._ID = ArrayObject((ID_1, ID_2))
        # rev can only be 2 or 3 here (set above from use_128bit).
        if rev == 2:
            U, key = _alg34(user_pwd, O, P, ID_1)
        else:
            U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False)
        encrypt_dict = DictionaryObject()
        encrypt_dict[NameObject("/Filter")] = NameObject("/Standard")
        encrypt_dict[NameObject("/V")] = NumberObject(V)
        if V == 2:
            encrypt_dict[NameObject("/Length")] = NumberObject(keylen * 8)
        encrypt_dict[NameObject("/R")] = NumberObject(rev)
        encrypt_dict[NameObject("/O")] = ByteStringObject(O)
        encrypt_dict[NameObject("/U")] = ByteStringObject(U)
        encrypt_dict[NameObject("/P")] = NumberObject(P)
        writer_obj._encrypt = writer_obj._addObject(encrypt_dict)
        writer_obj._encrypt_key = key
if __name__ == '__main__':
    # Create the Qt application and the main window, size and style the
    # window, show it and run the event loop.
    application = QApplication([])
    window = MainWindow()
    window.resize(500, 350)
    window.setStyleSheet("QMainWindow {border-top: 1px solid black; background-color: #f2f2eb;}")
    window.show()
    application.exec()
Для создания конечной программы .exe использовал эту статью.