當需要擷取大量 PDF 的內容時,若以人工逐一處理會耗費許多人力,因此透過程式將 PDF 的每一頁轉成圖片檔案。
1. 首先,為了讓執行進度的顯示更美觀,先製作 ProgressBar,並將以下程式存成 UITools.py(後面的轉檔程式會以 from UITools import ProgressBar 匯入):
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
from __future__ import print_function

import re
import sys


class ProgressBar(object):
    """Text progress bar that redraws itself in place on a stream.

    Usage: create it with the total number of steps, bump ``current`` as
    work completes, and call the instance to redraw the bar. Call
    ``done()`` at the end to fill the bar and move to a new line.

    Args:
        total: Number of steps that make up the whole task.
        width: Number of character cells between the ``[`` and ``]``.
        fmt: ``%``-style template; it may reference the keys ``bar``,
            ``current``, ``total``, ``percent`` and ``remaining``.
        symbol: Single character used for the filled part of the bar.
        output: Writable stream the bar is printed to.
    """

    DEFAULT = 'Progress: %(bar)s %(percent)3d%%'
    FULL = '%(bar)s %(current)d/%(total)d (%(percent)3d%%) %(remaining)d to go'

    def __init__(self, total, width=40, fmt=DEFAULT, symbol='█',
                 output=sys.stderr):
        assert len(symbol) == 1

        self.total = total
        self.width = width
        self.symbol = symbol
        self.output = output
        # Give every width-less "%(name)d" field a fixed width equal to the
        # number of digits in `total`, so the line keeps a constant length
        # while the counters grow.
        self.fmt = re.sub(r'(?P<name>%\(.+?\))d',
                          r'\g<name>%dd' % len(str(total)), fmt)

        self.current = 0

    def __call__(self):
        """Redraw the bar for the present value of ``current``."""
        # A task with zero steps has nothing left to do: show it as complete
        # instead of dividing by zero.
        if self.total:
            percent = self.current / float(self.total)
        else:
            percent = 1.0
        size = int(self.width * percent)
        remaining = self.total - self.current
        bar = '[' + self.symbol * size + ' ' * (self.width - size) + ']'

        args = {
            'total': self.total,
            'bar': bar,
            'current': self.current,
            'percent': percent * 100,
            'remaining': remaining
        }
        # The leading carriage return moves the cursor back to column 0 so
        # each redraw overwrites the previous one.
        print('\r' + self.fmt % args, file=self.output, end='')

    def done(self):
        """Fill the bar completely and finish the line with a newline."""
        self.current = self.total
        self()
        print('', file=self.output)


def main():
    """Small demo: advance a five-step bar with a short pause per step."""
    from time import sleep

    progress = ProgressBar(5, fmt=ProgressBar.FULL, symbol="█")

    for x in range(progress.total):
        progress.current += 1
        progress()
        sleep(0.1)
    progress.done()


if __name__ == "__main__":
    main()
2. 進入正題,將 PDF 檔轉換為圖片檔(以下程式存成 pdf2images.py):
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import argparse
import datetime
import os
import sys

import fitz

from UITools import ProgressBar


def PDF2Images(pdfPath, imagePath, image_name="images_"):
    """Render every page of a PDF to a PNG file.

    Pages are written to ``imagePath`` as ``<image_name><page index>.png``,
    with the page index starting at 0. The output folder is created when it
    does not exist. A progress bar is drawn while converting and the elapsed
    time in whole seconds is printed at the end.

    Args:
        pdfPath: Path of the PDF file to convert.
        imagePath: Folder that receives the PNG files.
        image_name: Filename prefix placed before the page index.
    """
    startTime_pdf2img = datetime.datetime.now()  # start timing the run

    # NOTE(review): pageCount / preRotate / getPixmap / writePNG look like the
    # legacy camelCase PyMuPDF names; recent releases use snake_case
    # equivalents. Confirm against the installed PyMuPDF version.
    pdfDoc = fitz.open(pdfPath)
    try:
        pageCount = pdfDoc.pageCount

        # Create the output folder once, up front, not once per page.
        if not os.path.exists(imagePath):
            os.makedirs(imagePath)

        progress = ProgressBar(pageCount, fmt=ProgressBar.FULL, symbol="█")

        # Author's note: default image size is 792x612 at dpi=96; the zoom
        # factors scale the rendered size (1.33333333 --> 1188x918).
        rotate = 0
        zoom_x = 1.5
        zoom_y = 1.5
        # The transform is identical for every page, so build it only once.
        mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)

        for pg in range(pageCount):
            page = pdfDoc[pg]
            pix = page.getPixmap(matrix=mat, alpha=False)
            fn = os.path.join(imagePath, image_name + '%s.png' % pg)
            pix.writePNG(fn)  # save the page image

            # show progress
            progress.current += 1
            progress()
    finally:
        # Release the document handle even when a page fails to render.
        pdfDoc.close()

    endTime_pdf2img = datetime.datetime.now()
    elapsed = endTime_pdf2img - startTime_pdf2img
    # total_seconds() also counts whole days, which .seconds silently drops.
    print("pdf2img time =", int(elapsed.total_seconds()), "s")


def main(src, dst, image_name="images_"):
    """Convert the PDF at ``src`` into PNG files inside folder ``dst``.

    Args:
        src: Path of the source PDF file.
        dst: Output folder for the images.
        image_name: Filename prefix for the generated images.
    """
    PDF2Images(src, dst, image_name)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Convert each page of a PDF file into a PNG image.')
    parser.add_argument("-i", "--input", type=str, required=True,
                        help='an pdf filename for process source file.')
    parser.add_argument("-o", '--output', type=str, required=True,
                        help='an folder path for output images.')
    parser.add_argument("-fn", '--filename', type=str, default="images_",
                        help='an filename prefix for output images.')
    args = parser.parse_args()

    # Forward the prefix as well; it used to be parsed and then ignored.
    main(args.input, args.output, args.filename)
安裝相關套件
pip install pyMuPDF
執行範例:
python pdf2images.py -i "demo.pdf" -o imgs
文章標籤
全站熱搜
留言列表