當有大量 PDF 的內容需要擷取時,若以人工逐一處理會耗費許多人力,因此透過程式將 PDF 的每一頁自動轉成圖片檔案。

 

1. 首先,為了美觀先製作 ProgressBar

(請將以下程式碼存成 UITools.py,後面的轉檔程式會以 `from UITools import ProgressBar` 匯入。)
from __future__ import print_function
import sys
import re

class ProgressBar(object):
    """Draw a single-line textual progress bar on a stream.

    Typical use: create the bar with the number of steps, then after each
    finished step increment ``current`` and call the instance to redraw the
    line in place.  Call ``done()`` at the end to show the completed bar and
    move the cursor to a new line.

    Format strings use ``%``-style named fields: ``bar``, ``current``,
    ``total``, ``percent`` and ``remaining``.  ``DEFAULT`` shows the bar and
    the percentage; ``FULL`` additionally shows ``current/total`` and the
    number of steps still to go.
    """

    DEFAULT = 'Progress: %(bar)s %(percent)3d%%'
    FULL = '%(bar)s %(current)d/%(total)d (%(percent)3d%%) %(remaining)d to go'

    def __init__(self, total, width=40, fmt=DEFAULT, symbol='█',
                 output=sys.stderr):
        """Set up the bar.

        Args:
            total: number of steps that make up 100%.
            width: number of character cells between the brackets.
            fmt: ``%``-style format string (see ``DEFAULT`` / ``FULL``).
            symbol: single character used for the filled part of the bar.
            output: writable text stream the bar is printed to.
        """
        assert len(symbol) == 1, 'symbol must be a single character'

        self.total = total
        self.width = width
        self.symbol = symbol
        self.output = output
        # Give plain ``%(name)d`` fields a width equal to the number of
        # digits in ``total`` so the counters keep a constant column width
        # while the bar advances.
        self.fmt = re.sub(r'(?P<name>%\(.+?\))d',
            r'\g<name>%dd' % len(str(total)), fmt)

        self.current = 0

    def __call__(self):
        """Redraw the bar on the current line for the present ``current``."""
        if self.total:
            percent = self.current / float(self.total)
        else:
            # Nothing to do counts as finished; this also avoids the
            # ZeroDivisionError a zero ``total`` used to cause.
            percent = 1.0
        # Clamp the filled part to [0, width] so an out-of-range ``current``
        # cannot make the bar longer than its brackets allow.
        size = min(self.width, max(0, int(self.width * percent)))
        remaining = self.total - self.current
        bar = '[' + self.symbol * size + ' ' * (self.width - size) + ']'

        args = {
            'total': self.total,
            'bar': bar,
            'current': self.current,
            'percent': percent * 100,
            'remaining': remaining
        }
        # The leading carriage return rewinds to the start of the line so the
        # new bar overwrites the previous one.
        print('\r' + self.fmt % args, file=self.output, end='')

    def done(self):
        """Show the bar as complete and finish the line with a newline."""
        self.current = self.total
        self()
        print('', file=self.output)
        

def main():
    """Demo: advance a five-step bar to completion, pausing between redraws."""
    import time

    bar = ProgressBar(5, fmt=ProgressBar.FULL, symbol="█")

    steps_left = bar.total
    while steps_left:
        bar.current += 1
        bar()
        # Short pause so the animation is visible.
        time.sleep(0.1)
        steps_left -= 1
    bar.done()


if __name__ == "__main__":
    main()

 

進入正題,將PDF檔轉換為圖片檔:

(請將以下程式碼存成 pdf2images.py。)
import sys, fitz, os, datetime
import argparse
from UITools import ProgressBar

def _first_attr(obj, *names):
    """Return the first attribute of ``obj`` that exists among ``names``."""
    for name in names:
        if hasattr(obj, name):
            return getattr(obj, name)
    raise AttributeError(
        "%s has none of the attributes: %s"
        % (type(obj).__name__, ", ".join(names)))


def PDF2Images(pdfPath, imagePath, image_name="images_", zoom=1.5, rotate=0):
    """Render every page of a PDF file to a PNG image.

    Args:
        pdfPath: path of the source PDF file.
        imagePath: output folder; it is created when it does not exist.
        image_name: filename prefix; the page with 0-based index N is
            written as ``<image_name><N>.png``.
        zoom: scale factor applied to both axes when rendering
            (1.5 turns a 612x792 page into a 918x1188 image).
        rotate: rotation in degrees applied to every page.

    Prints the elapsed time in whole seconds when finished.
    """
    started = datetime.datetime.now()  # start of the timing window
    pdfDoc = fitz.open(pdfPath)
    try:
        # PyMuPDF renamed its camelCase API to snake_case; look up the new
        # name first and fall back to the old one so both generations work.
        pageCount = _first_attr(pdfDoc, "page_count", "pageCount")
        progress = ProgressBar(pageCount, fmt=ProgressBar.FULL, symbol="█")

        # The output folder and the render matrix do not depend on the page,
        # so prepare them once instead of on every iteration.
        if not os.path.isdir(imagePath):
            os.makedirs(imagePath)
        mat = _first_attr(
            fitz.Matrix(zoom, zoom), "prerotate", "preRotate")(rotate)

        for pg in range(pageCount):
            page = pdfDoc[pg]
            render = _first_attr(page, "get_pixmap", "getPixmap")
            pix = render(matrix=mat, alpha=False)

            fn = os.path.join(imagePath, image_name + '%s.png' % pg)
            _first_attr(pix, "save", "writePNG")(fn)  # save the image

            # show progress
            progress.current += 1
            progress()

        # Finish the progress line so the timing message starts on its own
        # line.  Skipped for an empty document, where there is no bar to
        # finish.
        if pageCount:
            progress.done()
    finally:
        # Release the file handle even when rendering fails midway.
        pdfDoc.close()

    elapsed = datetime.datetime.now() - started
    # total_seconds() also counts whole days, which ``.seconds`` drops.
    print("pdf2img time =", int(elapsed.total_seconds()), "s")

def main(src, dst, image_name="images_"):
    """Convert the PDF at ``src`` into PNG images stored under ``dst``.

    Args:
        src: path of the source PDF file.
        dst: folder that receives the generated images.
        image_name: filename prefix for the generated images.
    """
    PDF2Images(src, dst, image_name)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Convert every page of a PDF file into a PNG image.')
    # Input and output are mandatory: without them None would be passed on
    # to the converter and fail far from the real cause.
    parser.add_argument("-i", "--input", type=str, required=True,
                        help='an pdf filename for process source file.')
    parser.add_argument("-o",'--output', type=str, required=True,
                        help='an folder path for output images.')
    parser.add_argument("-fn",'--filename', type=str, default="images_",
                        help='an filename prefix for output images.')

    args = parser.parse_args()

    # Forward the prefix as well; it used to be parsed but never used.
    main(args.input, args.output, args.filename)
    
安裝相關套件

pip install pyMuPDF

 

執行範例:

python pdf2images.py -i "demo.pdf" -o imgs
 
 

 

 

 

arrow
arrow
    文章標籤
    python
    全站熱搜

    Lung-Yu,Tsai 發表在 痞客邦 留言(0) 人氣()