I want to automate image extraction from a PDF with Python. When the code below is executed, it pulls images from the whole PDF in a seemingly random order; it does not follow the page-wise sequence of the images in the given PDF.
Please find attached the images that the code below extracted from the PDF.
from PIL import Image
import fitz
import os
def start():
    """Extract every image referenced by the PDF's pages into PNG files.

    Pages are visited in document order; for each page the images are
    taken in the order returned by ``doc.getPageImageList``. Each image is
    written to the "Extract Images" folder as ``p<page index>-<xref>.png``,
    where the page index is zero-based and ``xref`` is the first element of
    the image entry.

    NOTE(review): the file names sort by xref, not by position on the
    page, so a directory listing may not reflect the on-page order --
    confirm whether a per-page running index is wanted instead.
    """
    DIR = "Extract Images"
    doc = fitz.open("cs2103g0052_019_549291_ca_cs_sb_sb_fy22q2wk7_oa_showcase-premium-fr_XXXxXXX_jsos.pdf")
    try:
        # exist_ok tolerates a pre-existing folder without hiding real
        # failures (e.g. permission errors) the way a bare except would.
        os.makedirs(DIR, exist_ok=True)
        for i in range(len(doc)):
            for img in doc.getPageImageList(i):
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)
                if pix.n >= 5:  # CMYK: convert to RGB first
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                # below 5 components the pixmap is GRAY or RGB and can be
                # written as PNG directly
                pix.writePNG(os.path.join(DIR, "p%s-%s.png" % (i, xref)))
                pix = None  # drop the pixmap before loading the next one
    finally:
        # Always release the document, even if extraction fails midway.
        doc.close()
# Run the extraction immediately when this script is executed.
start()
Expected output: I want to extract the images in the same sequence in which they appear in the PDF.
For example, it should first pick the images on page 1, then those on page 2, and so on, page by page.
def start():
    """Extract the images of the selected PDF page by page.

    The PDF location is read from ``path.get()`` (``path`` is defined
    outside this function -- presumably a GUI entry variable; verify
    against the caller). Pages are processed in document order and the
    images of each page in the order returned by ``page.getImageList()``.
    Every image is saved in the "Extract Images" folder as
    ``image<page number>_<image number>.<ext>``, with both numbers
    starting at 1 and ``<ext>`` taken from the extracted image data.
    """
    # Local import: ``io`` is needed for BytesIO and no module-level
    # import of it is visible in this part of the file.
    import io

    print("Start working....")
    pdf_path = path.get()  # renamed from ``input`` to stop shadowing the builtin
    pdf_file = fitz.open(pdf_path)
    DIR = "Extract Images"
    try:
        # exist_ok tolerates a pre-existing folder without hiding real
        # failures (e.g. permission errors) the way a bare except would.
        os.makedirs(DIR, exist_ok=True)
        for page_index in range(len(pdf_file)):
            # get the page itself
            page = pdf_file[page_index]
            # query the image list once and reuse it below
            image_list = page.getImageList()
            # report the number of images found on this page
            if image_list:
                print(f"[+] Found {len(image_list)} images in page {page_index}")
            else:
                print("[!] No images found on the given pdf page", page_index)
            for image_index, img in enumerate(image_list, start=1):
                # get the XREF of the image
                xref = img[0]
                # extract the image bytes and the matching file extension
                base_image = pdf_file.extractImage(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                # load it to PIL
                image = Image.open(io.BytesIO(image_bytes))
                # save it to local disk, named by page number and image number
                image.save(os.path.join(DIR, f"image{page_index + 1}_{image_index}.{image_ext}"))
    finally:
        # Always release the document, even if extraction fails midway.
        pdf_file.close()