Skip to content

Commit 6cbb0d8

Browse files
authored
Update PyMuPDF
1 parent 1c5a4e1 commit 6cbb0d8

File tree

1 file changed

+28
-0
lines changed

1 file changed

+28
-0
lines changed

PyMuPDF

+28
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,31 @@ for file_name in pdf_files:
2222
with fitz.open(os.path.join(pdf_dir, file_name)) as doc:
2323
print_annotation_details(doc)
2424
print("=" * 50)
25+
26+
27+
import fitz
28+
29+
def extract_text_and_images(doc):
    """Print the text of every page in *doc* and dump its embedded images.

    For each page the plain text is printed, then every image referenced by
    the page is extracted, its pixel dimensions are printed, and the raw
    image bytes are written to ``image_<xref>.<ext>`` in the current working
    directory.

    Args:
        doc: An opened PyMuPDF document (iterable over pages and providing
            ``extract_image``).
    """
    for page in doc:
        # Extract text
        text = page.get_text("text")
        print("Extracted text:")
        print(text)

        # Extract images
        image_list = page.get_images(full=True)
        print("Found images:")
        for img in image_list:
            xref = img[0]  # XREF of the image
            base_image = doc.extract_image(xref)
            print(f"Image {xref} details: {base_image['width']}x{base_image['height']} pixels")

            # The extracted bytes keep their embedded encoding, so the file
            # suffix must come from the reported 'ext' rather than being
            # hard-coded; fall back to 'png' if no extension is reported.
            extension = base_image.get('ext') or 'png'
            image_filename = f'image_{xref}.{extension}'
            with open(image_filename, 'wb') as imgfile:
                imgfile.write(base_image['image'])
47+
48+
# Open the PDF. The context manager closes the document once extraction
# finishes, including when extraction raises.
file_path = 'path_to_your_pdf.pdf'  # Specify the path to your PDF
with fitz.open(file_path) as doc:
    extract_text_and_images(doc)
52+

0 commit comments

Comments
 (0)