|
| 1 | +import PyPDF2 |
| 2 | +import fitz # PyMuPDF |
| 3 | +from docx import Document |
| 4 | +from PIL import Image |
| 5 | +from docx.shared import Pt |
| 6 | +from docx.enum.text import WD_PARAGRAPH_ALIGNMENT |
| 7 | + |
def pdf_to_image(pdf_path, image_path):
    """Render every page of a PDF to its own PNG file.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        image_path: Filename prefix for the output. Page N (1-based) is
            written to ``f"{image_path}_page_{N}.png"``; the value is used
            as a plain string prefix, not as a directory.
    """
    # The context manager closes the document even if rendering or saving a
    # page fails; previously the handle was never closed.
    with fitz.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document, start=1):
            pixmap = page.get_pixmap()
            pixmap.save(f"{image_path}_page_{page_number}.png")
| 14 | + |
def pdf_to_text(pdf_path, text_path):
    """Extract the text of every page of a PDF into a UTF-8 text file.

    Args:
        pdf_path: Path of the PDF to read with PyPDF2.
        text_path: Path of the text file to create or overwrite.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Page texts are concatenated with no separator, exactly as before;
        # a single join replaces the repeated ``text +=`` in the loop.
        # ``or ''`` keeps the join safe should a page yield no text object.
        text = ''.join(page.extract_text() or '' for page in reader.pages)

    with open(text_path, 'w', encoding='utf-8') as text_file:
        text_file.write(text)
| 24 | + |
def text_to_document(text_path, doc_path):
    """Convert a UTF-8 text file into a .docx with one paragraph per line.

    Each line is stripped of surrounding whitespace, left-aligned and set
    in 12pt. Blank lines become empty paragraphs.

    Args:
        text_path: Path of the text file to read.
        doc_path: Path of the .docx file to write.
    """
    document = Document()
    with open(text_path, 'r', encoding='utf-8') as text_file:
        for line in text_file:
            paragraph = document.add_paragraph()
            paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
            # add_paragraph(text) creates no run when the text is empty, so
            # indexing runs[0] raised IndexError on blank lines. Adding the
            # run explicitly always yields a run to format.
            run = paragraph.add_run(line.strip())
            run.font.size = Pt(12)  # Set font size to 12pt (adjust as needed)
            # You can add more formatting options here

    document.save(doc_path)
| 36 | + |
# Example usage: convert one PDF to page images, a text file, and a .docx.
doc_output_path = r"C:\Users\DELL\document.docx"
text_output_path = r"C:\Users\DELL\textfile.txt"
# NOTE(review): pdf_to_image treats this value as a filename prefix, so the
# images are written as "<this value>_page_N.png" rather than inside the
# folder -- confirm that this is intended.
image_output_path = r"C:\Users\DELL\Downloads"
pdf_file = r"C:\Users\DELL\Downloads\kavya junuthula (3).pdf"

# The text file must exist before it can be turned into a document, so the
# call order matters.
pdf_to_image(pdf_file, image_output_path)
pdf_to_text(pdf_file, text_output_path)
text_to_document(text_output_path, doc_output_path)
| 46 | +Footer |
| 47 | +© 2024 GitHub, Inc. |
| 48 | +Footer navigation |
| 49 | +Terms |
| 50 | +Privacy |
| 51 | +Security |
| 52 | +Status |
| 53 | +import PyPDF2 |
| 54 | +import fitz # PyMuPDF |
| 55 | +from docx import Document |
| 56 | +from PIL import Image |
| 57 | +from docx.shared import Pt |
| 58 | +from docx.enum.text import WD_PARAGRAPH_ALIGNMENT |
| 59 | + |
def pdf_to_image(pdf_path, image_path):
    """Save each page of the PDF at ``pdf_path`` as a separate PNG image.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        image_path: String prefix of the output files; page N (counting
            from 1) is saved as ``f"{image_path}_page_{N}.png"``.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        for page_index in range(len(pdf_document)):
            pixmap = pdf_document[page_index].get_pixmap()
            pixmap.save(f"{image_path}_page_{page_index + 1}.png")
    finally:
        # Release the file handle even when a page fails to render or save;
        # previously the document was left open.
        pdf_document.close()
| 66 | + |
def pdf_to_text(pdf_path, text_path):
    """Write the extracted text of all pages of a PDF to a UTF-8 file.

    Args:
        pdf_path: Path of the PDF to read with PyPDF2.
        text_path: Path of the output text file (created or overwritten).
    """
    page_texts = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # Fall back to an empty string so that a page yielding no text
            # object cannot break the concatenation below.
            page_texts.append(page.extract_text() or '')

    # Pages are joined with no separator, matching the previous output,
    # while avoiding repeated string concatenation.
    with open(text_path, 'w', encoding='utf-8') as text_file:
        text_file.write(''.join(page_texts))
| 76 | + |
def text_to_document(text_path, doc_path):
    """Build a .docx from a UTF-8 text file, one paragraph per input line.

    Lines are stripped of surrounding whitespace, left-aligned, and their
    text is set in 12pt. A blank line produces an empty paragraph.

    Args:
        text_path: Path of the text file to read.
        doc_path: Path of the .docx file to write.
    """
    document = Document()
    with open(text_path, 'r', encoding='utf-8') as text_file:
        for line in text_file:
            paragraph = document.add_paragraph(line.strip())
            paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
            # A paragraph created from empty text has no runs, so runs[0]
            # raised IndexError on blank lines; looping over the runs
            # formats whatever is there and skips empty paragraphs.
            for run in paragraph.runs:
                run.font.size = Pt(12)  # Set font size to 12pt (adjust as needed)
            # You can add more formatting options here

    document.save(doc_path)
| 88 | + |
# Example usage: run the three conversions on a single PDF.
pdf_file = r"C:\Users\DELL\Downloads\Deepa Ajmeera (3).pdf"

# Output locations.
# NOTE(review): image_output_path is used by pdf_to_image as a filename
# prefix ("<prefix>_page_N.png"), not as a directory -- verify the intent.
image_output_path = r"C:\Users\DELL\Downloads"
text_output_path = r"C:\Users\DELL\textfile.txt"
doc_output_path = r"C:\Users\DELL\document.docx"

# text_to_document reads the file that pdf_to_text writes, so keep this order.
pdf_to_image(pdf_file, image_output_path)
pdf_to_text(pdf_file, text_output_path)
text_to_document(text_output_path, doc_output_path)
0 commit comments