|
| 1 | +import PyPDF2 |
| 2 | +import fitz # PyMuPDF |
| 3 | +from docx import Document |
| 4 | +from PIL import Image |
| 5 | +from docx.shared import Pt |
| 6 | +from docx.enum.text import WD_PARAGRAPH_ALIGNMENT |
| 7 | + |
def pdf_to_image(pdf_path, image_path):
    """Render every page of a PDF to its own PNG file.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        image_path: Either an existing directory, in which case the images
            are written inside it as ``page<N>.png``, or a filename prefix
            to which ``page<N>.png`` is appended verbatim.
    """
    import os  # local import: keeps the function usable without touching file-level imports

    # Only an existing directory gets a path separator appended. A plain
    # prefix (or a directory already ending in a separator) produces exactly
    # the same file names as before, while a bare directory no longer has
    # "page1.png" glued onto its last path component.
    if os.path.isdir(image_path):
        prefix = os.path.join(image_path, "")
    else:
        prefix = image_path

    pdf_document = fitz.open(pdf_path)
    try:
        # Page numbers in the file names are 1-based.
        for page_number, page in enumerate(pdf_document, start=1):
            page.get_pixmap().save(f"{prefix}page{page_number}.png")
    finally:
        # Release the underlying file even if rendering or saving fails.
        pdf_document.close()
| 14 | + |
def pdf_to_text(pdf_path, text_path):
    """Extract the text of every page of a PDF into a UTF-8 text file.

    Args:
        pdf_path: Path of the PDF file to read.
        text_path: Path of the text file to create or overwrite.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction happens while the PDF is still open. ``or ''`` guards
        # against a page for which no text is returned.
        page_texts = [page.extract_text() or '' for page in reader.pages]

    with open(text_path, 'w', encoding='utf-8') as text_file:
        # Pages are separated by a newline so the last line of one page is
        # not fused with the first line of the next.
        text_file.write('\n'.join(page_texts))
| 24 | + |
def text_to_document(text_path, doc_path):
    """Build a .docx file with one left-aligned, 12pt paragraph per text line.

    Args:
        text_path: Path of the UTF-8 text file to read.
        doc_path: Path of the Word document to write.
    """
    document = Document()
    with open(text_path, 'r', encoding='utf-8') as text_file:
        for line in text_file:
            # strip() removes the trailing newline and surrounding whitespace.
            paragraph = document.add_paragraph(line.strip())
            paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
            # A blank line yields a paragraph without any run, so indexing
            # runs[0] would raise IndexError; iterate over the runs instead.
            for run in paragraph.runs:
                run.font.size = Pt(12)  # Set font size to 12pt (adjust as needed)
                # You can add more formatting options here

    document.save(doc_path)
| 36 | + |
# Example usage: turn one PDF into page images, a text file and a Word document.
# Input PDF and the three output locations.
pdf_file = r"C:\Users\DELL\Downloads\Roshini Khammam.pdf"
image_output_path = r"C:\Users\DELL\Downloads"
text_output_path = r"C:\Users\DELL\textfile.txt"
doc_output_path = r"C:\Users\DELL\document.docx"

pdf_to_image(pdf_file, image_output_path)
# pdf_to_text must run before text_to_document: the latter reads the text
# file that the former writes.
pdf_to_text(pdf_file, text_output_path)
text_to_document(text_output_path, doc_output_path)
| 46 | + |
0 commit comments