Add files via upload

jahnabiroy · web-flow · commit 87ff69ea1582 · 2024-12-05T12:11:49.000+05:30
diff --git a/README.md b/README.md
@@ -0,0 +1,36 @@
+# Overview
+
+This project is designed to parse text from various media types: audio (`.wav`), video (`.mp4`), and text documents (`.pdf`). The implementation utilizes Python and its libraries, relying exclusively on free APIs and libraries for unlimited usage.
+
+## Features
+
+1. **_Parse Text from Video_**
+   - Implementation:
+     Extracts audio from the video in .wav format using the Moviepy library.
+     Processes the audio through text extraction, reusing the logic implemented for audio parsing.
+   - Issues Faced:
+     Alternative libraries, such as SpeechRecognition, proved less efficient than Moviepy.
+2. **_Parse Text from Audio_**
+   - Implementation:
+     Utilized the SpeechRecognition library with the Google Web Speech-to-Text service for audio transcription.
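+     Worked around the service's limit of roughly 120 seconds of audio per request by splitting the audio into chunks of at most 115 seconds, cut at silences detected with pydub.silence.
+     Errors are handled with fallbacks: an UnknownValueError inserts an `[incoherent]` marker into the transcript, and a RequestError is reported on the console and that chunk is skipped.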
+   - Challenges:
+     Processing long audio files is time-intensive (e.g., a 15-minute audio file may take 30-40 minutes).
+     Paid APIs with better performance were avoided due to limited free usage.
+3. **_Parse Text from PDF_**
+   - Implementation:
+     Used the PyMuPDF library to extract clean and structured text.
+   - Challenges:
+     Preservation of tabular data is not supported natively. However, the row structure of tabular data is maintained, with each column written on a new line.
+     Alternatives like PyPDF2, Textract, and Tika were tested but found less efficient.
+
+## Libraries and Tools
+
+- Text Extraction: `SpeechRecognition`, `PyMuPDF`
+- Audio Processing: `Moviepy`, `pydub`
+- Error Handling: Custom error management for transcription inconsistencies
+
+## Known Limitations
+
+The Google Web Speech-to-Text service has limited accuracy and speed, and tabular data in PDF files is not preserved efficiently. For further improvements, exploring better transcription APIs and table-specific libraries such as `tabula-py` is recommended.
diff --git a/extract_text_from_audio.py b/extract_text_from_audio.py
@@ -0,0 +1,67 @@
+import speech_recognition as sr
+import os
+from pydub import AudioSegment
+from pydub.silence import split_on_silence
+from pydub import AudioSegment, silence
+
+
+# segmenting audio since google speech to text cannot handle processing of audio duration more than 120 seconds.
+def segmenting_audio(
+    path, folder, interval=115000, min_silence_len=500, silence_thresh=-16
+):
+    audio_provided = AudioSegment.from_wav(path)
+    try:
+        os.mkdir(folder)
+    except FileExistsError:
+        pass
+    os.chdir(folder)
+    i = 0  # for iteration
+    k = 0  # indexing the chunks
+    while i < len(audio_provided):
+        start = i
+        end = min(start + interval, len(audio_provided))
+        # detecting silence intervals
+        silence_intervals = silence.detect_silence(
+            audio_provided[start:end],
+            min_silence_len=min_silence_len,
+            silence_thresh=silence_thresh,
+        )
+        if silence_intervals:
+            end_silence = silence_intervals[0][1]
+            end = start + end_silence
+        segment = audio_provided[start:end]
+        segment.export(f"chunk_{k}.wav", format="wav")
+        print(f"finished generating chunk_{k}.wav")
+        k += 1
+        i = end
+    os.chdir("..")
+    final_text = transcribe_audio_chunks(folder)
+    return final_text
+
+
+def transcribe_audio_chunks(folder):
+    recognizer = sr.Recognizer()
+
+    filenames = os.listdir(folder)
+    filenames.sort(key=lambda x: int(x.split("_")[1].split(".")[0]))
+
+    text_transcribed_as_whole = ""
+    for filename in filenames:
+        if filename.endswith(".wav"):
+            chunk_file_path = os.path.join(folder, filename)
+
+            # Load the audio file
+            with sr.AudioFile(chunk_file_path) as source:
+                audio_data = recognizer.record(source)
+
+                try:
+                    text = recognizer.recognize_google(audio_data)
+                    text_transcribed_as_whole += text
+                    text_transcribed_as_whole += " "
+                except sr.UnknownValueError:
+                    text_transcribed_as_whole += " [incoherent] "
+                except sr.RequestError as e:
+                    print(
+                        f"Could not request results from Google Web Speech API for {chunk_file_path}; {e}"
+                    )
+    return text_transcribed_as_whole
diff --git a/extract_text_from_pdf.py b/extract_text_from_pdf.py
@@ -0,0 +1,13 @@
+import fitz  # PyMuPDF
+
+
+def extract_text_from_pdf(pdf_path):
+    text = ""
+    with fitz.open(pdf_path) as pdf_document:
+        for page_num in range(pdf_document.page_count):
+            page = pdf_document.load_page(page_num)
+            text += page.get_text()
+            print(page_num + 1)
+        # page = pdf_document.load_page(54)
+        # text = page.get_text()
+    return text
diff --git a/extract_text_from_video.py b/extract_text_from_video.py
@@ -0,0 +1,10 @@
+import moviepy.editor as mp
+import speech_recognition as sr
+
+
+def extract_audio_from_video(file_path):
+    # to extract audio from video
+    video = mp.VideoFileClip(file_path)
+    audio_file_path = "output.wav"
+    video.audio.write_audiofile(audio_file_path)
+    return audio_file_path
diff --git a/main.py b/main.py
@@ -0,0 +1,35 @@
+import speech_recognition as sr
+import os
+from pydub import AudioSegment
+from pydub.silence import split_on_silence
+from pydub import AudioSegment, silence
+import fitz
+import moviepy.editor as mp
+import sys
+import shutil
+
+from extract_text_from_video import extract_audio_from_video
+from extract_text_from_audio import segmenting_audio
+from extract_text_from_pdf import extract_text_from_pdf
+from write_to_file import write_to_file
+
+
+def main():
+    multimedia = sys.argv[1]
+    file_name = sys.argv[2]
+    if multimedia == "VIDEO":
+        audio_path = extract_audio_from_video(file_name)
+        transcription = segmenting_audio(audio_path, "audio_segments")
+        write_to_file(transcription, multimedia)
+        shutil.rmtree(f"audio_segments")
+    elif multimedia == "AUDIO":
+        transcription = segmenting_audio(file_name, "audio_segments")
+        write_to_file(transcription, multimedia)
+        shutil.rmtree(f"audio_segments")
+    elif multimedia == "PDF":
+        transcription = extract_text_from_pdf(file_name)
+        write_to_file(transcription, multimedia)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mp3_to_wav.py b/mp3_to_wav.py
@@ -0,0 +1,15 @@
+from pydub import AudioSegment
+
+
+def convert_mp3_to_wav(mp3_file_path, wav_file_path):
+    # Load the MP3 file
+    audio = AudioSegment.from_mp3(mp3_file_path)
+
+    # Export the audio as WAV
+    audio.export(wav_file_path, format="wav")
+
+
+# Example usage
+mp3_file_path = "sunflower.mp3"
+wav_file_path = "output.wav"
+convert_mp3_to_wav(mp3_file_path, wav_file_path)
diff --git a/report.pdf b/report.pdf
diff --git a/write_to_file.py b/write_to_file.py
@@ -0,0 +1,4 @@
+def write_to_file(content, multimedia):
+    with open(multimedia + "_output.txt", "w", encoding="utf-8") as f:
+        f.write("Transcript :\n")
+        f.write(content)