Skip to content

Commit 87ff69e

Browse files
authored
Add files via upload
0 parents  commit 87ff69e

File tree

8 files changed

+180
-0
lines changed

8 files changed

+180
-0
lines changed

README.md

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Overview
2+
3+
This project is designed to parse text from various media types: audio (`.wav`), video (`.mp4`), and text documents (`.pdf`). The implementation utilizes Python and its libraries, relying exclusively on free APIs and libraries for unlimited usage.
4+
5+
## Features
6+
7+
1. **_Parse Text from Video_**
8+
- Implementation:
9+
Extracts audio from the video in .wav format using the Moviepy library.
10+
Processes the audio through text extraction, reusing the logic implemented for audio parsing.
11+
- Issues Faced:
12+
Alternative libraries, such as SpeechRecognition, proved less efficient than Moviepy.
13+
2. **_Parse Text from Audio_**
14+
- Implementation:
15+
Utilized the SpeechRecognition library with the Google Web Speech-to-Text service for audio transcription.
16+
Worked around the roughly 120-second per-request limit of the Google Web Speech-to-Text service by splitting the audio into chunks of at most 115 seconds, cut at silences detected with `pydub.silence`.
17+
Errors such as UnknownValueError and RequestError were managed with appropriate fallbacks.
18+
- Challenges:
19+
Processing long audio files is time-intensive (e.g., 15-minute audio may take 30-40 minutes).
20+
Paid APIs with better performance were avoided due to limited free usage.
21+
3. **_Parse Text from PDF_**
22+
- Implementation:
23+
Used the PyMuPDF library to extract clean and structured text.
24+
- Challenges:
25+
Preservation of tabular data is not supported natively. However, the row structure of tabular data is maintained by outputting each column value on a new line.
26+
Alternatives like PyPDF2, Textract, and Tika were tested but found less efficient.
27+
28+
## Libraries and Tools
29+
30+
- Text Extraction: `SpeechRecognition`, `PyMuPDF`
31+
- Audio Processing: `Moviepy`, `pydub`
32+
- Error Handling: Custom error management for transcription inconsistencies
33+
34+
## Known Limitations
35+
36+
The Google Web Speech-to-Text service offers limited accuracy and speed, and tabular data in PDF files is not preserved efficiently. For further improvement, exploring better transcription APIs and table-specific libraries such as `tabula-py` is recommended.

extract_text_from_audio.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import speech_recognition as sr
2+
import os
3+
from pydub import AudioSegment
4+
from pydub.silence import split_on_silence
5+
from pydub import AudioSegment, silence
6+
7+
8+
# Segment the audio because Google speech-to-text cannot process audio longer than 120 seconds.
9+
def segmenting_audio(
    path, folder, interval=115000, min_silence_len=500, silence_thresh=-16
):
    """Split a WAV file into chunks, write them to ``folder`` and transcribe them.

    The audio is walked in windows of at most ``interval`` milliseconds. When a
    window does not reach the end of the audio, the chunk is cut at the end of
    the last silence detected inside that window, so each chunk stays as long
    as possible without exceeding ``interval``.

    Args:
        path: Path of the WAV file to split.
        folder: Directory that receives ``chunk_<k>.wav`` files; created if
            it does not exist.
        interval: Maximum chunk length in milliseconds.
        min_silence_len: Minimum silence length in milliseconds, passed to
            ``silence.detect_silence``.
        silence_thresh: Silence threshold passed to ``silence.detect_silence``.

    Returns:
        The value returned by ``transcribe_audio_chunks(folder)``.

    Raises:
        ValueError: If ``interval`` is not positive (the loop could never
            advance).
    """
    if interval <= 0:
        raise ValueError("interval must be a positive number of milliseconds")

    audio_provided = AudioSegment.from_wav(path)
    # Write chunks via explicit paths instead of os.chdir(): the working
    # directory is never changed, so a failure cannot leave the process
    # stranded inside ``folder`` and nested/absolute folders work too.
    os.makedirs(folder, exist_ok=True)

    total_length = len(audio_provided)
    start = 0
    chunk_index = 0
    while start < total_length:
        end = min(start + interval, total_length)
        # Only look for a cut point when more audio follows this window;
        # the final window is exported whole.
        if end < total_length:
            silence_intervals = silence.detect_silence(
                audio_provided[start:end],
                min_silence_len=min_silence_len,
                silence_thresh=silence_thresh,
            )
            if silence_intervals:
                # Offsets are relative to the window. Use the LAST silence so
                # the chunk is as long as the window allows.
                cut = silence_intervals[-1][1]
                if cut > 0:  # guard against a zero-length step
                    end = start + cut
        chunk_name = f"chunk_{chunk_index}.wav"
        audio_provided[start:end].export(
            os.path.join(folder, chunk_name), format="wav"
        )
        print(f"finished generating {chunk_name}")
        chunk_index += 1
        start = end

    return transcribe_audio_chunks(folder)
40+
41+
42+
def _chunk_index(filename):
    """Return the integer index of a ``<prefix>_<N>.wav`` file name, else None."""
    if not filename.endswith(".wav"):
        return None
    parts = filename.split("_")
    if len(parts) < 2:
        return None
    try:
        return int(parts[1].split(".")[0])
    except ValueError:
        return None


def transcribe_audio_chunks(folder):
    """Transcribe every indexed ``.wav`` chunk in ``folder`` in numeric order.

    Files whose names do not follow the ``<prefix>_<N>.wav`` pattern are
    ignored, so stray files in the folder no longer break the sort.

    Args:
        folder: Directory containing the chunk files.

    Returns:
        The concatenated transcript. Each recognized chunk contributes its
        text followed by a space; a chunk that raises
        ``sr.UnknownValueError`` contributes ``" [incoherent] "``; a chunk
        that raises ``sr.RequestError`` is reported on stdout and
        contributes nothing. Returns ``""`` when the folder holds no chunk
        files.
    """
    indexed_files = []
    for filename in os.listdir(folder):
        index = _chunk_index(filename)
        if index is not None:
            indexed_files.append((index, filename))
    if not indexed_files:
        return ""
    # Sort on the numeric index so chunk_10 follows chunk_9, not chunk_1.
    indexed_files.sort(key=lambda pair: pair[0])

    recognizer = sr.Recognizer()
    pieces = []
    for _, filename in indexed_files:
        chunk_file_path = os.path.join(folder, filename)

        # Load the audio file
        with sr.AudioFile(chunk_file_path) as source:
            audio_data = recognizer.record(source)

        try:
            text = recognizer.recognize_google(audio_data)
        except sr.UnknownValueError:
            pieces.append(" [incoherent] ")
        except sr.RequestError as e:
            print(
                f"Could not request results from Google Web Speech API for {chunk_file_path}; {e}"
            )
        else:
            pieces.append(text + " ")
    return "".join(pieces)

extract_text_from_pdf.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import fitz # PyMuPDF
2+
3+
4+
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, opened with ``fitz.open``.

    Returns:
        The ``get_text()`` output of all pages joined into one string.
    """
    page_texts = []
    with fitz.open(pdf_path) as document:
        for index in range(document.page_count):
            page_texts.append(document.load_page(index).get_text())
            # Progress indicator: 1-based number of the page just processed.
            print(index + 1)
    return "".join(page_texts)

extract_text_from_video.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import moviepy.editor as mp
2+
import speech_recognition as sr
3+
4+
5+
def extract_audio_from_video(file_path):
    """Write the audio track of a video to ``output.wav`` and return that path.

    Args:
        file_path: Path of the video file, opened with ``mp.VideoFileClip``.

    Returns:
        The string ``"output.wav"``, the path the audio was written to
        (relative to the current working directory).

    Raises:
        ValueError: If the clip exposes no audio (``video.audio`` is None).
    """
    audio_file_path = "output.wav"
    video = mp.VideoFileClip(file_path)
    try:
        if video.audio is None:
            raise ValueError(f"No audio track found in video file: {file_path}")
        video.audio.write_audiofile(audio_file_path)
    finally:
        # Always release the clip's underlying readers, even on failure.
        video.close()
    return audio_file_path

main.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import speech_recognition as sr
2+
import os
3+
from pydub import AudioSegment
4+
from pydub.silence import split_on_silence
5+
from pydub import AudioSegment, silence
6+
import fitz
7+
import moviepy.editor as mp
8+
import sys
9+
import shutil
10+
11+
from extract_text_from_video import extract_audio_from_video
12+
from extract_text_from_audio import segmenting_audio
13+
from extract_text_from_pdf import extract_text_from_pdf
14+
from write_to_file import write_to_file
15+
16+
17+
def main():
    """Command-line entry point: ``main.py {VIDEO|AUDIO|PDF} <file_path>``.

    Reads the media type from ``sys.argv[1]`` (case-insensitive) and the input
    path from ``sys.argv[2]``, extracts the text and passes it to
    ``write_to_file`` together with the upper-cased media type.

    For VIDEO the audio is first extracted with ``extract_audio_from_video``;
    VIDEO and AUDIO are then transcribed with ``segmenting_audio`` using the
    ``audio_segments`` folder, which is removed afterwards even when
    transcription fails. PDF input goes through ``extract_text_from_pdf``.

    Raises:
        SystemExit: With a usage message when arguments are missing or the
            media type is not supported.
    """
    usage = "Usage: python main.py {VIDEO|AUDIO|PDF} <file_path>"
    if len(sys.argv) < 3:
        sys.exit(usage)

    multimedia = sys.argv[1].upper()
    file_name = sys.argv[2]
    if multimedia not in ("VIDEO", "AUDIO", "PDF"):
        sys.exit(f"Unsupported media type {sys.argv[1]!r}. {usage}")

    if multimedia == "PDF":
        transcription = extract_text_from_pdf(file_name)
    else:
        segments_folder = "audio_segments"
        if multimedia == "VIDEO":
            audio_path = extract_audio_from_video(file_name)
        else:
            audio_path = file_name
        try:
            transcription = segmenting_audio(audio_path, segments_folder)
        finally:
            # Remove the chunk folder even on failure so stale chunks are
            # never picked up by a later run.
            shutil.rmtree(segments_folder, ignore_errors=True)

    write_to_file(transcription, multimedia)


if __name__ == "__main__":
    main()

mp3_to_wav.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from pydub import AudioSegment
2+
3+
4+
def convert_mp3_to_wav(mp3_file_path, wav_file_path):
    """Convert an MP3 file to WAV.

    Args:
        mp3_file_path: Path of the MP3 file, loaded with
            ``AudioSegment.from_mp3``.
        wav_file_path: Destination passed to ``AudioSegment.export`` with
            ``format="wav"``.
    """
    # Load the MP3 file
    audio = AudioSegment.from_mp3(mp3_file_path)

    # Export the audio as WAV
    audio.export(wav_file_path, format="wav")


# Example usage
mp3_file_path = "sunflower.mp3"
wav_file_path = "output.wav"

# Run the example only when executed as a script, so that importing this
# module does not trigger a conversion (or fail when the sample is absent).
if __name__ == "__main__":
    convert_mp3_to_wav(mp3_file_path, wav_file_path)

report.pdf

112 KB
Binary file not shown.

write_to_file.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
def write_to_file(content, multimedia):
    """Write a transcript to ``<multimedia>_output.txt`` as UTF-8.

    Args:
        content: Transcript text, written after the header line.
        multimedia: Prefix of the output file name.
    """
    output_path = multimedia + "_output.txt"
    with open(output_path, "w", encoding="utf-8") as out_file:
        # Header line first, then the transcript itself.
        out_file.writelines(("Transcript :\n", content))

0 commit comments

Comments
 (0)