
Commit 54db5f6

Merge pull request #2 from openize-com/muhammadumar-patch
MarkItDown: OpenAI and Claude Strategy Integration
2 parents 00aef24 + 47651f1 · commit 54db5f6


6 files changed, +143 -42 lines changed


packages/markitdown/setup.cfg

Lines changed: 5 additions & 3 deletions
@@ -1,11 +1,12 @@
 
 [metadata]
 name = openize-markitdown-python
-version = 25.4.0
+version = 25.5.0
+
 author = Openize
 author_email = packages@openize.com
 description = A document converter for Word, PDF, Excel, and PowerPoint to Markdown.
-long_description = file:README.md
+long_description = file:README.md
 long_description_content_type = text/markdown
 license = MIT
 license_files = LICENSE

@@ -24,14 +25,15 @@ classifiers =
 
 [options]
 package_dir =
-    = src
+    = src
 packages = find_namespace:
 python_requires = >=3.7
 install_requires =
     aspose-words>=23.0.0
     aspose-cells-python>=23.0.0
     aspose-slides>=23.0.0
     openai>=1.0.0
+    anthropic>=3.0.0
 
 [options.packages.find]
 where = src

packages/markitdown/setup.py

Lines changed: 3 additions & 1 deletion
@@ -18,7 +18,9 @@ def install_if_missing(package, module_name=None):
 dependencies = [
     ("aspose-cells-python", "asposecellspython"),
     ("aspose-words", "asposewords"),
-    ("aspose-slides", "asposeslides")
+    ("aspose-slides", "asposeslides"),
+    ("openai", "openai"),
+    ("anthropic", "anthropic"),
 ]
 
 # Install missing dependencies before proceeding
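The dependencies list feeds install_if_missing, whose body is outside this diff. A rough illustration of how such a helper typically consumes these (package, module_name) pairs; the implementation below is an assumption for context, not the project's actual code:

    import importlib
    import subprocess
    import sys

    def install_if_missing(package, module_name=None):
        """Install `package` with pip when importing `module_name` (or `package`) fails."""
        try:
            importlib.import_module(module_name or package)
        except ImportError:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])

    # e.g. the pairs added by this commit
    install_if_missing("openai", "openai")
    install_if_missing("anthropic", "anthropic")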

packages/markitdown/src/openize/markitdown/core.py

Lines changed: 21 additions & 7 deletions
@@ -1,20 +1,34 @@
 import os
-
 from processor import DocumentProcessor
-
+from llm_strategy import LLMFactory, SaveLocally
+import logging
 
 class MarkItDown:
-    def __init__(self, output_dir):
+    def __init__(self, output_dir, llm_client_name=None):
         self.output_dir = output_dir
+        self.llm_client_name = llm_client_name
+        self.llm_client = None
 
-    def convert_document(self, input_file, insert_into_llm=False):
+        if llm_client_name:
+            try:
+                self.llm_client = LLMFactory.get_llm(llm_client_name)
+            except ValueError as e:
+                logging.error(f"LLM client error: {e}")
+                self.llm_client = SaveLocally()
+        else:
+            self.llm_client = SaveLocally()
+
+    def convert_document(self, input_file):
         """Run the document conversion process."""
         processor = DocumentProcessor(self.output_dir)
-        processor.process_document(input_file, insert_into_llm)
+        md_file = processor.process_document(input_file)
+
+        if md_file and self.llm_client:
+            self.llm_client.process(md_file)
 
-    def convert_directory(self, input_dir: str, insert_into_llm: bool = False):
+    def convert_directory(self, input_dir: str):
         supported_exts = [".docx", ".pdf", ".xlsx", ".pptx"]
         for filename in os.listdir(input_dir):
             filepath = os.path.join(input_dir, filename)
             if os.path.isfile(filepath) and os.path.splitext(filename)[1].lower() in supported_exts:
-                self.convert_document(filepath, insert_into_llm)
+                self.convert_document(filepath)
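With this change the LLM strategy is selected once in the constructor and applied after each conversion, so callers no longer pass insert_into_llm per call. A minimal usage sketch, assuming the same import style as main.py below and treating ./output, report.docx, and ./docs as placeholder paths:

    from core import MarkItDown

    # Unknown or missing client names fall back to SaveLocally.
    md = MarkItDown(output_dir="./output", llm_client_name="openai")
    md.convert_document("report.docx")   # convert, then hand the Markdown to the chosen client
    md.convert_directory("./docs")       # same, for every supported file in the directory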

packages/markitdown/src/openize/markitdown/llm_strategy.py

Lines changed: 47 additions & 4 deletions
@@ -3,6 +3,9 @@
 from abc import ABC, abstractmethod
 import openai
 
+# Placeholder for Claude SDK import
+# from claude_sdk import ClaudeClient as ClaudeAPIClient
+
 class LLMStrategy(ABC):
     @abstractmethod
     def process(self, md_file):

@@ -12,10 +15,10 @@ class SaveLocally(LLMStrategy):
     def process(self, md_file):
         logging.info(f"File saved locally: {md_file}")
 
-class InsertIntoLLM(LLMStrategy):
+class OpenAIClient(LLMStrategy):
     def __init__(self):
-        self.api_key = os.getenv("OPENAI_API_KEY")  # Read from environment
-        self.model = os.getenv("OPENAI_MODEL", "gpt-4")  # Default model if not set
+        self.api_key = os.getenv("OPENAI_API_KEY")
+        self.model = os.getenv("OPENAI_MODEL", "gpt-4")
 
         if not self.api_key:
             raise ValueError("Missing OpenAI API key. Please set it in the environment.")

@@ -40,11 +43,51 @@ def process(self, md_file):
             )
 
             llm_response = response.choices[0].message.content
-            logging.info(f"LLM Response for {md_file}: {llm_response}")
+            logging.info(f"OpenAI Response for {md_file}: {llm_response}")
 
         except FileNotFoundError:
             logging.error(f"Markdown file not found: {md_file}")
         except openai.OpenAIError as e:
             logging.error(f"OpenAI API error while processing {md_file}: {e}")
         except Exception as e:
             logging.exception(f"Unexpected error processing {md_file}: {e}")
+
+class ClaudeClient(LLMStrategy):
+    def __init__(self):
+        self.api_key = os.getenv("CLAUDE_API_KEY")
+        self.model = os.getenv("CLAUDE_MODEL", "claude-v1")
+
+        if not self.api_key:
+            raise ValueError("Missing Claude API key. Please set it in the environment.")
+
+        # Initialize Claude client here (replace with actual SDK code)
+        # self.client = ClaudeAPIClient(api_key=self.api_key)
+
+    def process(self, md_file):
+        try:
+            with open(md_file, "r", encoding="utf-8") as file:
+                content = file.read()
+
+            # Replace with actual Claude API call
+            # response = self.client.complete(prompt=content, model=self.model)
+
+            # Dummy placeholder response
+            response_text = f"Simulated Claude response for {md_file}"
+
+            logging.info(f"Claude Response for {md_file}: {response_text}")
+
+        except FileNotFoundError:
+            logging.error(f"Markdown file not found: {md_file}")
+        except Exception as e:
+            logging.exception(f"Unexpected error processing {md_file}: {e}")
+
+
+class LLMFactory:
+    @staticmethod
+    def get_llm(client_name: str) -> LLMStrategy:
+        client_name = client_name.lower()
+        if client_name == "openai":
+            return OpenAIClient()
+        elif client_name == "claude":
+            return ClaudeClient()
+        else:
+            raise ValueError(f"Unknown LLM client: {client_name}")
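As committed, ClaudeClient never calls an API: the SDK import and client calls are commented out and process() only logs a simulated response. A hedged sketch of what the live path could look like, assuming the anthropic package that setup.cfg now requires and its Anthropic(...).messages.create(...) interface (the same call the new tests mock); the class name and max_tokens value below are illustrative, not part of this commit:

    import logging
    import os

    from anthropic import Anthropic  # assumed SDK entry point; the commit only adds the dependency

    class ClaudeClientLive:
        """Illustrative variant of ClaudeClient that performs a real API call."""

        def __init__(self):
            self.api_key = os.getenv("CLAUDE_API_KEY")
            # Default kept from the commit; the messages API may require a newer model name.
            self.model = os.getenv("CLAUDE_MODEL", "claude-v1")
            if not self.api_key:
                raise ValueError("Missing Claude API key. Please set it in the environment.")
            self.client = Anthropic(api_key=self.api_key)

        def process(self, md_file):
            with open(md_file, "r", encoding="utf-8") as file:
                content = file.read()
            response = self.client.messages.create(
                model=self.model,
                max_tokens=1024,
                messages=[{"role": "user", "content": content}],
            )
            logging.info(f"Claude Response for {md_file}: {response.content[0].text}")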

packages/markitdown/src/openize/markitdown/main.py

Lines changed: 14 additions & 10 deletions
@@ -5,7 +5,6 @@
 from core import MarkItDown
 from license_manager import LicenseManager
 
-
 def ask_user_boolean(question):
     """Ask the user a yes/no question and return True/False."""
     while True:

@@ -17,7 +16,6 @@ def ask_user_boolean(question):
         else:
             print("Invalid input. Please enter 'yes' or 'no'.")
 
-
 def ensure_env_variable(var_name, prompt_message, default=None):
     """Ensure an environment variable is set, otherwise ask the user and persist it."""
     value = os.getenv(var_name)

@@ -31,7 +29,6 @@ def ensure_env_variable(var_name, prompt_message, default=None):
 
     return value
 
-
 def set_env_variable(var_name, value):
     """Set an environment variable persistently on Windows and Linux/macOS."""
     os.environ[var_name] = value  # Set for the current session

@@ -42,7 +39,6 @@ def set_env_variable(var_name, value):
         os.system(f'echo "export {var_name}={value}" >> ~/.bashrc')
         os.system(f'echo "export {var_name}={value}" >> ~/.profile')
 
-
 def main():
     """Entry point for the CLI tool."""
     logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

@@ -52,7 +48,8 @@ def main():
     input_group.add_argument("--input-file", help="Path to the input document (PDF, Word, etc.)")
     input_group.add_argument("--input-dir", help="Path to a directory containing supported documents")
     parser.add_argument("-o", "--output-dir", required=True, help="Directory to save the converted Markdown file(s)")
-    parser.add_argument("--insert-into-llm", action="store_true", help="Insert output into LLM")
+    parser.add_argument("--llm", choices=["none", "openai", "claude"], default="none",
+                        help="Choose LLM client to process output (none, openai, claude)")
 
     args = parser.parse_args()
 

@@ -64,19 +61,26 @@
         LicenseManager().apply_license()
 
         # Setup LLM credentials only if required
-        if args.insert_into_llm:
+        if args.llm == "openai":
             ensure_env_variable("OPENAI_API_KEY", "Enter your OpenAI API key: ")
             ensure_env_variable("OPENAI_MODEL", "Enter OpenAI model name (default: gpt-4): ", default="gpt-4")
+        elif args.llm == "claude":
+            ensure_env_variable("CLAUDE_API_KEY", "Enter your Claude API key: ")
+            ensure_env_variable("CLAUDE_MODEL", "Enter Claude model name (default: claude-v1): ", default="claude-v1")
 
-        # Run conversion for either a single file or a directory
-        markitdown = MarkItDown(args.output_dir)
+        # Initialize MarkItDown with selected LLM
+        llm_client_name = args.llm if args.llm != "none" else None
+        markitdown = MarkItDown(args.output_dir, llm_client_name)
 
+        # Run conversion for either a single file or a directory
         if args.input_file:
-            markitdown.convert_document(args.input_file, args.insert_into_llm)
+            markitdown.convert_document(args.input_file)
         elif args.input_dir:
-            markitdown.convert_directory(args.input_dir, args.insert_into_llm)
+            markitdown.convert_directory(args.input_dir)
 
     except Exception as e:
         logging.error(f"Error: {e}", exc_info=True)
         sys.exit(1)
 
+if __name__ == "__main__":
+    main()
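With this change the old --insert-into-llm flag is gone and the client is chosen with --llm instead. An invocation would presumably now look something like: python main.py --input-file report.docx -o ./output --llm claude, where the file and directory names are placeholders; the matching OPENAI_* or CLAUDE_* variables are prompted for and persisted if they are not already set.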

packages/markitdown/tests/test.py

Lines changed: 53 additions & 17 deletions
@@ -1,10 +1,12 @@
 import pytest
 from pathlib import Path
+import os
+
 from ..src.openize.markitdown.converters import WordConverter, PDFConverter, ExcelConverter, PowerPointConverter
 from ..src.openize.markitdown.factory import ConverterFactory
-from ..src.openize.markitdown.llm_strategy import SaveLocally, InsertIntoLLM
+from ..src.openize.markitdown.llm_strategy import SaveLocally, LLMFactory, OpenAIClient, ClaudeClient
 from ..src.openize.markitdown.processor import DocumentProcessor
-import os
+
 
 @pytest.fixture
 def sample_output_dir():

@@ -18,49 +20,83 @@ def sample_md_file(sample_output_dir):
     md_file.write_text("# Sample Markdown File\n\nThis is a test.")
     return md_file
 
-# Test Converters
-def test_word_converter(sample_output_dir):
+
+# --------- Converter Tests ---------
+
+def test_word_converter():
     converter = WordConverter()
     assert converter is not None
 
-def test_pdf_converter(sample_output_dir):
+def test_pdf_converter():
     converter = PDFConverter()
     assert converter is not None
 
-def test_excel_converter(sample_output_dir):
+def test_excel_converter():
     converter = ExcelConverter()
     assert converter is not None
 
-def test_ppt_converter(sample_output_dir):
+def test_ppt_converter():
     converter = PowerPointConverter()
     assert converter is not None
 
-# Test ConverterFactory
+
+# --------- Factory Tests ---------
+
 def test_converter_factory():
     assert isinstance(ConverterFactory.get_converter(".docx"), WordConverter)
     assert isinstance(ConverterFactory.get_converter(".pdf"), PDFConverter)
     assert isinstance(ConverterFactory.get_converter(".xlsx"), ExcelConverter)
     assert isinstance(ConverterFactory.get_converter(".pptx"), PowerPointConverter)
 
 
-# Test LLM Strategy
+# --------- Strategy Pattern Tests ---------
+
 def test_save_locally(sample_md_file):
     strategy = SaveLocally()
     strategy.process(sample_md_file)
     assert sample_md_file.exists()
 
-def test_insert_into_llm(mocker, sample_md_file):
-    mocker.patch("openai.ChatCompletion.create", return_value={"choices": [{"message": {"content": "LLM Response"}}]})
-    strategy = InsertIntoLLM()
+def test_insert_into_llm_openai(mocker, sample_md_file):
+    mocker.patch("openai.ChatCompletion.create", return_value={
+        "choices": [{"message": {"content": "Mocked OpenAI Response"}}]
+    })
+    strategy = OpenAIClient(provider="openai")
+    strategy.process(sample_md_file)
+
+def test_insert_into_llm_claude(mocker, sample_md_file):
+    mock_anthropic = mocker.patch("openize.markitdown.llm_strategy.Anthropic")
+    mock_client = mock_anthropic.return_value
+    mock_client.messages.create.return_value.content = "Mocked Claude Response"
+    strategy = ClaudeClient(provider="claude")
     strategy.process(sample_md_file)
 
-# Test DocumentProcessor
-def test_document_processor(mocker, sample_output_dir):
-    mocker.patch("packages.src.openize.markitdown.factory.ConverterFactory.get_converter", return_value=WordConverter())
+
+# --------- Document Processor Tests ---------
+
+def test_document_processor_local_conversion(mocker, sample_output_dir):
+    mock_converter = mocker.patch("openize.markitdown.factory.ConverterFactory.get_converter", return_value=WordConverter())
     processor = DocumentProcessor(output_dir=sample_output_dir)
     processor.process_document("sample.docx", insert_into_llm=False)
     output_file = sample_output_dir / "sample.md"
     assert output_file.exists()
 
-if __name__ == "__main__":
-    pytest.main()
+def test_document_processor_with_llm_openai(mocker, sample_output_dir):
+    mock_converter = mocker.patch("openize.markitdown.factory.ConverterFactory.get_converter", return_value=WordConverter())
+    mocker.patch("openai.ChatCompletion.create", return_value={
+        "choices": [{"message": {"content": "LLM Output"}}]
+    })
+    processor = DocumentProcessor(output_dir=sample_output_dir)
+    processor.process_document("sample.docx", insert_into_llm=True, llm_provider="openai")
+    output_file = sample_output_dir / "sample.md"
+    assert output_file.exists()
+
+def test_document_processor_with_llm_claude(mocker, sample_output_dir):
+    mock_converter = mocker.patch("openize.markitdown.factory.ConverterFactory.get_converter", return_value=WordConverter())
+    mock_anthropic = mocker.patch("openize.markitdown.llm_strategy.Anthropic")
+    mock_client = mock_anthropic.return_value
+    mock_client.messages.create.return_value.content = "LLM Claude Output"
+    processor = DocumentProcessor(output_dir=sample_output_dir)
+    processor.process_document("sample.docx", insert_into_llm=True, llm_provider="claude")
+    output_file = sample_output_dir / "sample.md"
+    assert output_file.exists()
+
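The new tests rely on the mocker fixture from the pytest-mock plugin, so running them presumably requires pytest-mock to be installed alongside pytest, e.g. pytest packages/markitdown/tests/test.py from the repository root.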
