
Commit 54db5f6

Merge pull request #2 from openize-com/muhammadumar-patch
MarkItDown: OpenAI and Claude Strategy Integration
2 parents 00aef24 + 47651f1 · commit 54db5f6


6 files changed, +143 -42 lines changed


packages/markitdown/setup.cfg

Lines changed: 5 additions & 3 deletions
@@ -1,11 +1,12 @@
 
 [metadata]
 name = openize-markitdown-python
-version = 25.4.0
+version = 25.5.0
+
 author = Openize
 author_email = packages@openize.com
 description = A document converter for Word, PDF, Excel, and PowerPoint to Markdown.
-long_description = file:README.md
+long_description = file:README.md
 long_description_content_type = text/markdown
 license = MIT
 license_files = LICENSE

@@ -24,14 +25,15 @@ classifiers =
 
 [options]
 package_dir =
-    = src
+    = src
 packages = find_namespace:
 python_requires = >=3.7
 install_requires =
     aspose-words>=23.0.0
     aspose-cells-python>=23.0.0
     aspose-slides>=23.0.0
     openai>=1.0.0
+    anthropic>=3.0.0
 
 [options.packages.find]
 where = src

packages/markitdown/setup.py

Lines changed: 3 additions & 1 deletion
@@ -18,7 +18,9 @@ def install_if_missing(package, module_name=None):
 dependencies = [
     ("aspose-cells-python", "asposecellspython"),
     ("aspose-words", "asposewords"),
-    ("aspose-slides", "asposeslides")
+    ("aspose-slides", "asposeslides"),
+    ("openai", "openai"),
+    ("anthropic", "anthropic"),
 ]
 
 # Install missing dependencies before proceeding
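The dependencies list feeds install_if_missing, whose body is outside this diff. A rough illustration of how such a helper typically consumes these (package, module_name) pairs; the implementation below is an assumption for context, not the project's actual code:

    import importlib
    import subprocess
    import sys

    def install_if_missing(package, module_name=None):
        """Install `package` with pip when importing `module_name` (or `package`) fails."""
        try:
            importlib.import_module(module_name or package)
        except ImportError:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])

    # e.g. the pairs added by this commit
    install_if_missing("openai", "openai")
    install_if_missing("anthropic", "anthropic")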

packages/markitdown/src/openize/markitdown/core.py

Lines changed: 21 additions & 7 deletions
@@ -1,20 +1,34 @@
 import os
-
 from processor import DocumentProcessor
-
+from llm_strategy import LLMFactory, SaveLocally
+import logging
 
 class MarkItDown:
-    def __init__(self, output_dir):
+    def __init__(self, output_dir, llm_client_name=None):
         self.output_dir = output_dir
+        self.llm_client_name = llm_client_name
+        self.llm_client = None
 
-    def convert_document(self, input_file, insert_into_llm=False):
+        if llm_client_name:
+            try:
+                self.llm_client = LLMFactory.get_llm(llm_client_name)
+            except ValueError as e:
+                logging.error(f"LLM client error: {e}")
+                self.llm_client = SaveLocally()
+        else:
+            self.llm_client = SaveLocally()
+
+    def convert_document(self, input_file):
         """Run the document conversion process."""
         processor = DocumentProcessor(self.output_dir)
-        processor.process_document(input_file, insert_into_llm)
+        md_file = processor.process_document(input_file)
+
+        if md_file and self.llm_client:
+            self.llm_client.process(md_file)
 
-    def convert_directory(self, input_dir: str, insert_into_llm: bool = False):
+    def convert_directory(self, input_dir: str):
         supported_exts = [".docx", ".pdf", ".xlsx", ".pptx"]
         for filename in os.listdir(input_dir):
             filepath = os.path.join(input_dir, filename)
             if os.path.isfile(filepath) and os.path.splitext(filename)[1].lower() in supported_exts:
-                self.convert_document(filepath, insert_into_llm)
+                self.convert_document(filepath)
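With this change the LLM strategy is selected once in the constructor and applied after each conversion, so callers no longer pass insert_into_llm per call. A minimal usage sketch, assuming the same import style as main.py below and treating ./output, report.docx, and ./docs as placeholder paths:

    from core import MarkItDown

    # Unknown or missing client names fall back to SaveLocally.
    md = MarkItDown(output_dir="./output", llm_client_name="openai")
    md.convert_document("report.docx")   # convert, then hand the Markdown to the chosen client
    md.convert_directory("./docs")       # same, for every supported file in the directory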

packages/markitdown/src/openize/markitdown/llm_strategy.py

Lines changed: 47 additions & 4 deletions
@@ -3,6 +3,9 @@
 from abc import ABC, abstractmethod
 import openai
 
+# Placeholder for Claude SDK import
+# from claude_sdk import ClaudeClient as ClaudeAPIClient
+
 class LLMStrategy(ABC):
     @abstractmethod
     def process(self, md_file):

@@ -12,10 +15,10 @@ class SaveLocally(LLMStrategy):
     def process(self, md_file):
         logging.info(f"File saved locally: {md_file}")
 
-class InsertIntoLLM(LLMStrategy):
+class OpenAIClient(LLMStrategy):
     def __init__(self):
-        self.api_key = os.getenv("OPENAI_API_KEY")  # Read from environment
-        self.model = os.getenv("OPENAI_MODEL", "gpt-4")  # Default model if not set
+        self.api_key = os.getenv("OPENAI_API_KEY")
+        self.model = os.getenv("OPENAI_MODEL", "gpt-4")
 
         if not self.api_key:
             raise ValueError("Missing OpenAI API key. Please set it in the environment.")

@@ -40,11 +43,51 @@ def process(self, md_file):
             )
 
             llm_response = response.choices[0].message.content
-            logging.info(f"LLM Response for {md_file}: {llm_response}")
+            logging.info(f"OpenAI Response for {md_file}: {llm_response}")
 
         except FileNotFoundError:
             logging.error(f"Markdown file not found: {md_file}")
         except openai.OpenAIError as e:
             logging.error(f"OpenAI API error while processing {md_file}: {e}")
         except Exception as e:
             logging.exception(f"Unexpected error processing {md_file}: {e}")
+
+class ClaudeClient(LLMStrategy):
+    def __init__(self):
+        self.api_key = os.getenv("CLAUDE_API_KEY")
+        self.model = os.getenv("CLAUDE_MODEL", "claude-v1")
+
+        if not self.api_key:
+            raise ValueError("Missing Claude API key. Please set it in the environment.")
+
+        # Initialize Claude client here (replace with actual SDK code)
+        # self.client = ClaudeAPIClient(api_key=self.api_key)
+
+    def process(self, md_file):
+        try:
+            with open(md_file, "r", encoding="utf-8") as file:
+                content = file.read()
+
+            # Replace with actual Claude API call
+            # response = self.client.complete(prompt=content, model=self.model)
+
+            # Dummy placeholder response
+            response_text = f"Simulated Claude response for {md_file}"
+
+            logging.info(f"Claude Response for {md_file}: {response_text}")
+
+        except FileNotFoundError:
+            logging.error(f"Markdown file not found: {md_file}")
+        except Exception as e:
+            logging.exception(f"Unexpected error processing {md_file}: {e}")
+
+
+class LLMFactory:
+    @staticmethod
+    def get_llm(client_name: str) -> LLMStrategy:
+        client_name = client_name.lower()
+        if client_name == "openai":
+            return OpenAIClient()
+        elif client_name == "claude":
+            return ClaudeClient()
+        else:
+            raise ValueError(f"Unknown LLM client: {client_name}")
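As committed, ClaudeClient never calls an API: the SDK import and client calls are commented out and process() only logs a simulated response. A hedged sketch of what the live path could look like, assuming the anthropic package that setup.cfg now requires and its Anthropic(...).messages.create(...) interface (the same call the new tests mock); the class name and max_tokens value below are illustrative, not part of this commit:

    import logging
    import os

    from anthropic import Anthropic  # assumed SDK entry point; the commit only adds the dependency

    class ClaudeClientLive:
        """Illustrative variant of ClaudeClient that performs a real API call."""

        def __init__(self):
            self.api_key = os.getenv("CLAUDE_API_KEY")
            # Default kept from the commit; the messages API may require a newer model name.
            self.model = os.getenv("CLAUDE_MODEL", "claude-v1")
            if not self.api_key:
                raise ValueError("Missing Claude API key. Please set it in the environment.")
            self.client = Anthropic(api_key=self.api_key)

        def process(self, md_file):
            with open(md_file, "r", encoding="utf-8") as file:
                content = file.read()
            response = self.client.messages.create(
                model=self.model,
                max_tokens=1024,
                messages=[{"role": "user", "content": content}],
            )
            logging.info(f"Claude Response for {md_file}: {response.content[0].text}")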

packages/markitdown/src/openize/markitdown/main.py

Lines changed: 14 additions & 10 deletions
@@ -5,7 +5,6 @@
 from core import MarkItDown
 from license_manager import LicenseManager
 
-
 def ask_user_boolean(question):
     """Ask the user a yes/no question and return True/False."""
     while True:

@@ -17,7 +16,6 @@ def ask_user_boolean(question):
         else:
             print("Invalid input. Please enter 'yes' or 'no'.")
 
-
 def ensure_env_variable(var_name, prompt_message, default=None):
     """Ensure an environment variable is set, otherwise ask the user and persist it."""
     value = os.getenv(var_name)

@@ -31,7 +29,6 @@ def ensure_env_variable(var_name, prompt_message, default=None):
 
     return value
 
-
 def set_env_variable(var_name, value):
     """Set an environment variable persistently on Windows and Linux/macOS."""
     os.environ[var_name] = value  # Set for the current session

@@ -42,7 +39,6 @@ def set_env_variable(var_name, value):
         os.system(f'echo "export {var_name}={value}" >> ~/.bashrc')
         os.system(f'echo "export {var_name}={value}" >> ~/.profile')
 
-
 def main():
     """Entry point for the CLI tool."""
     logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

@@ -52,7 +48,8 @@ def main():
     input_group.add_argument("--input-file", help="Path to the input document (PDF, Word, etc.)")
     input_group.add_argument("--input-dir", help="Path to a directory containing supported documents")
     parser.add_argument("-o", "--output-dir", required=True, help="Directory to save the converted Markdown file(s)")
-    parser.add_argument("--insert-into-llm", action="store_true", help="Insert output into LLM")
+    parser.add_argument("--llm", choices=["none", "openai", "claude"], default="none",
+                        help="Choose LLM client to process output (none, openai, claude)")
 
     args = parser.parse_args()
 

@@ -64,19 +61,26 @@
         LicenseManager().apply_license()
 
         # Setup LLM credentials only if required
-        if args.insert_into_llm:
+        if args.llm == "openai":
             ensure_env_variable("OPENAI_API_KEY", "Enter your OpenAI API key: ")
             ensure_env_variable("OPENAI_MODEL", "Enter OpenAI model name (default: gpt-4): ", default="gpt-4")
+        elif args.llm == "claude":
+            ensure_env_variable("CLAUDE_API_KEY", "Enter your Claude API key: ")
+            ensure_env_variable("CLAUDE_MODEL", "Enter Claude model name (default: claude-v1): ", default="claude-v1")
 
-        # Run conversion for either a single file or a directory
-        markitdown = MarkItDown(args.output_dir)
+        # Initialize MarkItDown with selected LLM
+        llm_client_name = args.llm if args.llm != "none" else None
+        markitdown = MarkItDown(args.output_dir, llm_client_name)
 
+        # Run conversion for either a single file or a directory
         if args.input_file:
-            markitdown.convert_document(args.input_file, args.insert_into_llm)
+            markitdown.convert_document(args.input_file)
         elif args.input_dir:
-            markitdown.convert_directory(args.input_dir, args.insert_into_llm)
+            markitdown.convert_directory(args.input_dir)
 
     except Exception as e:
         logging.error(f"Error: {e}", exc_info=True)
         sys.exit(1)
 
+if __name__ == "__main__":
+    main()
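With this change the old --insert-into-llm flag is gone and the client is chosen with --llm instead. An invocation would presumably now look something like: python main.py --input-file report.docx -o ./output --llm claude, where the file and directory names are placeholders; the matching OPENAI_* or CLAUDE_* variables are prompted for and persisted if they are not already set.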

packages/markitdown/tests/test.py

Lines changed: 53 additions & 17 deletions
@@ -1,10 +1,12 @@
 import pytest
 from pathlib import Path
+import os
+
 from ..src.openize.markitdown.converters import WordConverter, PDFConverter, ExcelConverter, PowerPointConverter
 from ..src.openize.markitdown.factory import ConverterFactory
-from ..src.openize.markitdown.llm_strategy import SaveLocally, InsertIntoLLM
+from ..src.openize.markitdown.llm_strategy import SaveLocally, LLMFactory, OpenAIClient, ClaudeClient
 from ..src.openize.markitdown.processor import DocumentProcessor
-import os
+
 
 @pytest.fixture
 def sample_output_dir():

@@ -18,49 +20,83 @@ def sample_md_file(sample_output_dir):
     md_file.write_text("# Sample Markdown File\n\nThis is a test.")
     return md_file
 
-# Test Converters
-def test_word_converter(sample_output_dir):
+
+# --------- Converter Tests ---------
+
+def test_word_converter():
     converter = WordConverter()
     assert converter is not None
 
-def test_pdf_converter(sample_output_dir):
+def test_pdf_converter():
     converter = PDFConverter()
     assert converter is not None
 
-def test_excel_converter(sample_output_dir):
+def test_excel_converter():
     converter = ExcelConverter()
     assert converter is not None
 
-def test_ppt_converter(sample_output_dir):
+def test_ppt_converter():
     converter = PowerPointConverter()
     assert converter is not None
 
-# Test ConverterFactory
+
+# --------- Factory Tests ---------
+
 def test_converter_factory():
     assert isinstance(ConverterFactory.get_converter(".docx"), WordConverter)
     assert isinstance(ConverterFactory.get_converter(".pdf"), PDFConverter)
     assert isinstance(ConverterFactory.get_converter(".xlsx"), ExcelConverter)
     assert isinstance(ConverterFactory.get_converter(".pptx"), PowerPointConverter)
 
 
-# Test LLM Strategy
+# --------- Strategy Pattern Tests ---------
+
 def test_save_locally(sample_md_file):
     strategy = SaveLocally()
     strategy.process(sample_md_file)
     assert sample_md_file.exists()
 
-def test_insert_into_llm(mocker, sample_md_file):
-    mocker.patch("openai.ChatCompletion.create", return_value={"choices": [{"message": {"content": "LLM Response"}}]})
-    strategy = InsertIntoLLM()
+def test_insert_into_llm_openai(mocker, sample_md_file):
+    mocker.patch("openai.ChatCompletion.create", return_value={
+        "choices": [{"message": {"content": "Mocked OpenAI Response"}}]
+    })
+    strategy = OpenAIClient(provider="openai")
+    strategy.process(sample_md_file)
+
+def test_insert_into_llm_claude(mocker, sample_md_file):
+    mock_anthropic = mocker.patch("openize.markitdown.llm_strategy.Anthropic")
+    mock_client = mock_anthropic.return_value
+    mock_client.messages.create.return_value.content = "Mocked Claude Response"
+    strategy = ClaudeClient(provider="claude")
     strategy.process(sample_md_file)
 
-# Test DocumentProcessor
-def test_document_processor(mocker, sample_output_dir):
-    mocker.patch("packages.src.openize.markitdown.factory.ConverterFactory.get_converter", return_value=WordConverter())
+
+# --------- Document Processor Tests ---------
+
+def test_document_processor_local_conversion(mocker, sample_output_dir):
+    mock_converter = mocker.patch("openize.markitdown.factory.ConverterFactory.get_converter", return_value=WordConverter())
     processor = DocumentProcessor(output_dir=sample_output_dir)
     processor.process_document("sample.docx", insert_into_llm=False)
     output_file = sample_output_dir / "sample.md"
     assert output_file.exists()
 
-if __name__ == "__main__":
-    pytest.main()
+def test_document_processor_with_llm_openai(mocker, sample_output_dir):
+    mock_converter = mocker.patch("openize.markitdown.factory.ConverterFactory.get_converter", return_value=WordConverter())
+    mocker.patch("openai.ChatCompletion.create", return_value={
+        "choices": [{"message": {"content": "LLM Output"}}]
+    })
+    processor = DocumentProcessor(output_dir=sample_output_dir)
+    processor.process_document("sample.docx", insert_into_llm=True, llm_provider="openai")
+    output_file = sample_output_dir / "sample.md"
+    assert output_file.exists()
+
+def test_document_processor_with_llm_claude(mocker, sample_output_dir):
+    mock_converter = mocker.patch("openize.markitdown.factory.ConverterFactory.get_converter", return_value=WordConverter())
+    mock_anthropic = mocker.patch("openize.markitdown.llm_strategy.Anthropic")
+    mock_client = mock_anthropic.return_value
+    mock_client.messages.create.return_value.content = "LLM Claude Output"
+    processor = DocumentProcessor(output_dir=sample_output_dir)
+    processor.process_document("sample.docx", insert_into_llm=True, llm_provider="claude")
+    output_file = sample_output_dir / "sample.md"
+    assert output_file.exists()
+
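The new tests rely on the mocker fixture from the pytest-mock plugin, so running them presumably requires pytest-mock to be installed alongside pytest, e.g. pytest packages/markitdown/tests/test.py from the repository root.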
