Skip to content

Commit a2c9b55

Browse files
Publishing for 25.3.4
1 parent e9526d6 commit a2c9b55

File tree

9 files changed

+88
-79
lines changed

9 files changed

+88
-79
lines changed

packages/__init__.py

Whitespace-only changes.
-156 Bytes
Binary file not shown.

packages/markitdown/main.py

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,17 @@
22
from pathlib import Path
33
from openize.markitdown.processor import DocumentProcessor
44

5-
def run_conversion():
6-
processor = DocumentProcessor()
7-
8-
# Process PDF file
9-
# processor.process_document("file.pdf", insert_into_llm=False)
10-
11-
# Process Word document (.docx)
12-
# processor.process_document("example.docx", insert_into_llm=False)
135

14-
# Process PowerPoint file (.pptx)
15-
# processor.process_document("presentation.pptx", insert_into_llm=False)
166

17-
# Process Excel file (.xlsx)
18-
# processor.process_document("data.xlsx", insert_into_llm=False)
7+
def run_conversion(input_file):
    """Run one document through ``DocumentProcessor`` and report completion.

    Args:
        input_file: Path of the document to convert; forwarded unchanged to
            ``DocumentProcessor.process_document``.
    """
    # insert_into_llm=False is passed through as-is to the processor.
    DocumentProcessor().process_document(input_file, insert_into_llm=False)
    # Printed as soon as process_document returns.
    print(f"Conversion completed: {input_file}")
1911

2012
if __name__ == "__main__":
    # Imported locally so the entry point does not depend on a module-level
    # import that is not visible alongside the other imports of this script.
    import argparse

    # Command line: one required positional argument, the document to convert.
    parser = argparse.ArgumentParser(description="Convert documents to Markdown.")
    parser.add_argument("input_file", type=str, help="Path to the input file")
    args = parser.parse_args()

    run_conversion(args.input_file)
2218

packages/markitdown/setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = openize-markitdown
3-
version = 25.3.2
3+
version = 25.3.4
44
author = Openize
55
author_email = packages@openize.com
66
description = A document converter for Word, PDF, Excel, and PowerPoint to Markdown.

packages/markitdown/src/openize/markitdown/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,13 @@
66
(.docx, .pdf, .xlsx, .pptx) to Markdown format.
77
"""
88

9-
__version__ = "0.1.0"
9+
__version__ = "25.3.4"
1010

1111
from .processor import DocumentProcessor
1212
from .converters import WordConverter, PDFConverter, ExcelConverter, PowerPointConverter
1313
from .factory import ConverterFactory
1414
from .llm_strategy import SaveLocally, InsertIntoLLM
15+
from .license_manager import LicenseManager
1516

1617
__all__ = [
1718
'DocumentProcessor',
@@ -22,4 +23,5 @@
2223
'ConverterFactory',
2324
'SaveLocally',
2425
'InsertIntoLLM',
26+
'LicenseManager',
2527
]

packages/markitdown/src/openize/markitdown/config.py

Lines changed: 0 additions & 53 deletions
This file was deleted.
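config_loader.py (new file; module name taken from the import in the license manager below, directory not shown in this capture)

Lines changed: 17 additions & 0 deletions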
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import configparser
2+
3+
def get_config(start_dir=None):
    """Read settings from ``setup.cfg``.

    When ``start_dir`` is not given, the historical location
    (``../../setup.cfg`` relative to the current working directory) is tried
    first, so launches that already worked keep reading the same file. If it
    is absent, or when ``start_dir`` is given, the search walks upward from
    ``start_dir`` (default: the directory containing this module) and reads
    the first ``setup.cfg`` it finds. This keeps the lookup from depending
    solely on the directory the process was started in.

    Args:
        start_dir: Optional directory (``str`` or path-like) from which to
            start the upward search.

    Returns:
        configparser.ConfigParser: The parsed configuration. It is empty when
        no ``setup.cfg`` is found, in which case the ``fallback=`` values
        used by callers apply.
    """
    from pathlib import Path  # local import keeps this block self-contained

    config = configparser.ConfigParser()

    if start_dir is None:
        # Preserve the original behaviour whenever it would have succeeded.
        legacy_path = Path("../../setup.cfg")
        if legacy_path.is_file():
            config.read(legacy_path)
            return config
        search_root = Path(__file__).resolve().parent
    else:
        search_root = Path(start_dir).resolve()

    # Nearest setup.cfg wins: check the start directory, then each ancestor.
    for directory in (search_root, *search_root.parents):
        candidate = directory / "setup.cfg"
        if candidate.is_file():
            config.read(candidate)
            break
    return config
8+
9+
def get_license_path():
    """Return the Aspose license path taken from setup.cfg.

    Looks up ``license_path`` in the ``[aspose]`` section; when it is not
    available, ``"Aspose.Total.lic"`` is returned instead.
    """
    return get_config().get("aspose", "license_path", fallback="Aspose.Total.lic")
13+
14+
def use_aspose_license():
    """Tell whether the Aspose license should be applied.

    Looks up the boolean ``use_aspose_license`` in the ``[aspose]`` section;
    when it is not available, the answer defaults to ``True``.
    """
    settings = get_config()
    enabled = settings.getboolean("aspose", "use_aspose_license", fallback=True)
    return enabled

packages/markitdown/src/openize/markitdown/converters.py

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,49 +5,61 @@
55
import aspose.words as aw
66
import aspose.cells as ac
77
import aspose.slides as asl
8-
from config import Config
9-
108

119
from abc import ABC, abstractmethod
1210

1311
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
1412

15-
1613
class DocumentConverter(ABC):
    """Abstract base for converters that write a document out as Markdown."""

    @abstractmethod
    def convert_to_md(self, input_path, output_dir):
        """Convert ``input_path`` to Markdown inside ``output_dir``.

        Concrete converters must override this method.
        """

    @staticmethod
    def clean_text(text):
        """Strip HTML tags from ``text`` and collapse its whitespace.

        Tags are removed, ``&nbsp;`` entities become spaces, the ends are
        trimmed, and every run of whitespace is reduced to one space.
        """
        without_tags = re.sub(r"<[^>]*>", "", text)
        trimmed = without_tags.replace("&nbsp;", " ").strip()
        return re.sub(r"\s+", " ", trimmed)
2324

2425
class WordConverter(DocumentConverter):
    """Converts Word documents to Markdown through Aspose.Words."""

    def convert_to_md(self, input_path, output_dir):
        """Save ``input_path`` as ``<stem>.md`` inside ``output_dir``.

        Args:
            input_path: Path object of the source document (its ``stem`` and
                ``resolve()`` are used).
            output_dir: Path object of the destination directory.

        Returns:
            The path of the written Markdown file, or ``None`` when the
            conversion failed. Failures are logged, not raised.
        """
        try:
            # Hand Aspose a plain string, as every other Aspose load/save call
            # in this module does, instead of a Path object.
            doc = aw.Document(str(input_path.resolve()))
            output_file = output_dir / f"{input_path.stem}.md"
            doc.save(str(output_file), aw.SaveFormat.MARKDOWN)
            return output_file
        except FileNotFoundError:
            logging.error(f"File not found: {input_path}")
        except aw.FileFormatException:
            # NOTE(review): confirm that the installed aspose.words exposes
            # FileFormatException; a missing attribute would surface here as
            # an AttributeError while matching the exception.
            logging.error(f"Invalid Word file format: {input_path}")
        except Exception as e:
            logging.error(f"Error converting {input_path}: {e}")
        return None
3338

3439
class PDFConverter(DocumentConverter):
    """Converts PDF documents to Markdown by loading them with Aspose.Words."""

    def convert_to_md(self, input_path, output_dir):
        """Save ``input_path`` as ``<stem>.md`` inside ``output_dir``.

        Args:
            input_path: Path object of the source document (its ``stem`` and
                ``resolve()`` are used).
            output_dir: Path object of the destination directory.

        Returns:
            The path of the written Markdown file, or ``None`` when the
            conversion failed. Failures are logged, not raised.
        """
        try:
            # Hand Aspose a plain string, as every other Aspose load/save call
            # in this module does, instead of a Path object.
            doc = aw.Document(str(input_path.resolve()))
            output_file = output_dir / f"{input_path.stem}.md"
            doc.save(str(output_file), aw.SaveFormat.MARKDOWN)
            return output_file
        except FileNotFoundError:
            logging.error(f"File not found: {input_path}")
        except aw.FileFormatException:
            # NOTE(review): confirm that the installed aspose.words exposes
            # FileFormatException; a missing attribute would surface here as
            # an AttributeError while matching the exception.
            logging.error(f"Invalid PDF file format: {input_path}")
        except Exception as e:
            logging.error(f"Error converting {input_path}: {e}")
        return None
43-
4452
class ExcelConverter(DocumentConverter):
    """Converts Excel workbooks to Markdown through Aspose.Cells."""

    def convert_to_md(self, input_path, output_dir):
        """Write ``input_path`` as ``<stem>.md`` inside ``output_dir``.

        Returns the path of the Markdown file on success. On any failure the
        error is logged and the method falls through, returning ``None``.
        """
        try:
            book = ac.Workbook(str(input_path))
            md_path = output_dir / (input_path.stem + ".md")
            book.save(str(md_path), ac.SaveFormat.MARKDOWN)
        except FileNotFoundError:
            logging.error(f"File not found: {input_path}")
        except ac.CellsException:
            logging.error(f"Invalid Excel file format: {input_path}")
        except Exception as exc:
            logging.error(f"Error converting {input_path}: {exc}")
        else:
            return md_path
5365

@@ -58,6 +70,9 @@ def convert_to_md(self, input_path, output_dir):
5870
output_file = output_dir / (input_path.stem + ".md")
5971
presentation.save(str(output_file), asl.export.SaveFormat.MD)
6072
return output_file
73+
except FileNotFoundError:
74+
logging.error(f"File not found: {input_path}")
75+
except asl.PptxReadException:
76+
logging.error(f"Invalid PowerPoint file format: {input_path}")
6177
except Exception as e:
6278
logging.error(f"Error converting {input_path}: {e}")
63-
packages/markitdown/src/openize/markitdown/license_manager.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import aspose.words as aw
2+
import aspose.cells as ac
3+
import aspose.slides as asl
4+
import logging
5+
from config_loader import get_license_path, use_aspose_license
6+
7+
class LicenseManager:
    """Manages the application of Aspose licenses for different file format APIs."""

    def __init__(self):
        """Initialize LicenseManager with license settings.

        Both settings are read once, at construction time, through the
        ``get_license_path`` and ``use_aspose_license`` helpers.
        """
        # Path handed to every product's set_license() call.
        self.license_path = get_license_path()
        # When falsy, apply_license() only logs a message and returns.
        self.use_license = use_aspose_license()

    def apply_license(self):
        """Apply the Aspose license if enabled in setup.cfg.

        Any exception raised while licensing is logged as a warning and is
        not re-raised. Because one ``try`` wraps the whole sequence, the first
        failure also skips the products that come after it.
        """
        if not self.use_license:
            logging.info("Aspose license is disabled in setup.cfg. Running in free mode.")
            return

        try:
            # One License object per Aspose product: Words, Cells, Slides.
            license_files = [aw.License(), ac.License(), asl.License()]
            for license in license_files:
                license.set_license(self.license_path)
            # NOTE(review): indentation was lost in this capture; the success
            # message is assumed to be logged once, after the loop — confirm.
            logging.info(f"Aspose license applied from {self.license_path}")
        except Exception as e:
            logging.warning(f"Failed to apply Aspose license: {e}")
28+
29+
# Usage example:
30+
# license_manager = LicenseManager()
31+
# license_manager.apply_license()
32+

0 commit comments

Comments
 (0)