Skip to content

[Fix] Unified LLM Whisperer adapters #144

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
6947814
unifyied llmw adapters
jagadeeswaran-zipstack Jan 9, 2025
b771e94
updated read me and unified adapter names
jagadeeswaran-zipstack Jan 13, 2025
5243ab8
env seperation for v1 and v2
jagadeeswaran-zipstack Jan 13, 2025
4f235a5
reverting version updated in init file
jagadeeswaran-zipstack Jan 13, 2025
8ab7447
adapter name change
jagadeeswaran-zipstack Jan 16, 2025
649f586
merge main
jagadeeswaran-zipstack Jan 25, 2025
f66de9b
Merge branch 'main' of github.com:Zipstack/unstract-sdk into fix/unif…
jagadeeswaran-zipstack Jan 25, 2025
299bba7
Merge branch 'main' into fix/unifyinh-llm-whisperer-adapters
jagadeeswaran-zipstack Feb 4, 2025
9be5cc0
migrated to llmw python client
jagadeeswaran-zipstack Feb 6, 2025
23e5cf9
added fix for llmw python client migration
jagadeeswaran-zipstack Feb 6, 2025
9950f40
Merge branch 'fix/unifyinh-llm-whisperer-adapters' of github.com:Zips…
jagadeeswaran-zipstack Feb 6, 2025
50684f0
moved v1 to use API
jagadeeswaran-zipstack Feb 12, 2025
25f918b
Merge branch 'main' of github.com:Zipstack/unstract-sdk into fix/unif…
jagadeeswaran-zipstack Feb 12, 2025
35a0500
fixes on V1 API merge
jagadeeswaran-zipstack Feb 12, 2025
c5f0871
removed json schema json
jagadeeswaran-zipstack Feb 12, 2025
708a8d9
removed json schema json
jagadeeswaran-zipstack Feb 12, 2025
71658f0
added wait_timeout to env
jagadeeswaran-zipstack Feb 12, 2025
ee12702
updated wait_timeout
jagadeeswaran-zipstack Feb 12, 2025
e08e4f8
version update
jagadeeswaran-zipstack Feb 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ class OutputModes(Enum):
LINE_PRINTER = "line-printer"
DUMP_TEXT = "dump-text"
TEXT = "text"
LAYOUT_PRESERVING = "layout_preserving"


class HTTPMethod(Enum):
Expand Down Expand Up @@ -48,10 +49,13 @@ class WhispererEnv:
LLMWhisperer's status API. Defaults to 30s
MAX_POLLS: Total number of times to poll the status API.
Set to -1 to poll indefinitely. Defaults to 30
STATUS_RETRIES: Number of times to retry calling LLMWhisperer's status API
on failure during polling. Defaults to 5.
"""

POLL_INTERVAL = "ADAPTER_LLMW_POLL_INTERVAL"
MAX_POLLS = "ADAPTER_LLMW_MAX_POLLS"
STATUS_RETRIES = "ADAPTER_LLMW_STATUS_RETRIES"


class WhispererConfig:
Expand All @@ -66,6 +70,7 @@ class WhispererConfig:
GAUSSIAN_BLUR_RADIUS = "gaussian_blur_radius"
FORCE_TEXT_PROCESSING = "force_text_processing"
LINE_SPLITTER_TOLERANCE = "line_splitter_tolerance"
LINE_SPLITTER_STRATEGY = "line_splitter_strategy"
HORIZONTAL_STRETCH_FACTOR = "horizontal_stretch_factor"
PAGES_TO_EXTRACT = "pages_to_extract"
STORE_METADATA_FOR_HIGHLIGHTING = "store_metadata_for_highlighting"
Expand All @@ -74,7 +79,12 @@ class WhispererConfig:
PAGE_SEPARATOR = "page_seperator"
MARK_VERTICAL_LINES = "mark_vertical_lines"
MARK_HORIZONTAL_LINES = "mark_horizontal_lines"

URL_IN_POST = "url_in_post"
TAG = "tag"
USE_WEBHOOK = "use_webhook"
WEBHOOK_METADATA = "webhook_metadata"
TEXT_ONLY = "text_only"
VERSION = "version"

class WhisperStatus:
"""Values returned / used by /whisper-status endpoint."""
Expand All @@ -86,6 +96,7 @@ class WhisperStatus:
# Used for async processing
WHISPER_HASH = "whisper-hash"
STATUS = "status"
WHISPER_HASH_V2 = "whisper_hash"


class WhispererDefaults:
Expand All @@ -95,6 +106,7 @@ class WhispererDefaults:
GAUSSIAN_BLUR_RADIUS = 0.0
FORCE_TEXT_PROCESSING = False
LINE_SPLITTER_TOLERANCE = 0.75
LINE_SPLITTER_STRATEGY = "left-priority"
HORIZONTAL_STRETCH_FACTOR = 1.0
POLL_INTERVAL = int(os.getenv(WhispererEnv.POLL_INTERVAL, 30))
MAX_POLLS = int(os.getenv(WhispererEnv.MAX_POLLS, 30))
Expand All @@ -104,3 +116,7 @@ class WhispererDefaults:
PAGE_SEPARATOR = "<<< >>>"
MARK_VERTICAL_LINES = False
MARK_HORIZONTAL_LINES = False
STATUS_RETRIES = int(os.getenv(WhispererEnv.STATUS_RETRIES, 5))
URL_IN_POST = False
TAG = "default"
TEXT_ONLY = False
81 changes: 81 additions & 0 deletions src/unstract/sdk/adapters/x2text/llm_whisperer/src/helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import logging
from typing import Any
from unstract.sdk.adapters.x2text.llm_whisperer.src.constants import (
Modes,
OutputModes,
WhispererConfig,
WhispererDefaults,
)
logger = logging.getLogger(__name__)


class LLMWhispererHelper:
    """Helpers for preparing requests made by the LLMWhisperer adapter."""

    @staticmethod
    def get_whisperer_params(config: dict[str, Any]) -> dict[str, Any]:
        """Assemble the query params meant for the /whisper endpoint.

        Each param is read from ``config``; a key missing from ``config``
        falls back to the default listed next to it below.

        Args:
            config (dict[str, Any]): Adapter configuration to read values from.

        Returns:
            dict[str, Any]: Query params
        """
        # (param name, fallback) pairs, listed in the order they are emitted.
        layout_settings = (
            (WhispererConfig.MODE, Modes.FORM.value),
            (WhispererConfig.OUTPUT_MODE, OutputModes.LAYOUT_PRESERVING.value),
            (
                WhispererConfig.LINE_SPLITTER_TOLERANCE,
                WhispererDefaults.LINE_SPLITTER_TOLERANCE,
            ),
            (
                WhispererConfig.LINE_SPLITTER_STRATEGY,
                WhispererDefaults.LINE_SPLITTER_STRATEGY,
            ),
            (
                WhispererConfig.HORIZONTAL_STRETCH_FACTOR,
                WhispererDefaults.HORIZONTAL_STRETCH_FACTOR,
            ),
            (WhispererConfig.PAGES_TO_EXTRACT, WhispererDefaults.PAGES_TO_EXTRACT),
            (
                WhispererConfig.MARK_VERTICAL_LINES,
                WhispererDefaults.MARK_VERTICAL_LINES,
            ),
            (
                WhispererConfig.MARK_HORIZONTAL_LINES,
                WhispererDefaults.MARK_HORIZONTAL_LINES,
            ),
        )
        params: dict[str, Any] = {}
        for name, fallback in layout_settings:
            params[name] = config.get(name, fallback)

        # URL_IN_POST is always the default; it is never read from config.
        params[WhispererConfig.URL_IN_POST] = WhispererDefaults.URL_IN_POST
        params[WhispererConfig.PAGE_SEPARATOR] = config.get(
            WhispererConfig.PAGE_SEPARATOR, WhispererDefaults.PAGE_SEPARATOR
        )

        # Optional params and identifiers for audit. TAG falls back to its
        # default; the webhook params get no default, so they are None
        # whenever config does not carry them.
        params[WhispererConfig.TAG] = config.get(
            WhispererConfig.TAG, WhispererDefaults.TAG
        )
        for name in (WhispererConfig.USE_WEBHOOK, WhispererConfig.WEBHOOK_METADATA):
            params[name] = config.get(name)

        if params[WhispererConfig.MODE] != Modes.LOW_COST.value:
            return params

        # Image pre-processing params are only sent for the low-cost mode.
        low_cost_settings = (
            (WhispererConfig.MEDIAN_FILTER_SIZE, WhispererDefaults.MEDIAN_FILTER_SIZE),
            (
                WhispererConfig.GAUSSIAN_BLUR_RADIUS,
                WhispererDefaults.GAUSSIAN_BLUR_RADIUS,
            ),
        )
        for name, fallback in low_cost_settings:
            params[name] = config.get(name, fallback)
        return params
Loading