From 1da908892be2db1fbb19a25b6b0c84641c7868ad Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 20:57:44 -0700 Subject: [PATCH 01/38] add time --- aws/lambda/oss-ci-job-queue-time/.gitignore | 3 + aws/lambda/oss-ci-job-queue-time/Makefile | 19 ++ .../oss_ci_job_queue_time.py | 237 ++++++++++++++++++ .../oss-ci-job-queue-time/requirements.txt | 2 + .../test_oss_ci_job_queue_time.py | 231 +++++++++++++++++ 5 files changed, 492 insertions(+) create mode 100644 aws/lambda/oss-ci-job-queue-time/.gitignore create mode 100644 aws/lambda/oss-ci-job-queue-time/Makefile create mode 100644 aws/lambda/oss-ci-job-queue-time/oss_ci_job_queue_time.py create mode 100644 aws/lambda/oss-ci-job-queue-time/requirements.txt create mode 100644 aws/lambda/oss-ci-job-queue-time/test_oss_ci_job_queue_time.py diff --git a/aws/lambda/oss-ci-job-queue-time/.gitignore b/aws/lambda/oss-ci-job-queue-time/.gitignore new file mode 100644 index 0000000000..bd92f6376a --- /dev/null +++ b/aws/lambda/oss-ci-job-queue-time/.gitignore @@ -0,0 +1,3 @@ +*.zip +deployment/ +venv/ diff --git a/aws/lambda/oss-ci-job-queue-time/Makefile b/aws/lambda/oss-ci-job-queue-time/Makefile new file mode 100644 index 0000000000..ce75d870cc --- /dev/null +++ b/aws/lambda/oss-ci-job-queue-time/Makefile @@ -0,0 +1,19 @@ +all: run-local + +clean: + rm -rf deployment + rm -rf venv + rm -rf deployment.zip + +venv/bin/python: + virtualenv venv + venv/bin/pip install -r requirements.txt + +deployment.zip: + mkdir -p deployment + cp oss_ci_job_queue_time.py ./deployment/. + pip3.10 install -r requirements.txt -t ./deployment/. --platform manylinux2014_x86_64 --only-binary=:all: --implementation cp --python-version 3.10 --upgrade + cd ./deployment && zip -q -r ../deployment.zip . + +.PHONY: create-deployment-package +create-deployment-package: deployment.zip diff --git a/aws/lambda/oss-ci-job-queue-time/oss_ci_job_queue_time.py b/aws/lambda/oss-ci-job-queue-time/oss_ci_job_queue_time.py new file mode 100644 index 0000000000..3aed64ed5c --- /dev/null +++ b/aws/lambda/oss-ci-job-queue-time/oss_ci_job_queue_time.py @@ -0,0 +1,237 @@ +from functools import lru_cache +import json +from typing import Any +import clickhouse_connect +import os +import boto3 +import argparse +from logging import info +import logging +import io +import gzip + +logging.basicConfig(level=logging.INFO) + +CLICKHOUSE_ENDPOINT = os.getenv("CLICKHOUSE_ENDPOINT", "") +CLICKHOUSE_USERNAME = os.getenv("CLICKHOUSE_USERNAME", "default") +CLICKHOUSE_PASSWORD = os.getenv("CLICKHOUSE_PASSWORD", "") + + +@lru_cache() +def get_clickhouse_client(host: str, user: str, password: str) -> Any: + return clickhouse_connect.get_client( + host=host, user=user, password=password, secure=True + ) + + +@lru_cache() +def get_aws_s3_resource() -> Any: + return boto3.resource("s3") + + +def get_clickhouse_client_handler() -> Any: + for env in ["CLICKHOUSE_ENDPOINT", "CLICKHOUSE_USERNAME", "CLICKHOUSE_PASSWORD"]: + if not os.getenv(env): + raise ValueError(f"Missing environment variable {env}") + + return get_clickhouse_client( + host=CLICKHOUSE_ENDPOINT, user=CLICKHOUSE_USERNAME, password=CLICKHOUSE_PASSWORD + ) + + +def upload_to_s3_txt( + s3_client: Any, + bucket_name: str, + key: str, + records: list[dict[str, Any]], +) -> None: + info(f"Writing {len(records)} documents to S3 {bucket_name}/{key}") + body = io.StringIO() + for record in records: + json.dump(record, body) + body.write("\n") + + s3_client.Object( + f"{bucket_name}", + f"{key}", + ).put( + 
Body=gzip.compress(body.getvalue().encode()), + ContentEncoding="gzip", + ContentType="text/plain", + ) + info(f"Done! Finish writing document to S3 {bucket_name}/{key} ") + + +def query_in_queue_jobs_now() -> str: + query = """ + WITH possible_queued_jobs AS ( + SELECT + id, + run_id + FROM default.workflow_job -- FINAL not needed since we just use this to filter a table that has already been FINALed + WHERE + status = 'queued' + AND created_at < (CURRENT_TIMESTAMP() - INTERVAL 5 MINUTE) + AND created_at > (CURRENT_TIMESTAMP() - INTERVAL 1 WEEK) + ) + SELECT + DATE_DIFF( + 'second', + job.created_at, + CURRENT_TIMESTAMP() + ) AS queue_s, + workflow.repository.'full_name' AS repo, + workflow.name AS workflow_name, + job.name AS job_name, + job.html_url, + IF( + LENGTH(job.labels) = 0, + 'N/A', + IF( + LENGTH(job.labels) > 1, + job.labels[2], + job.labels[1] + ) + ) AS machine_type, + toUnixTimestamp(CURRENT_TIMESTAMP()) AS time + FROM + default.workflow_job job FINAL + JOIN default.workflow_run workflow FINAL ON workflow.id = job.run_id + WHERE + job.id IN (SELECT id FROM possible_queued_jobs) + AND workflow.id IN (SELECT run_id FROM possible_queued_jobs) + AND workflow.repository.'full_name' = 'pytorch/pytorch' + AND job.status = 'queued' + AND LENGTH(job.steps) = 0 + AND workflow.status != 'completed' + ORDER BY + queue_s DESC """ + return query + + +class QueueTimeProcessor: + """ + this class used to handle oss ci queue time data aggregations. Currently it fetches in-queue jobs from clickhouse at current time + + To run the main method: + processor = QueueTimeProcessor(clickhouse_client,s3_client) + processor.process() + """ + + def __init__( + self, clickhouse_client: Any, s3_client: Any, is_dry_run: bool = False + ) -> None: + self.clickhouse_client = clickhouse_client + self.s3_client = s3_client + self.is_dry_run = is_dry_run + + def process(self) -> None: + self.proceses_job_queue_times_historical() + + def proceses_job_queue_times_historical(self) -> None: + jobs_in_queue = self.get_jobs_in_queue_now() + + if len(jobs_in_queue) == 0: + info("No jobs in queue now, skipping writing to s3") + return + + info(f"Found {len(jobs_in_queue)} jobs in queue now") + info(f"Peeking data: {jobs_in_queue[0]}") + + bucket_name = "ossci-raw-job-status" + repo = jobs_in_queue[0]["repo"] + time = jobs_in_queue[0]["time"] + + key = f"job_queue_times_historical/{repo}/{time}.txt" + + if self.is_dry_run: + info( + f"[Dry Run Mode]: {len(jobs_in_queue)} records to S3 {bucket_name}/{key}" + ) + info(json.dumps(jobs_in_queue, indent=4)) + return + + upload_to_s3_txt(self.s3_client, bucket_name, key, jobs_in_queue) + + def get_jobs_in_queue_now(self) -> list[dict[str, Any]]: + reader = self.clickhouse_client.query(query_in_queue_jobs_now()) + # clickhouse returns a generator to return column names and rows + # see https://clickhouse.com/docs/integrations/python#the-queryresult-object + column_names = reader.column_names + rows = reader.result_rows + res = self._to_query_result_dict(rows, column_names) + return res + + def _to_query_result_dict( + self, rows: list[Any], column_names: list[str] + ) -> list[dict[str, Any]]: + li = [] + for row in rows: + record = {} + for idx, name in enumerate(column_names): + record[name] = row[idx] + li.append(record) + return li + + +def lambda_handler(event: Any, context: Any) -> None: + """ + Main method to run in aws lambda environment + """ + db_client = get_clickhouse_client_handler() + s3_client = get_aws_s3_resource() + + QueueTimeProcessor(db_client, 
s3_client).process() + + return + + +def parse_args() -> argparse.Namespace: + """ + Parse command line arguments, this is mainly used for local test environment. + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--clickhouse_endpoint", + type=str, + required=True, + help="the clickhouse endpoint, the clickhouse_endpoint name is https://{clickhouse_endpoint}:{port} for full url ", + ) + parser.add_argument( + "--clickhouse_username", type=str, required=True, help="the clickhouse username" + ) + parser.add_argument( + "--clickhouse_password", + type=str, + required=True, + help="the clickhouse password for the user name", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="when set true, only print instead of writing results to s3", + ) + return parser.parse_known_args()[0] + + +def main(*args, **kwargs) -> None: + """ + Main method to run in local test environment + """ + args = parse_args() + + db_client = get_clickhouse_client( + host=args.clickhouse_endpoint, + user=args.clickhouse_username, + password=args.clickhouse_password, + ) + s3_client = get_aws_s3_resource() + + # process the queue time events + QueueTimeProcessor(db_client, s3_client, is_dry_run=True).process() + + return + + +if __name__ == "__main__": + main() diff --git a/aws/lambda/oss-ci-job-queue-time/requirements.txt b/aws/lambda/oss-ci-job-queue-time/requirements.txt new file mode 100644 index 0000000000..7a4ec20f9f --- /dev/null +++ b/aws/lambda/oss-ci-job-queue-time/requirements.txt @@ -0,0 +1,2 @@ +clickhouse_connect==0.8.5 +pytest==7.4.0 diff --git a/aws/lambda/oss-ci-job-queue-time/test_oss_ci_job_queue_time.py b/aws/lambda/oss-ci-job-queue-time/test_oss_ci_job_queue_time.py new file mode 100644 index 0000000000..c50fae5a90 --- /dev/null +++ b/aws/lambda/oss-ci-job-queue-time/test_oss_ci_job_queue_time.py @@ -0,0 +1,231 @@ +import unittest +import os +import json +from re import M, T +from typing import Any, Dict, List, Tuple +from unittest import mock +from unittest.mock import MagicMock, patch +from oss_ci_job_queue_time import ( + lambda_handler, + get_aws_s3_resource, + get_clickhouse_client, +) +import gzip + + +def get_default_result_rows(test_sample: str = "0"): + """ + generate result rows for testing, this corrresponds to the following columns: + 'queue_s', 'repo', 'workflow_name', 'job_name', 'html_url', 'machine_type', 'time' + """ + match test_sample: + case "0": + return [ + ( + 60000, + "pytorch/pytorch", + "workflow-name-1", + "job-name-1", + "runs/1/job/1", + "linux.aws.h100", + 1742262372, + ), + ( + 1400, + "pytorch/pytorch", + "workflow-name-2", + "job-name-2", + "runs/2/job/2", + "linux.rocm.gpu.2", + 1742262372, + ), + ] + case "1": + return [ + ( + 60000, + "pytorch/pytorch", + "inductor-h100", + "test1 (h100, 5, 5, linux.aws.h100)", + "runs/1/job/1", + "linux.aws.h100", + 1742262372, + ), + ( + 50000, + "pytorch/pytorch", + "inductor-h100", + "test1 (h100, 5, 5, linux.aws.h100)", + "runs/1/job/2", + "linux.aws.h100", + 1742262372, + ), + ( + 55000, + "pytorch/pytorch", + "inductor-h100", + "test1 (h100, 2, 6, linux.aws.h100)", + "runs/1/job/3", + "linux.aws.h100", + 1742262372, + ), + ( + 1729, + "pytorch/pytorch", + "inductor-h100", + "test2 (h100, 1, 1, linux.aws.h100)", + "runs/2/job/1", + "linux.aws.h100", + 1742262372, + ), + ( + 1352, + "pytorch/pytorch", + "inductor-rocm", + "rocm-test1(1, 1, linux.rocm.gpu.2)", + "runs/3/job/1", + "linux.rocm.gpu.2", + 1742262372, + ), + ( + 1400, + "pytorch/pytorch", + "inductor-rocm", + "rocm-test1 (1, 1, 
linux.rocm.gpu.2)", + "runs/4/job/2", + "linux.rocm.gpu.2", + 1742262372, + ), + ] + case _: + return [] + + +def get_default_result_columns() -> Tuple: + return ( + "queue_s", + "repo", + "workflow_name", + "job_name", + "html_url", + "machine_type", + "time", + ) + + +def mock_s3_resource_put(mock_s3_resource: Any) -> None: + mock_s3 = mock_s3_resource.return_value + mock_object = mock_s3.Object.return_value + mock_object.put.return_value = {"ResponseMetadata": {"HTTPStatusCode": 200}} + + +def get_mock_s3_resource_object(mock_s3_resource: Any): + return mock_s3_resource.return_value.Object + + +def mock_db_client( + mock: Any, + result_rows: List[Tuple] = get_default_result_rows(), + result_columns: Tuple = get_default_result_columns(), +) -> None: + mock_client = mock.return_value + mock_client.query.return_value.result_rows = result_rows + mock_client.query.return_value.column_names = result_columns + + +def setEnvironmentVariables(): + os.environ["CLICKHOUSE_ENDPOINT"] = "https://clickhouse.test1" + os.environ["CLICKHOUSE_USERNAME"] = "user1" + os.environ["CLICKHOUSE_PASSWORD"] = "pwd1" + + +class Test(unittest.TestCase): + @patch("oss_ci_job_queue_time.get_aws_s3_resource") + @patch("oss_ci_job_queue_time.get_clickhouse_client") + def test_lambda_handler_when_row_result_is_empty( + self, mock_get_client, mock_s3_resource + ): + print("test_lambda_handler_when_row_result_is_empty ") + # prepare + setEnvironmentVariables() + mock_s3_resource_put(mock_s3_resource) + mock_db_client(mock_get_client, result_rows=[]) + + # execute + lambda_handler(None, None) + + # assert + mock_get_client.assert_called_once() + get_mock_s3_resource_object( + mock_s3_resource + ).return_value.put.assert_not_called() + + @patch("oss_ci_job_queue_time.get_aws_s3_resource") + @patch("oss_ci_job_queue_time.get_clickhouse_client") + def test_lambda_handler_when_lambda_happy_flow_then_success( + self, mock_get_client, mock_s3_resource + ): + # prepare + setEnvironmentVariables() + mock_s3_resource_put(mock_s3_resource) + mock_db_client(mock_get_client) + + expected_r1 = b'{"queue_s": 60000, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-1", "job_name": "job-name-1", "html_url": "runs/1/job/1", "machine_type": "linux.aws.h100", "time": 1742262372}\n' + expected_r2 = b'{"queue_s": 1400, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-2", "job_name": "job-name-2", "html_url": "runs/2/job/2", "machine_type": "linux.rocm.gpu.2", "time": 1742262372}\n' + expected_s3_body = expected_r1 + expected_r2 + expect = gzip.compress(expected_s3_body) + + # execute + lambda_handler(None, None) + + # assert + + # assert clickhouse client + mock_get_client.assert_called_once() + mock_get_client.return_value.query.assert_called_once() + + # assert s3 resource + mock_s3_resource.assert_called_once() + get_mock_s3_resource_object( + mock_s3_resource + ).return_value.put.assert_called_once() + get_mock_s3_resource_object( + mock_s3_resource + ).return_value.put.assert_called_once_with( + Body=expect, ContentEncoding="gzip", ContentType="text/plain" + ) + + @patch("boto3.resource") + @patch("clickhouse_connect.get_client") + def test_lambda_handler_when_missing_required_env_vars_then_throws_error( + self, mock_get_client, mock_s3_resource + ): + test_cases = [ + ("CLICKHOUSE_ENDPOINT"), + ("CLICKHOUSE_USERNAME"), + ("CLICKHOUSE_PASSWORD"), + ] + + for x in test_cases: + with self.subTest(x=x): + # prepare + mock_get_client.reset_mock(return_value=True) + mock_s3_resource.reset_mock(return_value=True) + + 
setEnvironmentVariables() + os.environ[x] = "" + + # execute + with self.assertRaises(ValueError) as context: + _ = lambda_handler(None, None) + + # assert + self.assertTrue(x in str(context.exception)) + mock_get_client.return_value.query.assert_not_called() + get_mock_s3_resource_object( + mock_s3_resource + ).return_value.put.assert_not_called() + + +if __name__ == "__main__": + unittest.main() From a774afb57ac213567b90167d0fe057b116ba9be1 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 21:10:10 -0700 Subject: [PATCH 02/38] rename function for consistency --- aws/lambda/oss-ci-job-queue-time/Makefile | 2 +- .../{oss_ci_job_queue_time.py => lambda_function.py} | 0 aws/lambda/oss-ci-job-queue-time/requirements.txt | 1 + .../{test_oss_ci_job_queue_time.py => test_lambda_function.py} | 2 +- 4 files changed, 3 insertions(+), 2 deletions(-) rename aws/lambda/oss-ci-job-queue-time/{oss_ci_job_queue_time.py => lambda_function.py} (100%) rename aws/lambda/oss-ci-job-queue-time/{test_oss_ci_job_queue_time.py => test_lambda_function.py} (99%) diff --git a/aws/lambda/oss-ci-job-queue-time/Makefile b/aws/lambda/oss-ci-job-queue-time/Makefile index ce75d870cc..478548770a 100644 --- a/aws/lambda/oss-ci-job-queue-time/Makefile +++ b/aws/lambda/oss-ci-job-queue-time/Makefile @@ -11,7 +11,7 @@ venv/bin/python: deployment.zip: mkdir -p deployment - cp oss_ci_job_queue_time.py ./deployment/. + cp lambda_function.py ./deployment/. pip3.10 install -r requirements.txt -t ./deployment/. --platform manylinux2014_x86_64 --only-binary=:all: --implementation cp --python-version 3.10 --upgrade cd ./deployment && zip -q -r ../deployment.zip . diff --git a/aws/lambda/oss-ci-job-queue-time/oss_ci_job_queue_time.py b/aws/lambda/oss-ci-job-queue-time/lambda_function.py similarity index 100% rename from aws/lambda/oss-ci-job-queue-time/oss_ci_job_queue_time.py rename to aws/lambda/oss-ci-job-queue-time/lambda_function.py diff --git a/aws/lambda/oss-ci-job-queue-time/requirements.txt b/aws/lambda/oss-ci-job-queue-time/requirements.txt index 7a4ec20f9f..800c7ac1cb 100644 --- a/aws/lambda/oss-ci-job-queue-time/requirements.txt +++ b/aws/lambda/oss-ci-job-queue-time/requirements.txt @@ -1,2 +1,3 @@ clickhouse_connect==0.8.5 +boto3==1.35.33 pytest==7.4.0 diff --git a/aws/lambda/oss-ci-job-queue-time/test_oss_ci_job_queue_time.py b/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py similarity index 99% rename from aws/lambda/oss-ci-job-queue-time/test_oss_ci_job_queue_time.py rename to aws/lambda/oss-ci-job-queue-time/test_lambda_function.py index c50fae5a90..926348b425 100644 --- a/aws/lambda/oss-ci-job-queue-time/test_oss_ci_job_queue_time.py +++ b/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py @@ -5,7 +5,7 @@ from typing import Any, Dict, List, Tuple from unittest import mock from unittest.mock import MagicMock, patch -from oss_ci_job_queue_time import ( +from lambda_function import ( lambda_handler, get_aws_s3_resource, get_clickhouse_client, From 32f77b8b6fda234e42d39836a61860a66092e822 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 21:11:12 -0700 Subject: [PATCH 03/38] replace mock patch --- aws/lambda/oss-ci-job-queue-time/test_lambda_function.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py b/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py index 926348b425..2c3f7dfdae 100644 --- a/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py +++ 
b/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py @@ -140,8 +140,8 @@ def setEnvironmentVariables(): class Test(unittest.TestCase): - @patch("oss_ci_job_queue_time.get_aws_s3_resource") - @patch("oss_ci_job_queue_time.get_clickhouse_client") + @patch("lambda_function.get_aws_s3_resource") + @patch("lambda_function.get_clickhouse_client") def test_lambda_handler_when_row_result_is_empty( self, mock_get_client, mock_s3_resource ): @@ -160,8 +160,8 @@ def test_lambda_handler_when_row_result_is_empty( mock_s3_resource ).return_value.put.assert_not_called() - @patch("oss_ci_job_queue_time.get_aws_s3_resource") - @patch("oss_ci_job_queue_time.get_clickhouse_client") + @patch("lambda_function.get_aws_s3_resource") + @patch("lambda_function.get_clickhouse_client") def test_lambda_handler_when_lambda_happy_flow_then_success( self, mock_get_client, mock_s3_resource ): From 8fcc0e792e741e48a36a7f72d83400596e93a35f Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 21:25:24 -0700 Subject: [PATCH 04/38] replace mock patch --- .lintrunner.toml | 1 + .../oss-ci-job-queue-time/lambda_function.py | 15 +- .../test_lambda_function.py | 172 ++++++++---------- 3 files changed, 89 insertions(+), 99 deletions(-) diff --git a/.lintrunner.toml b/.lintrunner.toml index 25b15bcf9d..71a7e7722c 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -43,6 +43,7 @@ include_patterns = [ 'torchci/**/*.py', 'torchci/**/*.pyi', '.github/scripts/*.py', + 'aws/lambda/oss-ci-job-queue-time/*.py', 'aws/lambda/whl_metadata_upload_pep658/**/*.py', ] command = [ diff --git a/aws/lambda/oss-ci-job-queue-time/lambda_function.py b/aws/lambda/oss-ci-job-queue-time/lambda_function.py index 3aed64ed5c..743419954e 100644 --- a/aws/lambda/oss-ci-job-queue-time/lambda_function.py +++ b/aws/lambda/oss-ci-job-queue-time/lambda_function.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python from functools import lru_cache import json from typing import Any @@ -211,19 +212,21 @@ def parse_args() -> argparse.Namespace: action="store_true", help="when set true, only print instead of writing results to s3", ) - return parser.parse_known_args()[0] + args,_ = parser.parse_known_args() + return args -def main(*args, **kwargs) -> None: +def main() -> None: """ Main method to run in local test environment """ - args = parse_args() + + arguments = parse_args() db_client = get_clickhouse_client( - host=args.clickhouse_endpoint, - user=args.clickhouse_username, - password=args.clickhouse_password, + host=arguments.clickhouse_endpoint, + user=arguments.clickhouse_username, + password=arguments.clickhouse_password, ) s3_client = get_aws_s3_resource() diff --git a/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py b/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py index 2c3f7dfdae..f819127307 100644 --- a/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py +++ b/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py @@ -18,99 +18,85 @@ def get_default_result_rows(test_sample: str = "0"): generate result rows for testing, this corrresponds to the following columns: 'queue_s', 'repo', 'workflow_name', 'job_name', 'html_url', 'machine_type', 'time' """ - match test_sample: - case "0": - return [ - ( - 60000, - "pytorch/pytorch", - "workflow-name-1", - "job-name-1", - "runs/1/job/1", - "linux.aws.h100", - 1742262372, - ), - ( - 1400, - "pytorch/pytorch", - "workflow-name-2", - "job-name-2", - "runs/2/job/2", - "linux.rocm.gpu.2", - 1742262372, - ), - ] - case "1": - return [ - ( - 60000, - "pytorch/pytorch", - "inductor-h100", - 
"test1 (h100, 5, 5, linux.aws.h100)", - "runs/1/job/1", - "linux.aws.h100", - 1742262372, - ), - ( - 50000, - "pytorch/pytorch", - "inductor-h100", - "test1 (h100, 5, 5, linux.aws.h100)", - "runs/1/job/2", - "linux.aws.h100", - 1742262372, - ), - ( - 55000, - "pytorch/pytorch", - "inductor-h100", - "test1 (h100, 2, 6, linux.aws.h100)", - "runs/1/job/3", - "linux.aws.h100", - 1742262372, - ), - ( - 1729, - "pytorch/pytorch", - "inductor-h100", - "test2 (h100, 1, 1, linux.aws.h100)", - "runs/2/job/1", - "linux.aws.h100", - 1742262372, - ), - ( - 1352, - "pytorch/pytorch", - "inductor-rocm", - "rocm-test1(1, 1, linux.rocm.gpu.2)", - "runs/3/job/1", - "linux.rocm.gpu.2", - 1742262372, - ), - ( - 1400, - "pytorch/pytorch", - "inductor-rocm", - "rocm-test1 (1, 1, linux.rocm.gpu.2)", - "runs/4/job/2", - "linux.rocm.gpu.2", - 1742262372, - ), - ] - case _: - return [] - - -def get_default_result_columns() -> Tuple: - return ( - "queue_s", - "repo", - "workflow_name", - "job_name", - "html_url", - "machine_type", - "time", - ) + if (test_sample == "0"): + return [ + ( + 60000, + "pytorch/pytorch", + "workflow-name-1", + "job-name-1", + "runs/1/job/1", + "linux.aws.h100", + 1742262372, + ), + ( + 1400, + "pytorch/pytorch", + "workflow-name-2", + "job-name-2", + "runs/2/job/2" + )] + + return [ + ( + 60000, + "pytorch/pytorch", + "inductor-h100", + "test1 (h100, 5, 5, linux.aws.h100)", + "runs/1/job/1", + "linux.aws.h100", + 1742262372, + ), + ( + 50000, + "pytorch/pytorch", + "inductor-h100", + "test1 (h100, 5, 5, linux.aws.h100)", + "runs/1/job/2", + "linux.aws.h100", + 1742262372, + ), + ( + 55000, + "pytorch/pytorch", + "inductor-h100", + "test1 (h100, 2, 6, linux.aws.h100)", + "runs/1/job/3", + "linux.aws.h100", + 1742262372, + ), + ( + 1729, + "pytorch/pytorch", + "inductor-h100", + "test2 (h100, 1, 1, linux.aws.h100)", + "runs/2/job/1", + "linux.aws.h100", + 1742262372, + ), + ( + 1352, + "pytorch/pytorch", + "inductor-rocm", + "rocm-test1(1, 1, linux.rocm.gpu.2)", + "runs/3/job/1", + "linux.rocm.gpu.2", + 1742262372, + ), + ( + 1400, + "pytorch/pytorch", + "inductor-rocm", + "rocm-test1 (1, 1, linux.rocm.gpu.2)", + "runs/4/job/2", + "linux.rocm.gpu.2", + 1742262372, + ), + ] + + +def get_default_result_columns(): + return "queue_s", "repo", "workflow_name", "job_name", "html_url", "machine_type","time" def mock_s3_resource_put(mock_s3_resource: Any) -> None: From 871a64678bea05c2891a5e6075fa741c2774fe33 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 21:28:19 -0700 Subject: [PATCH 05/38] replace mock patch --- aws/lambda/oss-ci-job-queue-time/lambda_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aws/lambda/oss-ci-job-queue-time/lambda_function.py b/aws/lambda/oss-ci-job-queue-time/lambda_function.py index 743419954e..f0fc532bac 100644 --- a/aws/lambda/oss-ci-job-queue-time/lambda_function.py +++ b/aws/lambda/oss-ci-job-queue-time/lambda_function.py @@ -4,7 +4,7 @@ from typing import Any import clickhouse_connect import os -import boto3 +import boto3 # type: ignore[import] import argparse from logging import info import logging From c2267bce66d134bfd2e5a9c99e31ab67d0856500 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 21:31:03 -0700 Subject: [PATCH 06/38] replace mock patch --- .../oss-ci-job-queue-time/lambda_function.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/aws/lambda/oss-ci-job-queue-time/lambda_function.py b/aws/lambda/oss-ci-job-queue-time/lambda_function.py index 
f0fc532bac..35814f2012 100644 --- a/aws/lambda/oss-ci-job-queue-time/lambda_function.py +++ b/aws/lambda/oss-ci-job-queue-time/lambda_function.py @@ -1,16 +1,19 @@ #!/usr/bin/env python -from functools import lru_cache -import json -from typing import Any -import clickhouse_connect -import os -import boto3 # type: ignore[import] import argparse -from logging import info -import logging import io +import json +import logging +import os import gzip +import boto3 # type: ignore[import] +import clickhouse_connect + +# Local imports +from functools import lru_cache +from logging import info +from typing import Any + logging.basicConfig(level=logging.INFO) CLICKHOUSE_ENDPOINT = os.getenv("CLICKHOUSE_ENDPOINT", "") From 35289aa85fa491c4dadbdda0a1843cc11ac57aa3 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 21:33:49 -0700 Subject: [PATCH 07/38] fix test --- .../test_lambda_function.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py b/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py index f819127307..f8dc4e6f83 100644 --- a/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py +++ b/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py @@ -1,16 +1,14 @@ import unittest import os -import json -from re import M, T -from typing import Any, Dict, List, Tuple -from unittest import mock -from unittest.mock import MagicMock, patch +import gzip + +from typing import Any, List, Tuple +from unittest.mock import patch from lambda_function import ( lambda_handler, get_aws_s3_resource, get_clickhouse_client, ) -import gzip def get_default_result_rows(test_sample: str = "0"): @@ -34,7 +32,9 @@ def get_default_result_rows(test_sample: str = "0"): "pytorch/pytorch", "workflow-name-2", "job-name-2", - "runs/2/job/2" + "runs/2/job/2", + "linux.rocm.gpu.2", + 1742262372, )] return [ @@ -95,8 +95,8 @@ def get_default_result_rows(test_sample: str = "0"): ] -def get_default_result_columns(): - return "queue_s", "repo", "workflow_name", "job_name", "html_url", "machine_type","time" +def get_default_result_columns() -> Tuple: + return ("queue_s", "repo", "workflow_name", "job_name", "html_url", "machine_type","time") def mock_s3_resource_put(mock_s3_resource: Any) -> None: From 52dcbced3d9945bb89b5796e8ebde34fdc1c7f7d Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 22:52:17 -0700 Subject: [PATCH 08/38] fix test --- .github/workflows/tests.yml | 20 +++++++++++++++++++ .../.gitignore | 0 .../Makefile | 0 .../lambda_function.py | 1 + .../requirements.txt | 0 aws/lambda/tests/__init__.py | 2 ++ .../test_lambda_oss_ci_job_queue_time.py} | 10 +++++----- 7 files changed, 28 insertions(+), 5 deletions(-) rename aws/lambda/{oss-ci-job-queue-time => oss_ci_job_queue_time}/.gitignore (100%) rename aws/lambda/{oss-ci-job-queue-time => oss_ci_job_queue_time}/Makefile (100%) rename aws/lambda/{oss-ci-job-queue-time => oss_ci_job_queue_time}/lambda_function.py (99%) rename aws/lambda/{oss-ci-job-queue-time => oss_ci_job_queue_time}/requirements.txt (100%) create mode 100644 aws/lambda/tests/__init__.py rename aws/lambda/{oss-ci-job-queue-time/test_lambda_function.py => tests/test_lambda_oss_ci_job_queue_time.py} (94%) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e2d8059c97..d9147b47a2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -39,6 +39,26 @@ jobs: # Test tools python3 -m unittest discover -vs tools/tests -p 'test_*.py' + 
test-aws-lambda: + name: Test github scripts + if: ${{ github.repository == 'pytorch/test-infra' }} + uses: ./.github/workflows/linux_job_v2.yml + with: + docker-image: python:3.10-slim-bullseye + runner: linux.large + script: | + # Environment setup + echo ::group::setup Python environment + python -m venv .venv/ + source .venv/bin/activate + pip install pip==23.0.1 pytest==7.2.0 \ + jsonschema==4.17.3 numpy==1.24.1 pandas==2.1.4 boto3==1.19.12 \ + clickhouse-connect==0.8.14 + echo ::endgroup:: + + # Test aws lambda + pytest -v aws/lambda/tests + test-github-scripts: name: Test github scripts if: ${{ github.repository == 'pytorch/test-infra' }} diff --git a/aws/lambda/oss-ci-job-queue-time/.gitignore b/aws/lambda/oss_ci_job_queue_time/.gitignore similarity index 100% rename from aws/lambda/oss-ci-job-queue-time/.gitignore rename to aws/lambda/oss_ci_job_queue_time/.gitignore diff --git a/aws/lambda/oss-ci-job-queue-time/Makefile b/aws/lambda/oss_ci_job_queue_time/Makefile similarity index 100% rename from aws/lambda/oss-ci-job-queue-time/Makefile rename to aws/lambda/oss_ci_job_queue_time/Makefile diff --git a/aws/lambda/oss-ci-job-queue-time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py similarity index 99% rename from aws/lambda/oss-ci-job-queue-time/lambda_function.py rename to aws/lambda/oss_ci_job_queue_time/lambda_function.py index 35814f2012..3c3150e15d 100644 --- a/aws/lambda/oss-ci-job-queue-time/lambda_function.py +++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py @@ -5,6 +5,7 @@ import logging import os import gzip +import sys import boto3 # type: ignore[import] import clickhouse_connect diff --git a/aws/lambda/oss-ci-job-queue-time/requirements.txt b/aws/lambda/oss_ci_job_queue_time/requirements.txt similarity index 100% rename from aws/lambda/oss-ci-job-queue-time/requirements.txt rename to aws/lambda/oss_ci_job_queue_time/requirements.txt diff --git a/aws/lambda/tests/__init__.py b/aws/lambda/tests/__init__.py new file mode 100644 index 0000000000..10abe4c5f4 --- /dev/null +++ b/aws/lambda/tests/__init__.py @@ -0,0 +1,2 @@ +# tests/__init__.py +# This file can be left empty or contain initialization code for the test suite. 
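# Illustrative note, not part of the patch: under pytest's default import mode,
# `aws/lambda/tests` is treated as a package because of this `__init__.py`; pytest
# walks up to the first directory without one (`aws/lambda`) and prepends that
# directory to sys.path, which is what makes `oss_ci_job_queue_time.lambda_function`
# importable by the tests. A minimal sketch of the same resolution outside pytest,
# assuming the repository root as the working directory:
import sys

sys.path.insert(0, "aws/lambda")  # stands in for what pytest's rootdir handling does
from oss_ci_job_queue_time.lambda_function import lambda_handler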
diff --git a/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py similarity index 94% rename from aws/lambda/oss-ci-job-queue-time/test_lambda_function.py rename to aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py index f8dc4e6f83..c3f394c162 100644 --- a/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py +++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py @@ -4,7 +4,7 @@ from typing import Any, List, Tuple from unittest.mock import patch -from lambda_function import ( +from oss_ci_job_queue_time.lambda_function import ( lambda_handler, get_aws_s3_resource, get_clickhouse_client, @@ -126,8 +126,8 @@ def setEnvironmentVariables(): class Test(unittest.TestCase): - @patch("lambda_function.get_aws_s3_resource") - @patch("lambda_function.get_clickhouse_client") + @patch("oss_ci_job_queue_time.lambda_function.get_aws_s3_resource") + @patch("oss_ci_job_queue_time.lambda_function.get_clickhouse_client") def test_lambda_handler_when_row_result_is_empty( self, mock_get_client, mock_s3_resource ): @@ -146,8 +146,8 @@ def test_lambda_handler_when_row_result_is_empty( mock_s3_resource ).return_value.put.assert_not_called() - @patch("lambda_function.get_aws_s3_resource") - @patch("lambda_function.get_clickhouse_client") + @patch("oss_ci_job_queue_time.lambda_function.get_aws_s3_resource") + @patch("oss_ci_job_queue_time.lambda_function.get_clickhouse_client") def test_lambda_handler_when_lambda_happy_flow_then_success( self, mock_get_client, mock_s3_resource ): From b6022f379b8463268e43f9a55925b72caaaf53aa Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 22:53:10 -0700 Subject: [PATCH 09/38] fix test --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d9147b47a2..6676a822d3 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -40,7 +40,7 @@ jobs: python3 -m unittest discover -vs tools/tests -p 'test_*.py' test-aws-lambda: - name: Test github scripts + name: Test aws lambda if: ${{ github.repository == 'pytorch/test-infra' }} uses: ./.github/workflows/linux_job_v2.yml with: From b1064c71cdddc22c26197b5ee1047ce87e84251c Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 22:55:14 -0700 Subject: [PATCH 10/38] fix test --- .github/workflows/tests.yml | 5 ++--- aws/lambda/oss_ci_job_queue_time/requirements.txt | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6676a822d3..ca143a76ee 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -51,9 +51,8 @@ jobs: echo ::group::setup Python environment python -m venv .venv/ source .venv/bin/activate - pip install pip==23.0.1 pytest==7.2.0 \ - jsonschema==4.17.3 numpy==1.24.1 pandas==2.1.4 boto3==1.19.12 \ - clickhouse-connect==0.8.14 + pip install pip==23.0.1 pytest==7.2.0 boto3==1.35.33 \ + clickhouse-connect==0.8.5 echo ::endgroup:: # Test aws lambda diff --git a/aws/lambda/oss_ci_job_queue_time/requirements.txt b/aws/lambda/oss_ci_job_queue_time/requirements.txt index 800c7ac1cb..3e22fde96f 100644 --- a/aws/lambda/oss_ci_job_queue_time/requirements.txt +++ b/aws/lambda/oss_ci_job_queue_time/requirements.txt @@ -1,3 +1,2 @@ clickhouse_connect==0.8.5 boto3==1.35.33 -pytest==7.4.0 From 7a1b5aad3bc5fde4df8cb47f9ec8d62a534d8ff1 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 22:55:42 -0700 Subject: [PATCH 11/38] 
fix test version --- .github/workflows/tests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ca143a76ee..819f028e41 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -51,8 +51,7 @@ jobs: echo ::group::setup Python environment python -m venv .venv/ source .venv/bin/activate - pip install pip==23.0.1 pytest==7.2.0 boto3==1.35.33 \ - clickhouse-connect==0.8.5 + pip install pip==23.0.1 pytest==7.2.0 boto3==1.35.33 clickhouse-connect==0.8.5 echo ::endgroup:: # Test aws lambda From 3baa9209323721d2c7ff12b2fb702c696ff7b7da Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 18 Mar 2025 13:51:58 -0700 Subject: [PATCH 12/38] fix test version --- .../oss_ci_job_queue_time/lambda_function.py | 53 ++++++++++--------- .../test_lambda_oss_ci_job_queue_time.py | 8 +-- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py index 3c3150e15d..16fb868552 100644 --- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py +++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py @@ -17,15 +17,11 @@ logging.basicConfig(level=logging.INFO) -CLICKHOUSE_ENDPOINT = os.getenv("CLICKHOUSE_ENDPOINT", "") -CLICKHOUSE_USERNAME = os.getenv("CLICKHOUSE_USERNAME", "default") -CLICKHOUSE_PASSWORD = os.getenv("CLICKHOUSE_PASSWORD", "") - @lru_cache() def get_clickhouse_client(host: str, user: str, password: str) -> Any: return clickhouse_connect.get_client( - host=host, user=user, password=password, secure=True + host=host, user=user, password=password, secure=True, verify=False ) @@ -34,13 +30,15 @@ def get_aws_s3_resource() -> Any: return boto3.resource("s3") -def get_clickhouse_client_handler() -> Any: +def get_clickhouse_client_environment() -> Any: for env in ["CLICKHOUSE_ENDPOINT", "CLICKHOUSE_USERNAME", "CLICKHOUSE_PASSWORD"]: if not os.getenv(env): raise ValueError(f"Missing environment variable {env}") return get_clickhouse_client( - host=CLICKHOUSE_ENDPOINT, user=CLICKHOUSE_USERNAME, password=CLICKHOUSE_PASSWORD + host=os.getenv("CLICKHOUSE_ENDPOINT"), + user=os.getenv("CLICKHOUSE_USERNAME"), + password=os.getenv("CLICKHOUSE_PASSWORD"), ) @@ -183,7 +181,7 @@ def lambda_handler(event: Any, context: Any) -> None: """ Main method to run in aws lambda environment """ - db_client = get_clickhouse_client_handler() + db_client = get_clickhouse_client_environment() s3_client = get_aws_s3_resource() QueueTimeProcessor(db_client, s3_client).process() @@ -197,47 +195,50 @@ def parse_args() -> argparse.Namespace: """ parser = argparse.ArgumentParser() parser.add_argument( - "--clickhouse_endpoint", + "--clickhouse-endpoint", + default=os.getenv("CLICKHOUSE_ENDPOINT", ""), type=str, - required=True, help="the clickhouse endpoint, the clickhouse_endpoint name is https://{clickhouse_endpoint}:{port} for full url ", ) parser.add_argument( - "--clickhouse_username", type=str, required=True, help="the clickhouse username" + "--clickhouse-username", + type=str, + default=os.getenv("CLICKHOUSE_USERNAME", ""), + help="the clickhouse username", ) parser.add_argument( - "--clickhouse_password", + "--clickhouse-password", type=str, - required=True, + default=os.getenv("CLICKHOUSE_PASSWORD", ""), help="the clickhouse password for the user name", ) parser.add_argument( - "--dry-run", + "--not-dry-run", action="store_true", - help="when set true, only print instead of writing results to s3", + help="when set true, 
writing results to s3 from local . By default, local run is dry run mode", ) - args,_ = parser.parse_known_args() + args, _ = parser.parse_known_args() return args def main() -> None: """ - Main method to run in local test environment + method to run in local test environment """ arguments = parse_args() - db_client = get_clickhouse_client( - host=arguments.clickhouse_endpoint, - user=arguments.clickhouse_username, - password=arguments.clickhouse_password, - ) - s3_client = get_aws_s3_resource() + # update environment variables for input parameters + os.environ["CLICKHOUSE_ENDPOINT"] = arguments.clickhouse_endpoint + os.environ["CLICKHOUSE_USERNAME"] = arguments.clickhouse_username + os.environ["CLICKHOUSE_PASSWORD"] = arguments.clickhouse_password - # process the queue time events - QueueTimeProcessor(db_client, s3_client, is_dry_run=True).process() + db_client = get_clickhouse_client_environment() + s3_client = get_aws_s3_resource() - return + # always run in dry run mode in local test environment, unless it's disabled. + is_dry_run = not arguments.not_dry_run + QueueTimeProcessor(db_client, s3_client, is_dry_run=is_dry_run).process() if __name__ == "__main__": diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py index c3f394c162..c1d7258b4a 100644 --- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py +++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py @@ -119,7 +119,7 @@ def mock_db_client( mock_client.query.return_value.column_names = result_columns -def setEnvironmentVariables(): +def set_default_env_variables(): os.environ["CLICKHOUSE_ENDPOINT"] = "https://clickhouse.test1" os.environ["CLICKHOUSE_USERNAME"] = "user1" os.environ["CLICKHOUSE_PASSWORD"] = "pwd1" @@ -133,7 +133,7 @@ def test_lambda_handler_when_row_result_is_empty( ): print("test_lambda_handler_when_row_result_is_empty ") # prepare - setEnvironmentVariables() + set_default_env_variables() mock_s3_resource_put(mock_s3_resource) mock_db_client(mock_get_client, result_rows=[]) @@ -152,7 +152,7 @@ def test_lambda_handler_when_lambda_happy_flow_then_success( self, mock_get_client, mock_s3_resource ): # prepare - setEnvironmentVariables() + set_default_env_variables() mock_s3_resource_put(mock_s3_resource) mock_db_client(mock_get_client) @@ -198,7 +198,7 @@ def test_lambda_handler_when_missing_required_env_vars_then_throws_error( mock_get_client.reset_mock(return_value=True) mock_s3_resource.reset_mock(return_value=True) - setEnvironmentVariables() + set_default_env_variables() os.environ[x] = "" # execute From ca243764f603cf8f27ee81a065adcb750d41d86b Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 18 Mar 2025 20:05:09 -0700 Subject: [PATCH 13/38] fix sync --- .../oss_ci_job_queue_time/lambda_function.py | 184 +++++++++++++----- torchci/pages/api/clickhouse/[queryName].ts | 1 + 2 files changed, 131 insertions(+), 54 deletions(-) diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py index 16fb868552..f36a016692 100644 --- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py +++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py @@ -5,10 +5,10 @@ import logging import os import gzip -import sys import boto3 # type: ignore[import] import clickhouse_connect +from datetime import datetime, time # Local imports from functools import lru_cache @@ -17,6 +17,34 @@ logging.basicConfig(level=logging.INFO) +_bucket_name = "ossci-raw-job-status" +_in_queue_job_select_statement = """ 
+SELECT + DATE_DIFF( + 'second', + job.created_at, + {timestamp:DateTime} + ) AS queue_s, + workflow.repository.'full_name' AS repo, + workflow.name AS workflow_name, + job.name AS job_name, + job.html_url, + IF( + LENGTH(job.labels) = 0, + 'N/A', + IF( + LENGTH(job.labels) > 1, + job.labels[2], + job.labels[1] + ) + ) AS machine_type, + toUnixTimestamp({timestamp:DateTime}) AS time, + toUnixTimestamp(job.started_at) as started_at, + toUnixTimestamp(job.created_at) as created_at +FROM + default.workflow_job job FINAL + JOIN default.workflow_run workflow FINAL ON workflow.id = job.run_id +""" @lru_cache() def get_clickhouse_client(host: str, user: str, password: str) -> Any: @@ -64,53 +92,78 @@ def upload_to_s3_txt( ) info(f"Done! Finish writing document to S3 {bucket_name}/{key} ") +def query_picked_up_job_for_given_snapshot(time:str,repo: str = 'pytorch/pytorch'): + """ + this query is used to get jobs that were in queue in given snapshot time, but were picked up by workers later + """ + s1 = """ + WITH possible_queued_jobs AS ( + SELECT + id, + run_id, + started_at, + created_at + FROM default.workflow_job -- FINAL not needed since we just use this to filter a table that has already been FINALed + WHERE + started_at > ({timestamp:DateTime}) + AND created_at < ({timestamp:DateTime} - INTERVAL 5 MINUTE) + AND created_at > ({timestamp:DateTime} - INTERVAL 1 WEEK) + )""" + + s2 = """ + WHERE + job.id IN (SELECT id FROM possible_queued_jobs) + AND workflow.id IN (SELECT run_id FROM possible_queued_jobs) + AND workflow.repository.'full_name' = {repo:String} + AND job.status = 'completed' + AND LENGTH(job.steps) != 0 + AND workflow.status = 'completed' + ORDER BY + queue_s DESC + """ + query = s1 + _in_queue_job_select_statement + s2 + + parameters={ + 'timestamp': time , + 'repo': repo, + } + return query,parameters -def query_in_queue_jobs_now() -> str: - query = """ +def query_in_queue_jobs_for_given_snapshot(time:str, repo:str = 'pytorch/pytorch'): + """ + this query is used to get jobs that werre in queue in given snapshot time, and not being picked up by workers + """ + s1 = """ WITH possible_queued_jobs AS ( SELECT id, - run_id + run_id, + started_at, + created_at FROM default.workflow_job -- FINAL not needed since we just use this to filter a table that has already been FINALed WHERE status = 'queued' - AND created_at < (CURRENT_TIMESTAMP() - INTERVAL 5 MINUTE) - AND created_at > (CURRENT_TIMESTAMP() - INTERVAL 1 WEEK) + AND created_at < ({timestamp:DateTime} - INTERVAL 5 MINUTE) + AND created_at > ({timestamp:DateTime} - INTERVAL 1 WEEK) ) - SELECT - DATE_DIFF( - 'second', - job.created_at, - CURRENT_TIMESTAMP() - ) AS queue_s, - workflow.repository.'full_name' AS repo, - workflow.name AS workflow_name, - job.name AS job_name, - job.html_url, - IF( - LENGTH(job.labels) = 0, - 'N/A', - IF( - LENGTH(job.labels) > 1, - job.labels[2], - job.labels[1] - ) - ) AS machine_type, - toUnixTimestamp(CURRENT_TIMESTAMP()) AS time - FROM - default.workflow_job job FINAL - JOIN default.workflow_run workflow FINAL ON workflow.id = job.run_id + """ + s2 =""" WHERE job.id IN (SELECT id FROM possible_queued_jobs) AND workflow.id IN (SELECT run_id FROM possible_queued_jobs) - AND workflow.repository.'full_name' = 'pytorch/pytorch' + AND workflow.repository.'full_name' = {repo:String} AND job.status = 'queued' AND LENGTH(job.steps) = 0 AND workflow.status != 'completed' ORDER BY - queue_s DESC """ - return query - + queue_s DESC + """ + query = s1 + _in_queue_job_select_statement + s2 + parameters={ + 
'timestamp': time , + 'repo': repo, + } + return query, parameters class QueueTimeProcessor: """ @@ -120,7 +173,6 @@ class QueueTimeProcessor: processor = QueueTimeProcessor(clickhouse_client,s3_client) processor.process() """ - def __init__( self, clickhouse_client: Any, s3_client: Any, is_dry_run: bool = False ) -> None: @@ -131,33 +183,57 @@ def __init__( def process(self) -> None: self.proceses_job_queue_times_historical() - def proceses_job_queue_times_historical(self) -> None: - jobs_in_queue = self.get_jobs_in_queue_now() + def proceses_job_queue_times_historical(self, timestamp:str = "", repo: str = 'pytorch/pytorch') -> None: + # by default, we use current time as snapshot + snapshot_time = str(int(datetime.now().timestamp())) + if timestamp: + snapshot_time = timestamp - if len(jobs_in_queue) == 0: - info("No jobs in queue now, skipping writing to s3") - return - info(f"Found {len(jobs_in_queue)} jobs in queue now") - info(f"Peeking data: {jobs_in_queue[0]}") + # fetches jobs that were in queue in given snapshot time, that are not being picked up by workers + queued_query, queued_parameters = query_in_queue_jobs_for_given_snapshot(timestamp,repo) + jobs_in_queue = self.process_in_queue_jobs(queued_query, queued_parameters) - bucket_name = "ossci-raw-job-status" - repo = jobs_in_queue[0]["repo"] - time = jobs_in_queue[0]["time"] + # fetches jobs that were in queue in given snapshot time, but were picked up by workers later of given snapshot time + picked_query, picked_params = query_picked_up_job_for_given_snapshot(timestamp,repo) + jobs_pick = self.process_in_queue_jobs(picked_query, picked_params) - key = f"job_queue_times_historical/{repo}/{time}.txt" + datetime_str = datetime.fromtimestamp(int(timestamp)).strftime('%Y-%m-%d %H:%M:%S') + print(datetime_str,timestamp,len(jobs_in_queue),len(jobs_pick),) + + info(f"Found {len(jobs_in_queue)} jobs in queue, and {len(jobs_pick)} jobs was in queue but picked up by workers later") + if len(jobs_in_queue) == 0 and len(jobs_pick) == 0: + info(f"No jobs in queue at time {datetime_str}, skipping mutation to S3") + return + key = f"job_queue_times_historical/{repo}/{timestamp}.txt" + result = jobs_in_queue + jobs_pick if self.is_dry_run: info( - f"[Dry Run Mode]: {len(jobs_in_queue)} records to S3 {bucket_name}/{key}" + f"[Dry Run Mode]: {len(result)} records to S3 {_bucket_name}/{key}" ) - info(json.dumps(jobs_in_queue, indent=4)) + print(json.dumps(result)) return - upload_to_s3_txt(self.s3_client, bucket_name, key, jobs_in_queue) - - def get_jobs_in_queue_now(self) -> list[dict[str, Any]]: - reader = self.clickhouse_client.query(query_in_queue_jobs_now()) + upload_to_s3_txt(self.s3_client, _bucket_name, key, result) + + def process_in_queue_jobs(self, queryStr:str, parameters:Any) -> list[dict[str, Any]]: + """ + post query process to remove duplicated jobs + this is bc clickhouse client returns duplicated jobs for some reason + """ + seen = set() + db_resp = self.query(queryStr, parameters) + result = [] + for record in db_resp: + if record['html_url']in seen: + continue + seen.add(record['html_url']) + result.append(record) + return result + + def query(self, query, params={}) -> list[dict[str, Any]]: + reader = self.clickhouse_client.query(query, params) # clickhouse returns a generator to return column names and rows # see https://clickhouse.com/docs/integrations/python#the-queryresult-object column_names = reader.column_names @@ -215,7 +291,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--not-dry-run", 
action="store_true", - help="when set true, writing results to s3 from local . By default, local run is dry run mode", + help="when set, writing results to s3 from local environment. By default, we run in dry-run mode for local environment", ) args, _ = parser.parse_known_args() return args @@ -236,10 +312,10 @@ def main() -> None: db_client = get_clickhouse_client_environment() s3_client = get_aws_s3_resource() - # always run in dry run mode in local test environment, unless it's disabled. + # always run in dry-run mode in local environment, unless it's disabled. is_dry_run = not arguments.not_dry_run - QueueTimeProcessor(db_client, s3_client, is_dry_run=is_dry_run).process() + QueueTimeProcessor(db_client, s3_client, is_dry_run=is_dry_run).process() if __name__ == "__main__": main() diff --git a/torchci/pages/api/clickhouse/[queryName].ts b/torchci/pages/api/clickhouse/[queryName].ts index e0461e5982..01c4f0f51d 100644 --- a/torchci/pages/api/clickhouse/[queryName].ts +++ b/torchci/pages/api/clickhouse/[queryName].ts @@ -10,5 +10,6 @@ export default async function handler( queryName, JSON.parse(req.query.parameters as string) ); + res.status(200).json(response); } From c64762f4531f1ddcdff4419fac232ea4665012c2 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 18 Mar 2025 20:59:40 -0700 Subject: [PATCH 14/38] fix sync --- .../oss_ci_job_queue_time/lambda_function.py | 76 +++++++++++-------- .../test_lambda_oss_ci_job_queue_time.py | 30 +++++--- 2 files changed, 62 insertions(+), 44 deletions(-) diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py index f36a016692..e26e3fc9b7 100644 --- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py +++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py @@ -39,13 +39,12 @@ ) ) AS machine_type, toUnixTimestamp({timestamp:DateTime}) AS time, - toUnixTimestamp(job.started_at) as started_at, - toUnixTimestamp(job.created_at) as created_at FROM default.workflow_job job FINAL JOIN default.workflow_run workflow FINAL ON workflow.id = job.run_id """ + @lru_cache() def get_clickhouse_client(host: str, user: str, password: str) -> Any: return clickhouse_connect.get_client( @@ -92,9 +91,10 @@ def upload_to_s3_txt( ) info(f"Done! 
Finished writing document to S3 {bucket_name}/{key} ")


-def query_picked_up_job_for_given_snapshot(time:str,repo: str = 'pytorch/pytorch'):
+def query_picked_up_job_for_given_snapshot(time: str, repo: str = "pytorch/pytorch"):
     """
-    this query is used to get jobs that were in queue in given snapshot time, but were picked up by workers later
+    this query is used to get jobs that were in queue at the given snapshot time, but were picked up by workers later
     """
     s1 = """
     WITH possible_queued_jobs AS (
@@ -123,15 +123,16 @@ def query_picked_up_job_for_given_snapshot(time:str,repo: str = 'pytorch/pytorch'):
     """
     query = s1 + _in_queue_job_select_statement + s2

-    parameters={
-        'timestamp': time ,
-        'repo': repo,
+    parameters = {
+        "timestamp": time,
+        "repo": repo,
     }
-    return query,parameters
+    return query, parameters


-def query_in_queue_jobs_for_given_snapshot(time:str, repo:str = 'pytorch/pytorch'):
+def query_in_queue_jobs_for_given_snapshot(time: str, repo: str = "pytorch/pytorch"):
     """
-    this query is used to get jobs that werre in queue in given snapshot time, and not being picked up by workers
+    this query is used to get jobs that were in queue at the given snapshot time, and were not picked up by workers
     """
     s1 = """
     WITH possible_queued_jobs AS (
@@ -147,7 +148,7 @@ def query_in_queue_jobs_for_given_snapshot(time:str, repo:str = 'pytorch/pytorch'):
         AND created_at > ({timestamp:DateTime} - INTERVAL 1 WEEK)
     )
     """
-    s2 ="""
+    s2 = """
     WHERE
         job.id IN (SELECT id FROM possible_queued_jobs)

     query = s1 + _in_queue_job_select_statement + s2
-    parameters={
-        'timestamp': time ,
-        'repo': repo,
+    parameters = {
+        "timestamp": time,
+        "repo": repo,
     }
     return query, parameters
+
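# Illustrative sketch, not part of the patch: the {timestamp:DateTime} and
# {repo:String} placeholders in the builders above are bound by clickhouse_connect
# from the returned parameters dict, so a caller would use them roughly like this
# (credentials are placeholders; the literal timestamp is a sample value borrowed
# from the tests):
example_client = get_clickhouse_client(host="...", user="...", password="...")
example_query, example_params = query_in_queue_jobs_for_given_snapshot("1742262372")
example_reader = example_client.query(example_query, example_params)
example_rows = example_reader.result_rows  # tuples ordered like example_reader.column_names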
class QueueTimeProcessor:
     """
     this class is used to handle oss ci queue time data aggregation. Currently it fetches in-queue jobs from clickhouse at a given snapshot time
@@ -173,6 +175,7 @@ class QueueTimeProcessor:
     To run the main method:
     processor = QueueTimeProcessor(clickhouse_client,s3_client)
     processor.process()
     """
+
     def __init__(
         self, clickhouse_client: Any, s3_client: Any, is_dry_run: bool = False
     ) -> None:

     def process(self) -> None:
         self.proceses_job_queue_times_historical()

-    def proceses_job_queue_times_historical(self, timestamp:str = "", repo: str = 'pytorch/pytorch') -> None:
+    def proceses_job_queue_times_historical(
+        self, snapshot_time: str = "", repo: str = "pytorch/pytorch"
+    ) -> None:
         # by default, we use the current time as the snapshot
-        snapshot_time = str(int(datetime.now().timestamp()))
-        if timestamp:
-            snapshot_time = timestamp
-
+        timestamp = str(int(datetime.now().timestamp()))
+        if snapshot_time:
+            timestamp = snapshot_time
         # fetch jobs that were in queue at the given snapshot time and had not yet been picked up by workers
-        queued_query, queued_parameters = query_in_queue_jobs_for_given_snapshot(timestamp,repo)
+        queued_query, queued_parameters = query_in_queue_jobs_for_given_snapshot(
+            timestamp, repo
+        )
         jobs_in_queue = self.process_in_queue_jobs(queued_query, queued_parameters)

         # fetch jobs that were in queue at the given snapshot time but were picked up by workers after it
-        picked_query, picked_params = query_picked_up_job_for_given_snapshot(timestamp,repo)
+        picked_query, picked_params = query_picked_up_job_for_given_snapshot(
+            timestamp, repo
+        )
         jobs_pick = self.process_in_queue_jobs(picked_query, picked_params)

-        datetime_str = datetime.fromtimestamp(int(timestamp)).strftime('%Y-%m-%d %H:%M:%S')
-        print(datetime_str,timestamp,len(jobs_in_queue),len(jobs_pick),)
-
+        datetime_str = datetime.fromtimestamp(int(timestamp)).strftime(
+            "%Y-%m-%d %H:%M:%S"
+        )
         info(
             f"[Snapshot time:{datetime_str}]. Found {len(jobs_in_queue)} jobs in queue, and {len(jobs_pick)} jobs were in queue but picked up by workers later"
         )
         if len(jobs_in_queue) == 0 and len(jobs_pick) == 0:
             info(f"No jobs in queue at time {datetime_str}, skipping mutation to S3")
             return

         key = f"job_queue_times_historical/{repo}/{timestamp}.txt"
         result = jobs_in_queue + jobs_pick
         if self.is_dry_run:
-            info(
-                f"[Dry Run Mode]: {len(result)} records to S3 {_bucket_name}/{key}"
-            )
+            info(f"[Dry Run Mode]: {len(result)} records to S3 {_bucket_name}/{key}")
             print(json.dumps(result))
             return

         upload_to_s3_txt(self.s3_client, _bucket_name, key, result)
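# Illustrative sketch, not part of the patch: each snapshot is uploaded as one
# gzip-compressed file of newline-delimited JSON, so a consumer could read a
# snapshot back with standard boto3/gzip calls (the key below is an example
# built from the layout above):
import gzip
import json

import boto3

snapshot = boto3.resource("s3").Object(
    "ossci-raw-job-status",
    "job_queue_times_historical/pytorch/pytorch/1742262372.txt",  # example key
)
snapshot_body = gzip.decompress(snapshot.get()["Body"].read()).decode()
snapshot_records = [json.loads(line) for line in snapshot_body.splitlines()]  # one dict per job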
-    def process_in_queue_jobs(self, queryStr:str, parameters:Any) -> list[dict[str, Any]]:
+    def process_in_queue_jobs(
+        self, queryStr: str, parameters: Any
+    ) -> list[dict[str, Any]]:
         """
         post-query processing to remove duplicated jobs;
         this is because the clickhouse client returns duplicated jobs for some reason
         """
         seen = set()
         db_resp = self.query(queryStr, parameters)
         result = []
+
         for record in db_resp:
-            if record['html_url']in seen:
+            if record["html_url"] in seen:
                 continue
-            seen.add(record['html_url'])
+            seen.add(record["html_url"])
             result.append(record)
         return result

diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
--- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
+++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
 import unittest
 import os
 import gzip

-from typing import Any, List, Tuple
-from unittest.mock import patch
+from typing import Any, List, Tuple, Dict
+from unittest.mock import patch, MagicMock
 from oss_ci_job_queue_time.lambda_function import (
     lambda_handler,
-    get_aws_s3_resource,
-    get_clickhouse_client,
 )

+def mock_query_result(query: str, parameters: str, rows_in_queue: List[Tuple], rows_picked: List[Tuple]) -> Any:
+    result = MagicMock()
+    if "LENGTH(job.steps) = 0" in query:
+        result.column_names = get_default_result_columns()
+        result.result_rows = rows_in_queue
+    if "LENGTH(job.steps) != 0" in query:
+        result.column_names = get_default_result_columns()
+        result.result_rows = rows_picked
+    return result

 def mock_db_client(
     mock: Any,
-    result_rows: List[Tuple] = get_default_result_rows(),
-    result_columns: Tuple = get_default_result_columns(),
+    rows_in_queue: List[Tuple] = get_default_result_rows(),
+    rows_picked: List[Tuple] = [],
 ) -> None:
     mock_client = mock.return_value
-    mock_client.query.return_value.result_rows = result_rows
-    mock_client.query.return_value.column_names = result_columns
+    mock_client.query.side_effect = (
+        lambda query, parameters: mock_query_result(query, parameters, rows_in_queue, rows_picked)
+    )

         # prepare
         set_default_env_variables()
         mock_s3_resource_put(mock_s3_resource)
-        mock_db_client(mock_get_client, result_rows=[])
+        mock_db_client(mock_get_client, [], [])

         # assert clickhouse client
         mock_get_client.assert_called_once()
-        mock_get_client.return_value.query.assert_called_once()
+        self.assertEqual(mock_get_client.return_value.query.call_count, 2)

From 2814b4b7025b38cc2f061577416902b660247ab5 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Tue, 18 Mar 2025 21:02:16 -0700
Subject: [PATCH 15/38] fix sync
---
 .../tests/test_lambda_oss_ci_job_queue_time.py | 31 ++++++++++++-------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
--- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
+++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
 import unittest
 import os
 import gzip

 from typing import Any, List, Tuple, Dict
-from unittest.mock import patch,MagicMock
+from unittest.mock import patch, MagicMock
 from oss_ci_job_queue_time.lambda_function import (
     lambda_handler,
 )

 def get_default_result_rows(test_sample: str = "0"):
     """
     generate result rows for testing, this corresponds to the following columns:
     'queue_s', 'repo', 'workflow_name', 'job_name', 'html_url', 'machine_type', 'time'
     """
-    if (test_sample == "0"):
+    if test_sample == "0":
         return [
             (
                 60000,
                 "pytorch/pytorch",
                 "workflow-name-1",
                 "job-name-1",
                 "runs/1/job/1",
                 "linux.aws.h100",
                 1742262372,
             ),
             (
                 1400,
                 "pytorch/pytorch",
                 "workflow-name-2",
                 "job-name-2",
                 "runs/2/job/2",
                 "linux.rocm.gpu.2",
                 1742262372,
-            )]
+            ),
+        ]

     return [
         (
             60000,
             "pytorch/pytorch",
             "inductor-h100",
             "test1 (h100, 5, 5, linux.aws.h100)",
             "runs/1/job/1",
             "linux.aws.h100",
             1742262372,
         ),
         (
             50000,
             "pytorch/pytorch",
             "inductor-h100",
             "test1 (h100, 5, 5, linux.aws.h100)",
             "runs/1/job/2",
             "linux.aws.h100",
             1742262372,
         ),
         (
             55000,
             "pytorch/pytorch",
             "inductor-h100",
             "test1 (h100, 2, 6, linux.aws.h100)",
             "runs/1/job/3",
             "linux.aws.h100",
             1742262372,
         ),
         (
             1729,
             "pytorch/pytorch",
             "inductor-h100",
             "test2 (h100, 1, 1, linux.aws.h100)",
             "runs/2/job/1",
             "linux.aws.h100",
             1742262372,
         ),
         (
             1352,
             "pytorch/pytorch",
             "inductor-rocm",
             "rocm-test1(1, 1, linux.rocm.gpu.2)",
             "runs/3/job/1",
             "linux.rocm.gpu.2",
             1742262372,
         ),
         (
             1400,
             "pytorch/pytorch",
             "inductor-rocm",
             "rocm-test1 (1, 1, linux.rocm.gpu.2)",
             "runs/4/job/2",
             "linux.rocm.gpu.2",
             1742262372,
         ),
     ]

-def get_default_result_columns() -> Tuple:
-    return ("queue_s", "repo", "workflow_name", "job_name", "html_url", "machine_type","time")
+def get_default_result_columns() -> Tuple:
+    return (
+        "queue_s",
+        "repo",
+        "workflow_name",
+        "job_name",
+        "html_url",
+        "machine_type",
+        "time",
+    )

-def mock_query_result(query: str, parameters: str, rows_in_queue: List[Tuple], rows_picked: List[Tuple]) -> Any:
+def mock_query_result(
+    query: str, parameters: str, rows_in_queue: List[Tuple], rows_picked: List[Tuple]
+) -> Any:
     result = MagicMock()
     if "LENGTH(job.steps) = 0" in query:
         result.column_names = get_default_result_columns()
         result.result_rows = rows_in_queue
     if "LENGTH(job.steps) != 0" in query:
         result.column_names = get_default_result_columns()
         result.result_rows = rows_picked
     return result

 def mock_db_client(
     mock: Any,
     rows_in_queue: List[Tuple] = get_default_result_rows(),
     rows_picked: List[Tuple] = [],
 ) -> None:
     mock_client = mock.return_value
-    mock_client.query.side_effect = (
-        lambda query, parameters: mock_query_result(query,parameters, rows_in_queue, rows_picked)
-    )
+    mock_client.query.side_effect = lambda query, parameters: mock_query_result(
+        query, parameters, rows_in_queue, rows_picked
+    )

 def set_default_env_variables():
     os.environ["CLICKHOUSE_ENDPOINT"] = "https://clickhouse.test1"
     os.environ["CLICKHOUSE_USERNAME"] = "user1"
     os.environ["CLICKHOUSE_PASSWORD"] = "pwd1"

 class Test(unittest.TestCase):
     @patch("oss_ci_job_queue_time.lambda_function.get_aws_s3_resource")
     @patch("oss_ci_job_queue_time.lambda_function.get_clickhouse_client")
     def 
From 2814b4b7025b38cc2f061577416902b660247ab5 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Tue, 18 Mar 2025 21:02:16 -0700
Subject: [PATCH 15/38] fix sync

---
 .../test_lambda_oss_ci_job_queue_time.py      | 31 ++++++++++++++-----
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
index 3de3e066d8..8ee60e0a1c 100644
--- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
+++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
@@ -3,7 +3,7 @@
 import gzip
 
 from typing import Any, List, Tuple, Dict
-from unittest.mock import patch,MagicMock
+from unittest.mock import patch, MagicMock
 from oss_ci_job_queue_time.lambda_function import (
     lambda_handler,
 )
@@ -14,7 +14,7 @@ def get_default_result_rows(test_sample: str = "0"):
     generate result rows for testing, this corresponds to the following columns:
     'queue_s', 'repo', 'workflow_name', 'job_name', 'html_url', 'machine_type', 'time'
     """
-    if (test_sample == "0"):
+    if test_sample == "0":
         return [
             (
                 60000,
@@ -33,7 +33,8 @@ def get_default_result_rows(test_sample: str = "0"):
                 "runs/2/job/2",
                 "linux.rocm.gpu.2",
                 1742262372,
-            )]
+            ),
+        ]
 
     return [
         (
@@ -92,10 +93,22 @@ def get_default_result_rows(test_sample: str = "0"):
         ),
     ]
 
+
 def get_default_result_columns() -> Tuple:
-    return ("queue_s", "repo", "workflow_name", "job_name", "html_url", "machine_type","time")
+    return (
+        "queue_s",
+        "repo",
+        "workflow_name",
+        "job_name",
+        "html_url",
+        "machine_type",
+        "time",
+    )
 
-def mock_query_result(query: str, parameters:str, rows_in_queue: List[Tuple], rows_picked: List[Tuple]) ->Any:
+
+def mock_query_result(
+    query: str, parameters: str, rows_in_queue: List[Tuple], rows_picked: List[Tuple]
+) -> Any:
     result = MagicMock()
     if "LENGTH(job.steps) = 0" in query:
         result.column_names = get_default_result_columns()
         result.result_rows = rows_in_queue
@@ -105,6 +118,7 @@ def mock_query_result(query: str, parameters:str, ro
         result.result_rows = rows_picked
     return result
 
+
 def mock_s3_resource_put(mock_s3_resource: Any) -> None:
     mock_s3 = mock_s3_resource.return_value
     mock_object = mock_s3.Object.return_value
@@ -129,10 +143,11 @@ def mock_db_client(
     rows_picked: List[Tuple] = [],
 ) -> None:
     mock_client = mock.return_value
-    mock_client.query.side_effect = (
-        lambda query, parameters: mock_query_result(query,parameters, rows_in_queue, rows_picked)
-    )
+    mock_client.query.side_effect = lambda query, parameters: mock_query_result(
+        query, parameters, rows_in_queue, rows_picked
+    )
 
+
 def set_default_env_variables():
     os.environ["CLICKHOUSE_ENDPOINT"] = "https://clickhouse.test1"
     os.environ["CLICKHOUSE_USERNAME"] = "user1"
     os.environ["CLICKHOUSE_PASSWORD"] = "pwd1"
@@ -150,7 +165,7 @@ def test_lambda_handler_when_row_result_is_empty(
         # prepare
         set_default_env_variables()
         mock_s3_resource_put(mock_s3_resource)
-        mock_db_client(mock_get_client,[],[])
+        mock_db_client(mock_get_client, [], [])
 
         # execute
         lambda_handler(None, None)
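The side_effect wiring above replaces a single canned query result with per-query routing: mock_query_result inspects the SQL text and returns queued rows for the queued-jobs query (LENGTH(job.steps) = 0) and picked rows for the picked-up query (LENGTH(job.steps) != 0). A minimal self-contained sketch of that routing using only unittest.mock — names and row values here are illustrative, not from the test file:

    # Sketch of routing a mocked clickhouse client by query substring.
    from unittest.mock import MagicMock


    def route(query: str, rows_in_queue, rows_picked):
        result = MagicMock()
        if "LENGTH(job.steps) = 0" in query:
            result.result_rows = rows_in_queue  # jobs still waiting for a runner
        if "LENGTH(job.steps) != 0" in query:
            result.result_rows = rows_picked  # jobs picked up after the snapshot
        return result


    client = MagicMock()
    client.query.side_effect = lambda query, parameters: route(query, ["queued"], ["picked"])
    assert client.query("... LENGTH(job.steps) = 0 ...", {}).result_rows == ["queued"]
    assert client.query("... LENGTH(job.steps) != 0 ...", {}).result_rows == ["picked"]
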
From ec39f93bbbd7b8350da11b9e870c951e26f70b8c Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Tue, 18 Mar 2025 21:10:40 -0700
Subject: [PATCH 16/38] fix sync

---
 .../oss_ci_job_queue_time/lambda_function.py  |  5 +++-
 .../test_lambda_oss_ci_job_queue_time.py      | 29 +++++++++++++++++--
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index e26e3fc9b7..eb00b37f54 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -18,6 +18,7 @@
 logging.basicConfig(level=logging.INFO)
 
 _bucket_name = "ossci-raw-job-status"
+# common query statement for in_queue jobs
 _in_queue_job_select_statement = """
 SELECT
     DATE_DIFF(
@@ -209,11 +210,13 @@ def proceses_job_queue_times_historical(
         datetime_str = datetime.fromtimestamp(int(timestamp)).strftime(
             "%Y-%m-%d %H:%M:%S"
         )
+
         info(
             f"[Snapshot time:{datetime_str}]. Found {len(jobs_in_queue)} jobs in queue, and {len(jobs_pick)} jobs were in queue but picked up by workers later"
         )
+
         if len(jobs_in_queue) == 0 and len(jobs_pick) == 0:
-            info(f"No jobs in queue at time {datetime_str}, skipping mutation to S3")
+            info(f"No jobs were in queue at time {datetime_str}, skipping")
             return
 
         key = f"job_queue_times_historical/{repo}/{timestamp}.txt"
diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
index 8ee60e0a1c..6fbcbc4f74 100644
--- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
+++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
@@ -4,9 +4,7 @@
 
 from typing import Any, List, Tuple, Dict
 from unittest.mock import patch, MagicMock
-from oss_ci_job_queue_time.lambda_function import (
-    lambda_handler,
-)
+from oss_ci_job_queue_time.lambda_function import lambda_handler, main
 
 
 def get_default_result_rows(test_sample: str = "0"):
@@ -233,6 +231,31 @@ def test_lambda_handler_when_missing_required_env_vars_then_throws_error(
                 mock_s3_resource
             ).return_value.put.assert_not_called()
 
+    @patch("oss_ci_job_queue_time.lambda_function.get_aws_s3_resource")
+    @patch("oss_ci_job_queue_time.lambda_function.get_clickhouse_client")
+    def test_local_run_with_dry_run_when_lambda_happy_flow_then_success_without_s3_write(
+        self, mock_get_client, mock_s3_resource
+    ):
+        # prepare
+        set_default_env_variables()
+        mock_s3_resource_put(mock_s3_resource)
+        mock_db_client(mock_get_client)
+
+        # execute
+        main()
+
+        # assert
+
+        # assert clickhouse client
+        mock_get_client.assert_called_once()
+        self.assertEqual(mock_get_client.return_value.query.call_count, 2)
+
+        # assert s3 resource
+        mock_s3_resource.assert_called_once()
+        get_mock_s3_resource_object(
+            mock_s3_resource
+        ).return_value.put.assert_not_called()
+
 
 if __name__ == "__main__":
     unittest.main()
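The two queries these patches keep refining implement one idea: a snapshot at time T covers every job created before T that had not started running by T, split into jobs still queued at T and jobs a runner picked up only after T. A small pure-Python sketch of that classification, with hypothetical job dicts and unix-second timestamps:

    # Sketch of the snapshot classification behind the queued/picked-up queries.
    def classify(jobs, t):
        in_queue = [
            j for j in jobs if j["created_at"] <= t and j["started_at"] is None
        ]
        picked_later = [
            j
            for j in jobs
            if j["created_at"] <= t
            and j["started_at"] is not None
            and j["started_at"] > t
        ]
        return in_queue, picked_later


    jobs = [
        {"name": "a", "created_at": 100, "started_at": None},  # still queued at t=200
        {"name": "b", "created_at": 100, "started_at": 300},  # picked up after t=200
        {"name": "c", "created_at": 100, "started_at": 150},  # already running at t=200
    ]
    in_queue, picked_later = classify(jobs, 200)
    assert [j["name"] for j in in_queue] == ["a"]
    assert [j["name"] for j in picked_later] == ["b"]
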
From 893e7191f999a1ea73af16527de9144ee50efd58 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Tue, 18 Mar 2025 23:34:40 -0700
Subject: [PATCH 17/38] fix sync

---
 .../oss_ci_job_queue_time/lambda_function.py  | 365 ++++++++++++++++--
 .../oss_ci_job_queue_time/requirements.txt    |   3 +
 .../test_lambda_oss_ci_job_queue_time.py      |  89 +++--
 3 files changed, 390 insertions(+), 67 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index eb00b37f54..019bada68f 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -5,6 +5,9 @@
 import logging
 import os
 import gzip
+import threading
+import dateutil.parser
+import yaml
 
 import boto3  # type: ignore[import]
 import clickhouse_connect
@@ -13,12 +16,14 @@
 # Local imports
 from functools import lru_cache
 from logging import info
-from typing import Any
+from typing import Any, Optional, Dict, Set, Iterable, List, Tuple
+from github import Github, Auth
+from dateutil.parser import parse
+
 
 logging.basicConfig(level=logging.INFO)
 
 _bucket_name = "ossci-raw-job-status"
 
 _in_queue_job_select_statement = """
 SELECT
     DATE_DIFF(
@@ -39,7 +44,7 @@
             job.labels[1]
         )
     ) AS machine_type,
-    toUnixTimestamp({timestamp:DateTime}) AS time,
+    toUnixTimestamp({timestamp:DateTime}) AS time
 FROM
     default.workflow_job job FINAL
     JOIN default.workflow_run workflow FINAL ON workflow.id = job.run_id
@@ -93,7 +98,248 @@ def upload_to_s3_txt(
     info(f"Done! Finish writing document to S3 {bucket_name}/{key} ")
 
 
+class LazyFileHistory:
+    """
+    Reads the content of a file from a GitHub repository as of the version at a given date and time. It caches the fetched commits and file contents to avoid unnecessary requests to the GitHub API.
+    All public methods are thread-safe.
+    """
+
+    def __init__(self, repo: Any, path: str) -> None:
+        self.repo = repo
+        self.path = path
+        self._commits_cache = []
+        self._content_cache = {}
+        self._fetched_all_commits = False
+        self._lock = threading.RLock()
+
+    def is_unix_timestamp(self, value: str) -> bool:
+        """Check if the string is a valid Unix timestamp."""
+        if value.isdigit():  # Ensure it's numeric
+            try:
+                timestamp = int(value)
+                # Check that it can be interpreted as a datetime
+                datetime.fromtimestamp(timestamp)
+                return True
+            except (ValueError, OSError):
+                return False
+        return False
+
+    def get_version_after_timestamp(self, timestamp: str | datetime) -> Optional[str]:
+        try:
+            with self._lock:
+                if not isinstance(timestamp, datetime):
+                    if self.is_unix_timestamp(timestamp):
+                        timestamp = datetime.fromtimestamp(
+                            float(timestamp)
+                        ).astimezone()
+                    else:
+                        timestamp = parse(timestamp)
+                commit = self._find_earliest_after_in_cache(timestamp)
+                if commit:
+                    return self._fetch_content_for_commit(commit)
+
+                if not self._fetched_all_commits:
+                    commit = self._fetch_until_timestamp(timestamp)
+                    if commit:
+                        return self._fetch_content_for_commit(commit)
+        except Exception as e:
+            print(
+                f"Error fetching content for {self.repo} : {self.path} at {timestamp}: {e}"
+            )
+
+        return None
+
+    def _find_earliest_after_in_cache(self, timestamp: datetime) -> Optional[str]:
+        commits_after = [
+            c for c in self._commits_cache if c.commit.author.date > timestamp
+        ]
+        if not commits_after:
+            return None
+        return commits_after[-1]
+
+    def _fetch_until_timestamp(self, timestamp: datetime) -> Optional[str]:
+        all_commits = self.repo.get_commits(path=self.path)
+        known_shas = {c.sha for c in self._commits_cache}
+
+        newly_fetched = []
+
+        for commit in all_commits:
+            if commit.sha in known_shas:
+                break
+            newly_fetched.append(commit)
+
+            if commit.commit.author.date <= timestamp:
+                break
+
+        self._commits_cache.extend(newly_fetched)
+        self._commits_cache.sort(key=lambda c: c.commit.author.date, reverse=True)
+
+        if not newly_fetched:
+            self._fetched_all_commits = True
+
+        return self._find_earliest_after_in_cache(timestamp)
+
+    def _fetch_content_for_commit(self, commit: Any) -> str:
+        if commit.sha not in self._content_cache:
+            print(
f"Fetching content for {self.repo} : {self.path} at {commit.commit.author.date} - {commit.sha}" + ) + # We can retrieve the file content at a specific commit + file_content = self.repo.get_contents( + self.path, ref=commit.sha + ).decoded_content.decode() + self._content_cache[commit.sha] = file_content + return self._content_cache[commit.sha] + + +def explode_runner_variants( + runner_configs: Dict[str, Dict[str, Any]] +) -> Dict[str, Dict[str, Any]]: + runner_types_list = [i for i in runner_configs["runner_types"].items()] + + for runner, runner_config in runner_types_list: + if "variants" in runner_config: + for variant, variant_config in runner_config["variants"].items(): + if runner.startswith("lf."): + runner_without_lf = runner[3:] + variant_name = f"lf.{variant}.{runner_without_lf}" + else: + variant_name = f"{variant}.{runner}" + runner_configs["runner_types"][variant_name] = { + **runner_config, + **variant_config, + } + return runner_configs + + +def update_tags( + tag_categories: Dict[str, Set[str]], machine_types: Iterable[str] +) -> None: + """ + iterate through machine types from jobs, and update potential tags that it belongs to + """ + for machine_type in machine_types: + if not machine_type: + continue + tag_categories["all"].add(machine_type) + if machine_type not in tag_categories["dynamic"]: + if "ubuntu" in machine_type.lower(): + tag_categories["linux"].add(machine_type) + tag_categories["github"].add(machine_type) + else: + tag_categories["other"].add(machine_type) + + +def create_tag_categorires( + runner_configs: Dict[str, Dict[str, Any]], + lf_runner_configs: Dict[str, Dict[str, Any]], +) -> Dict[str, Set[str]]: + """ + Create the tag_categorires, that are groups of runners with some common characteristics that we might find relevant + to view them in a group instead of individually. 
+ """ + breakdowns = { + "github": set(), # provided by github + "pet": set(), # managed as pet instances + "dynamic": set(), # managed as auto-scaling instances + "ephemeral": set(), # auto-scaling instances that are ephemeral + "nonephemeral": set(), # auto-scaling instances that are not ephemeral + "linux": set(), # linux instances + "linux-meta": set(), # linux instances provided by meta + "linux-lf": set(), # linux instances provided by Linux Foundation + "macos": set(), # macos instances + "macos-meta": set(), # macos instances provided by meta + "windows": set(), # windows instances + "windows-meta": set(), # windows instances provided by meta + "windows-lf": set(), # windows instances provided by Linux Foundation + "all": set(), # all instances + "lf": set(), # instances managed by Linux Foundation + "meta": set(), # instances managed by meta + "multi-tenant": set(), # instances that are multi-tenant + "other": set(), # other instances + } + + github_mac_runners = ( + "macos-12", + "macos-12-xl", + "macos-13-large", + "macos-13-xl", + "macos-13-xlarge", + "macos-14-arm64", + "macos-14-xlarge", + ) + breakdowns["github"].update(github_mac_runners) + breakdowns["macos"].update(github_mac_runners) + + meta_pet_mac_runners = ( + "macos-m1-12", + "macos-m1-13", + "macos-m1-14", + "macos-m1-stable", + "macos-m2-14", + "macos-m2-15", + "macos-m2-max", + ) + breakdowns["meta"].update(meta_pet_mac_runners) + breakdowns["macos"].update(meta_pet_mac_runners) + breakdowns["pet"].update(meta_pet_mac_runners) + + meta_pet_nvidia = ( + "linux.aws.a100", + "linux.aws.h100", + ) + breakdowns["meta"].update(meta_pet_nvidia) + breakdowns["linux"].update(meta_pet_nvidia) + breakdowns["linux-meta"].update(meta_pet_nvidia) + breakdowns["pet"].update(meta_pet_nvidia) + breakdowns["multi-tenant"].update(meta_pet_nvidia) + + all_runners_configs = ( + runner_configs["runner_types"] | lf_runner_configs["runner_types"] + ) + + for runner, runner_config in all_runners_configs.items(): + breakdowns["dynamic"].add(runner) + + if "is_ephemeral" in runner_config and runner_config["is_ephemeral"]: + breakdowns["ephemeral"].add(runner) + else: + breakdowns["nonephemeral"].add(runner) + + if runner_config["os"].lower() == "linux": + breakdowns["linux"].add(runner) + elif runner_config["os"].lower() == "windows": + breakdowns["windows"].add(runner) + + for runner, runner_config in runner_configs["runner_types"].items(): + breakdowns["meta"].add(runner) + + if runner_config["os"].lower() == "linux": + breakdowns["linux-meta"].add(runner) + elif runner_config["os"].lower() == "windows": + breakdowns["windows-meta"].add(runner) + + for runner, runner_config in lf_runner_configs["runner_types"].items(): + breakdowns["lf"].add(runner) + + if runner_config["os"].lower() == "linux": + breakdowns["linux-lf"].add(runner) + elif runner_config["os"].lower() == "windows": + breakdowns["windows-lf"].add(runner) + + return breakdowns + + +def get_runner_config( + retriever: LazyFileHistory, start_time: str | datetime +) -> Dict[str, Dict[str, Any]]: + contents = retriever.get_version_after_timestamp(start_time) + if contents: + return explode_runner_variants(yaml.safe_load(contents)) + return {"runner_types": {}} + + +def get_query_statement_for_picked_up_job(time: str, repo: str = "pytorch/pytorch"): """ this query is used to get jobs that were in queue in given snapshot time, but were picked up by workers later """ @@ -123,7 +369,6 @@ def query_picked_up_job_for_given_snapshot(time: str, repo: str = "pytorch/pytor queue_s DESC 
""" query = s1 + _in_queue_job_select_statement + s2 - parameters = { "timestamp": time, "repo": repo, @@ -131,7 +376,7 @@ def query_picked_up_job_for_given_snapshot(time: str, repo: str = "pytorch/pytor return query, parameters -def query_in_queue_jobs_for_given_snapshot(time: str, repo: str = "pytorch/pytorch"): +def get_query_statement_for_queueing_jobs(time: str, repo: str = "pytorch/pytorch"): """ this query is used to get jobs that werre in queue in given snapshot time, and not being picked up by workers """ @@ -168,6 +413,28 @@ def query_in_queue_jobs_for_given_snapshot(time: str, repo: str = "pytorch/pytor return query, parameters +def get_config_retrievers(github_access_token: str) -> Tuple[Any, Any, Any]: + auth = Auth.Token(github_access_token) + test_infra_repo = Github(auth=auth).get_repo("pytorch/test-infra") + pytorch_repo = Github(auth=auth).get_repo("pytorch/pytorch") + + meta_runner_config_retriever = LazyFileHistory( + test_infra_repo, ".github/scale-config.yml" + ) + lf_runner_config_retriever = LazyFileHistory( + test_infra_repo, ".github/lf-scale-config.yml" + ) + old_lf_lf_runner_config_retriever = LazyFileHistory( + pytorch_repo, ".github/lf-scale-config.yml" + ) + + return ( + meta_runner_config_retriever, + lf_runner_config_retriever, + old_lf_lf_runner_config_retriever, + ) + + class QueueTimeProcessor: """ this class used to handle oss ci queue time data aggregations. Currently it fetches in-queue jobs from clickhouse at current time @@ -185,24 +452,33 @@ def __init__( self.is_dry_run = is_dry_run def process(self) -> None: - self.proceses_job_queue_times_historical() - - def proceses_job_queue_times_historical( - self, snapshot_time: str = "", repo: str = "pytorch/pytorch" - ) -> None: - # by default, we use current time as snapshot - timestamp = str(int(datetime.now().timestamp())) - if snapshot_time: - timestamp = snapshot_time + github_access_token = os.getenv("GITHUB_ACCESS_TOKEN", "") + if not github_access_token: + raise ValueError("Missing environment variable GITHUB_ACCESS_TOKEN") + + ( + meta_runner_config_retriever, + lf_runner_config_retriever, + old_lf_lf_runner_config_retriever, + ) = get_config_retrievers(github_access_token) + self.proceses_job_queue_times_historical( + meta_runner_config_retriever, + lf_runner_config_retriever, + old_lf_lf_runner_config_retriever, + ) - # fetches jobs that were in queue in given snapshot time, that are not being picked up by workers - queued_query, queued_parameters = query_in_queue_jobs_for_given_snapshot( + def snapshot_jobs_in_queue( + self, timestamp: str = "", repo: str = "pytorch/pytorch" + ) -> List[Dict[str, Any]]: + # in given snapshot time, fetches jobs that were in queue but not being picked up by workers + queued_query, queued_parameters = get_query_statement_for_queueing_jobs( timestamp, repo ) jobs_in_queue = self.process_in_queue_jobs(queued_query, queued_parameters) - # fetches jobs that were in queue in given snapshot time, but were picked up by workers later of given snapshot time - picked_query, picked_params = query_picked_up_job_for_given_snapshot( + # in queue in given snapshot time, fetches jobs that were in queue but were picked up by workers later of given snapshot time + # this happens when the snapshot time is not latest timestamp + picked_query, picked_params = get_query_statement_for_picked_up_job( timestamp, repo ) jobs_pick = self.process_in_queue_jobs(picked_query, picked_params) @@ -214,19 +490,57 @@ def proceses_job_queue_times_historical( info( f"[Snapshot time:{datetime_str}]. 
Found {len(jobs_in_queue)} jobs in queue, and {len(jobs_pick)} jobs were in queue but picked up by workers later"
         )
+        result = jobs_in_queue + jobs_pick
+        return result
 
-        if len(jobs_in_queue) == 0 and len(jobs_pick) == 0:
-            info(f"No jobs were in queue at time {datetime_str}, skipping")
+    def proceses_job_queue_times_historical(
+        self,
+        meta_runner_config_retriever,
+        lf_runner_config_retriever,
+        old_lf_lf_runner_config_retriever,
+        snapshot_time: str = "",
+        repo: str = "pytorch/pytorch",
+    ) -> None:
+        # by default, we use current time as snapshot
+        timestamp = str(int(datetime.now().timestamp()))
+        if snapshot_time:
+            timestamp = snapshot_time
+
+        snapshot = self.snapshot_jobs_in_queue(timestamp, repo)
+        if len(snapshot) == 0:
+            info(f"No jobs in queue at time: {timestamp}")
             return
 
+        lf_runner_config = get_runner_config(lf_runner_config_retriever, timestamp)
+
+        if not lf_runner_config or not lf_runner_config["runner_types"]:
+            lf_runner_config = get_runner_config(
+                old_lf_lf_runner_config_retriever, timestamp
+            )
+
+        # create dictionary of tags with set of targeting machine types
+        tag_categories = create_tag_categorires(
+            get_runner_config(meta_runner_config_retriever, timestamp), lf_runner_config
+        )
+        update_tags(tag_categories, set([job["machine_type"] for job in snapshot]))
+
+        # iterate throught jobs, and update tags for each job
+        for job in snapshot:
+            job_tags = []
+            for tag in tag_categories:
+                if job["machine_type"] in tag_categories[tag]:
+                    job_tags.append(tag)
+            job["tags"] = job_tags
+
         key = f"job_queue_times_historical/{repo}/{timestamp}.txt"
-        result = jobs_in_queue + jobs_pick
         if self.is_dry_run:
-            info(f"[Dry Run Mode]: {len(result)} records to S3 {_bucket_name}/{key}")
-            print(json.dumps(result))
+            info(f"[Dry Run Mode]: {len(snapshot)} records to S3 {_bucket_name}/{key}")
+            info(json.dumps(snapshot))
             return
 
-        upload_to_s3_txt(self.s3_client, _bucket_name, key, result)
+        print("Yang", snapshot)
+
+        upload_to_s3_txt(self.s3_client, _bucket_name, key, snapshot)
 
     def process_in_queue_jobs(
         self, queryStr: str, parameters: Any
@@ -238,7 +552,6 @@ def process_in_queue_jobs(
         """
         post query process to remove duplicated jobs
         this is because the clickhouse client returns duplicated jobs for some reason
         """
         seen = set()
         db_resp = self.query(queryStr, parameters)
         result = []
-
         for record in db_resp:
             if record["html_url"] in seen:
                 continue
diff --git a/aws/lambda/oss_ci_job_queue_time/requirements.txt b/aws/lambda/oss_ci_job_queue_time/requirements.txt
index 3e22fde96f..87c33c2e7f 100644
--- a/aws/lambda/oss_ci_job_queue_time/requirements.txt
+++ b/aws/lambda/oss_ci_job_queue_time/requirements.txt
@@ -1,2 +1,5 @@
 clickhouse_connect==0.8.5
 boto3==1.35.33
+PyGithub==1.59.0
+python-dateutil==2.8.2
+PyYAML==6.0.1
diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
index 6fbcbc4f74..46412d15a8 100644
--- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
+++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
@@ -142,41 +142,53 @@ def set_default_env_variables():
     os.environ["CLICKHOUSE_ENDPOINT"] = "https://clickhouse.test1"
     os.environ["CLICKHOUSE_USERNAME"] = "user1"
     os.environ["CLICKHOUSE_PASSWORD"] = "pwd1"
+    os.environ["GITHUB_ACCESS_TOKEN"] = "token1"
 
 
 class Test(unittest.TestCase):
-    @patch("oss_ci_job_queue_time.lambda_function.get_aws_s3_resource")
-    @patch("oss_ci_job_queue_time.lambda_function.get_clickhouse_client")
-    def test_lambda_handler_when_row_result_is_empty(
-        self, mock_get_client, mock_s3_resource
-    ):
+    def setUp(self):
+        patcher1 = patch("oss_ci_job_queue_time.lambda_function.get_aws_s3_resource")
patcher2 = patch("oss_ci_job_queue_time.lambda_function.get_clickhouse_client") + patcher3 = patch("oss_ci_job_queue_time.lambda_function.get_runner_config") + patcher4 = patch("oss_ci_job_queue_time.lambda_function.get_config_retrievers") + + self.mock_s3_resource = patcher1.start() + self.mock_get_client = patcher2.start() + self.mock_get_runner_config = patcher3.start() + self.mock_get_config_retrievers = patcher4.start() + + self.mock_get_runner_config.return_value = {"runner_types": {}} + self.mock_get_config_retrievers.return_value = ({}, {}, {}) + + self.addCleanup(patcher1.stop) # Ensure patchers stop after each test + self.addCleanup(patcher2.stop) + self.addCleanup(patcher3.stop) + self.addCleanup(patcher4.stop) + + def test_lambda_handler_when_row_result_is_empty(self): print("test_lambda_handler_when_row_result_is_empty ") # prepare set_default_env_variables() - mock_s3_resource_put(mock_s3_resource) - mock_db_client(mock_get_client, [], []) + mock_s3_resource_put(self.mock_s3_resource) + mock_db_client(self.mock_get_client, [], []) # execute lambda_handler(None, None) # assert - mock_get_client.assert_called_once() + self.mock_get_client.assert_called_once() get_mock_s3_resource_object( - mock_s3_resource + self.mock_s3_resource ).return_value.put.assert_not_called() - @patch("oss_ci_job_queue_time.lambda_function.get_aws_s3_resource") - @patch("oss_ci_job_queue_time.lambda_function.get_clickhouse_client") - def test_lambda_handler_when_lambda_happy_flow_then_success( - self, mock_get_client, mock_s3_resource - ): + def test_lambda_handler_when_lambda_happy_flow_then_success(self): # prepare set_default_env_variables() - mock_s3_resource_put(mock_s3_resource) - mock_db_client(mock_get_client) + mock_s3_resource_put(self.mock_s3_resource) + mock_db_client(self.mock_get_client) - expected_r1 = b'{"queue_s": 60000, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-1", "job_name": "job-name-1", "html_url": "runs/1/job/1", "machine_type": "linux.aws.h100", "time": 1742262372}\n' - expected_r2 = b'{"queue_s": 1400, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-2", "job_name": "job-name-2", "html_url": "runs/2/job/2", "machine_type": "linux.rocm.gpu.2", "time": 1742262372}\n' + expected_r1 = b'{"queue_s": 60000, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-1", "job_name": "job-name-1", "html_url": "runs/1/job/1", "machine_type": "linux.aws.h100", "time": 1742262372, "tags": ["pet", "linux", "linux-meta", "all", "meta", "multi-tenant", "other"]}\n' + expected_r2 = b'{"queue_s": 1400, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-2", "job_name": "job-name-2", "html_url": "runs/2/job/2", "machine_type": "linux.rocm.gpu.2", "time": 1742262372, "tags": ["all", "other"]}\n' expected_s3_body = expected_r1 + expected_r2 expect = gzip.compress(expected_s3_body) @@ -186,36 +198,33 @@ def test_lambda_handler_when_lambda_happy_flow_then_success( # assert # assert clickhouse client - mock_get_client.assert_called_once() - self.assertEqual(mock_get_client.return_value.query.call_count, 2) + self.mock_get_client.assert_called_once() + self.assertEqual(self.mock_get_client.return_value.query.call_count, 2) # assert s3 resource - mock_s3_resource.assert_called_once() + self.mock_s3_resource.assert_called_once() get_mock_s3_resource_object( - mock_s3_resource + self.mock_s3_resource ).return_value.put.assert_called_once() get_mock_s3_resource_object( - mock_s3_resource + self.mock_s3_resource ).return_value.put.assert_called_once_with( Body=expect, 
ContentEncoding="gzip", ContentType="text/plain" ) - @patch("boto3.resource") - @patch("clickhouse_connect.get_client") - def test_lambda_handler_when_missing_required_env_vars_then_throws_error( - self, mock_get_client, mock_s3_resource - ): + def test_lambda_handler_when_missing_required_env_vars_then_throws_error(self): test_cases = [ ("CLICKHOUSE_ENDPOINT"), ("CLICKHOUSE_USERNAME"), ("CLICKHOUSE_PASSWORD"), + ("GITHUB_ACCESS_TOKEN"), ] for x in test_cases: with self.subTest(x=x): # prepare - mock_get_client.reset_mock(return_value=True) - mock_s3_resource.reset_mock(return_value=True) + self.mock_get_client.reset_mock(return_value=True) + self.mock_s3_resource.reset_mock(return_value=True) set_default_env_variables() os.environ[x] = "" @@ -226,20 +235,18 @@ def test_lambda_handler_when_missing_required_env_vars_then_throws_error( # assert self.assertTrue(x in str(context.exception)) - mock_get_client.return_value.query.assert_not_called() + self.mock_get_client.return_value.query.assert_not_called() get_mock_s3_resource_object( - mock_s3_resource + self.mock_s3_resource ).return_value.put.assert_not_called() - @patch("oss_ci_job_queue_time.lambda_function.get_aws_s3_resource") - @patch("oss_ci_job_queue_time.lambda_function.get_clickhouse_client") def test_local_run_with_dry_run_when_lambda_happy_flow_then_success_without_s3_write( - self, mock_get_client, mock_s3_resource + self, ): # prepare set_default_env_variables() - mock_s3_resource_put(mock_s3_resource) - mock_db_client(mock_get_client) + mock_s3_resource_put(self.mock_s3_resource) + mock_db_client(self.mock_get_client) # execute main() @@ -247,13 +254,13 @@ def test_local_run_with_dry_run_when_lambda_happy_flow_then_success_without_s3_w # assert # assert clickhouse client - mock_get_client.assert_called_once() - self.assertEqual(mock_get_client.return_value.query.call_count, 2) + self.mock_get_client.assert_called_once() + self.assertEqual(self.mock_get_client.return_value.query.call_count, 2) # assert s3 resource - mock_s3_resource.assert_called_once() + self.mock_s3_resource.assert_called_once() get_mock_s3_resource_object( - mock_s3_resource + self.mock_s3_resource ).return_value.put.assert_not_called() From 02881756418f63ecc508709b9b4fd9336aa0c0aa Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 18 Mar 2025 23:53:04 -0700 Subject: [PATCH 18/38] add tags --- .../oss_ci_job_queue_time/lambda_function.py | 54 ++++++++++--------- .../test_lambda_oss_ci_job_queue_time.py | 4 +- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py index 019bada68f..cd774f9fcd 100644 --- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py +++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py @@ -222,6 +222,11 @@ def update_tags( if not machine_type: continue tag_categories["all"].add(machine_type) + + if machine_type.startswith("linux.rocm.gpu"): + tag_categories["linux"].add(machine_type) + tag_categories["linux-amd"].add(machine_type) + if machine_type not in tag_categories["dynamic"]: if "ubuntu" in machine_type.lower(): tag_categories["linux"].add(machine_type) @@ -238,7 +243,7 @@ def create_tag_categorires( Create the tag_categorires, that are groups of runners with some common characteristics that we might find relevant to view them in a group instead of individually. 
""" - breakdowns = { + tag_dict = { "github": set(), # provided by github "pet": set(), # managed as pet instances "dynamic": set(), # managed as auto-scaling instances @@ -247,6 +252,7 @@ def create_tag_categorires( "linux": set(), # linux instances "linux-meta": set(), # linux instances provided by meta "linux-lf": set(), # linux instances provided by Linux Foundation + "linux-amd": set(), # linux instances provided by amd. for instance linux.rocm.gpu.2 "macos": set(), # macos instances "macos-meta": set(), # macos instances provided by meta "windows": set(), # windows instances @@ -268,8 +274,8 @@ def create_tag_categorires( "macos-14-arm64", "macos-14-xlarge", ) - breakdowns["github"].update(github_mac_runners) - breakdowns["macos"].update(github_mac_runners) + tag_dict["github"].update(github_mac_runners) + tag_dict["macos"].update(github_mac_runners) meta_pet_mac_runners = ( "macos-m1-12", @@ -280,54 +286,53 @@ def create_tag_categorires( "macos-m2-15", "macos-m2-max", ) - breakdowns["meta"].update(meta_pet_mac_runners) - breakdowns["macos"].update(meta_pet_mac_runners) - breakdowns["pet"].update(meta_pet_mac_runners) + tag_dict["meta"].update(meta_pet_mac_runners) + tag_dict["macos"].update(meta_pet_mac_runners) + tag_dict["pet"].update(meta_pet_mac_runners) meta_pet_nvidia = ( "linux.aws.a100", "linux.aws.h100", ) - breakdowns["meta"].update(meta_pet_nvidia) - breakdowns["linux"].update(meta_pet_nvidia) - breakdowns["linux-meta"].update(meta_pet_nvidia) - breakdowns["pet"].update(meta_pet_nvidia) - breakdowns["multi-tenant"].update(meta_pet_nvidia) + tag_dict["meta"].update(meta_pet_nvidia) + tag_dict["linux"].update(meta_pet_nvidia) + tag_dict["linux-meta"].update(meta_pet_nvidia) + tag_dict["pet"].update(meta_pet_nvidia) + tag_dict["multi-tenant"].update(meta_pet_nvidia) all_runners_configs = ( runner_configs["runner_types"] | lf_runner_configs["runner_types"] ) for runner, runner_config in all_runners_configs.items(): - breakdowns["dynamic"].add(runner) + tag_dict["dynamic"].add(runner) if "is_ephemeral" in runner_config and runner_config["is_ephemeral"]: - breakdowns["ephemeral"].add(runner) + tag_dict["ephemeral"].add(runner) else: - breakdowns["nonephemeral"].add(runner) + tag_dict["nonephemeral"].add(runner) if runner_config["os"].lower() == "linux": - breakdowns["linux"].add(runner) + tag_dict["linux"].add(runner) elif runner_config["os"].lower() == "windows": - breakdowns["windows"].add(runner) + tag_dict["windows"].add(runner) for runner, runner_config in runner_configs["runner_types"].items(): - breakdowns["meta"].add(runner) + tag_dict["meta"].add(runner) if runner_config["os"].lower() == "linux": - breakdowns["linux-meta"].add(runner) + tag_dict["linux-meta"].add(runner) elif runner_config["os"].lower() == "windows": - breakdowns["windows-meta"].add(runner) + tag_dict["windows-meta"].add(runner) for runner, runner_config in lf_runner_configs["runner_types"].items(): - breakdowns["lf"].add(runner) + tag_dict["lf"].add(runner) if runner_config["os"].lower() == "linux": - breakdowns["linux-lf"].add(runner) + tag_dict["linux-lf"].add(runner) elif runner_config["os"].lower() == "windows": - breakdowns["windows-lf"].add(runner) - - return breakdowns + tag_dict["windows-lf"].add(runner) + return tag_dict def get_runner_config( @@ -530,6 +535,7 @@ def proceses_job_queue_times_historical( for tag in tag_categories: if job["machine_type"] in tag_categories[tag]: job_tags.append(tag) + job_tags.append(job["machine_type"]) job["tags"] = job_tags key = 
f"job_queue_times_historical/{repo}/{timestamp}.txt" diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py index 46412d15a8..d19dd07205 100644 --- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py +++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py @@ -187,8 +187,8 @@ def test_lambda_handler_when_lambda_happy_flow_then_success(self): mock_s3_resource_put(self.mock_s3_resource) mock_db_client(self.mock_get_client) - expected_r1 = b'{"queue_s": 60000, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-1", "job_name": "job-name-1", "html_url": "runs/1/job/1", "machine_type": "linux.aws.h100", "time": 1742262372, "tags": ["pet", "linux", "linux-meta", "all", "meta", "multi-tenant", "other"]}\n' - expected_r2 = b'{"queue_s": 1400, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-2", "job_name": "job-name-2", "html_url": "runs/2/job/2", "machine_type": "linux.rocm.gpu.2", "time": 1742262372, "tags": ["all", "other"]}\n' + expected_r1 = b'{"queue_s": 60000, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-1", "job_name": "job-name-1", "html_url": "runs/1/job/1", "machine_type": "linux.aws.h100", "time": 1742262372, "tags": ["pet", "linux", "linux-meta", "all", "meta", "multi-tenant", "other", "linux.aws.h100"]}\n' + expected_r2 = b'{"queue_s": 1400, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-2", "job_name": "job-name-2", "html_url": "runs/2/job/2", "machine_type": "linux.rocm.gpu.2", "time": 1742262372, "tags": ["linux", "linux-amd", "all", "other", "linux.rocm.gpu.2"]}\n' expected_s3_body = expected_r1 + expected_r2 expect = gzip.compress(expected_s3_body) From d3f4cf7cab10fca81b5cf402b325b8d80441be65 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 18 Mar 2025 23:57:16 -0700 Subject: [PATCH 19/38] add tags --- aws/lambda/oss_ci_job_queue_time/lambda_function.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py index cd774f9fcd..b2feef02fa 100644 --- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py +++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py @@ -511,19 +511,18 @@ def proceses_job_queue_times_historical( if snapshot_time: timestamp = snapshot_time + # fetch jobs in queue at given snapshot time snapshot = self.snapshot_jobs_in_queue(timestamp, repo) if len(snapshot) == 0: info(f"No jobs in queue at time: {timestamp}") return + # create dictionary of tags with set of targeting machine types lf_runner_config = get_runner_config(lf_runner_config_retriever, timestamp) - if not lf_runner_config or not lf_runner_config["runner_types"]: lf_runner_config = get_runner_config( old_lf_lf_runner_config_retriever, timestamp ) - - # create dictionary of tags with set of targeting machine types tag_categories = create_tag_categorires( get_runner_config(meta_runner_config_retriever, timestamp), lf_runner_config ) From c80241a78822aba015417f993fbae8804907bb95 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 19 Mar 2025 00:09:45 -0700 Subject: [PATCH 20/38] add tags --- .../oss_ci_job_queue_time/lambda_function.py | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py index b2feef02fa..987e4fa5e1 100644 --- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py +++ 
b/aws/lambda/oss_ci_job_queue_time/lambda_function.py @@ -53,6 +53,8 @@ @lru_cache() def get_clickhouse_client(host: str, user: str, password: str) -> Any: + + # clickhouse_connect.get_client(host=host, user=user, password=password, secure=True, verify=False) return clickhouse_connect.get_client( host=host, user=user, password=password, secure=True, verify=False ) @@ -450,11 +452,12 @@ class QueueTimeProcessor: """ def __init__( - self, clickhouse_client: Any, s3_client: Any, is_dry_run: bool = False + self, clickhouse_client: Any, s3_client: Any, is_dry_run: bool = False, local_output: bool = False ) -> None: self.clickhouse_client = clickhouse_client self.s3_client = s3_client self.is_dry_run = is_dry_run + self.local_output = local_output and is_dry_run def process(self) -> None: github_access_token = os.getenv("GITHUB_ACCESS_TOKEN", "") @@ -540,10 +543,15 @@ def proceses_job_queue_times_historical( key = f"job_queue_times_historical/{repo}/{timestamp}.txt" if self.is_dry_run: info(f"[Dry Run Mode]: {len(snapshot)} records to S3 {_bucket_name}/{key}") - info(json.dumps(snapshot)) + if self.local_output: + file_name = f"job_queue_times_historical_snapshot_{timestamp}.json" + info(f"[Dry Run Mode]: local output to {file_name}.json") + with open(file_name, "w") as f: + f.write(json.dumps(snapshot)) + else: + info(json.dumps(snapshot)) return - print("Yang", snapshot) upload_to_s3_txt(self.s3_client, _bucket_name, key, snapshot) @@ -620,6 +628,17 @@ def parse_args() -> argparse.Namespace: default=os.getenv("CLICKHOUSE_PASSWORD", ""), help="the clickhouse password for the user name", ) + parser.add_argument( + "--github-access-token", + type=str, + default=os.getenv("GITHUB_ACCESS_TOKEN", ""), + help="the github access token to access github api", + ) + parser.add_argument( + "--local-output", + action="store_true", + help="when set, generate json result in local environment. this is only used for local test environment when dry-run is enabled", + ) parser.add_argument( "--not-dry-run", action="store_true", @@ -640,6 +659,7 @@ def main() -> None: os.environ["CLICKHOUSE_ENDPOINT"] = arguments.clickhouse_endpoint os.environ["CLICKHOUSE_USERNAME"] = arguments.clickhouse_username os.environ["CLICKHOUSE_PASSWORD"] = arguments.clickhouse_password + os.environ["GITHUB_ACCESS_TOKEN"] = arguments.github_access_token db_client = get_clickhouse_client_environment() s3_client = get_aws_s3_resource() @@ -647,7 +667,7 @@ def main() -> None: # always run in dry-run mode in local environment, unless it's disabled. 
is_dry_run = not arguments.not_dry_run - QueueTimeProcessor(db_client, s3_client, is_dry_run=is_dry_run).process() + QueueTimeProcessor(db_client, s3_client, is_dry_run=is_dry_run, local_output=arguments.local_output).process() if __name__ == "__main__": From ce3b8a55b6770b886ee49cfc053ad225ce98ef40 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 19 Mar 2025 00:12:45 -0700 Subject: [PATCH 21/38] add tags --- .github/workflows/tests.yml | 2 +- aws/lambda/tests/test_requirements.txt | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 aws/lambda/tests/test_requirements.txt diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 819f028e41..310db1a554 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -51,7 +51,7 @@ jobs: echo ::group::setup Python environment python -m venv .venv/ source .venv/bin/activate - pip install pip==23.0.1 pytest==7.2.0 boto3==1.35.33 clickhouse-connect==0.8.5 + pip install -r aws/lambda/tests/test_requirements.txt echo ::endgroup:: # Test aws lambda diff --git a/aws/lambda/tests/test_requirements.txt b/aws/lambda/tests/test_requirements.txt new file mode 100644 index 0000000000..87c33c2e7f --- /dev/null +++ b/aws/lambda/tests/test_requirements.txt @@ -0,0 +1,5 @@ +clickhouse_connect==0.8.5 +boto3==1.35.33 +PyGithub==1.59.0 +python-dateutil==2.8.2 +PyYAML==6.0.1 From 7eefe8de3de2bf86a05a38510a423b8e0312a1d6 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 19 Mar 2025 00:20:01 -0700 Subject: [PATCH 22/38] add tags --- .../oss_ci_job_queue_time/lambda_function.py | 95 ++++++++++--------- 1 file changed, 50 insertions(+), 45 deletions(-) diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py index 987e4fa5e1..c61036ae20 100644 --- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py +++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py @@ -53,7 +53,6 @@ @lru_cache() def get_clickhouse_client(host: str, user: str, password: str) -> Any: - # clickhouse_connect.get_client(host=host, user=user, password=password, secure=True, verify=False) return clickhouse_connect.get_client( host=host, user=user, password=password, secure=True, verify=False @@ -215,7 +214,7 @@ def explode_runner_variants( def update_tags( - tag_categories: Dict[str, Set[str]], machine_types: Iterable[str] + runner_labels: Dict[str, Set[str]], machine_types: Iterable[str] ) -> None: """ iterate through machine types from jobs, and update potential tags that it belongs to @@ -223,29 +222,29 @@ def update_tags( for machine_type in machine_types: if not machine_type: continue - tag_categories["all"].add(machine_type) + runner_labels["all"].add(machine_type) if machine_type.startswith("linux.rocm.gpu"): - tag_categories["linux"].add(machine_type) - tag_categories["linux-amd"].add(machine_type) + runner_labels["linux"].add(machine_type) + runner_labels["linux-amd"].add(machine_type) - if machine_type not in tag_categories["dynamic"]: + if machine_type not in runner_labels["dynamic"]: if "ubuntu" in machine_type.lower(): - tag_categories["linux"].add(machine_type) - tag_categories["github"].add(machine_type) + runner_labels["linux"].add(machine_type) + runner_labels["github"].add(machine_type) else: - tag_categories["other"].add(machine_type) + runner_labels["other"].add(machine_type) -def create_tag_categorires( +def create_runner_labels( runner_configs: Dict[str, Dict[str, Any]], lf_runner_configs: Dict[str, Dict[str, Any]], ) -> Dict[str, Set[str]]: """ - Create 
the tag_categorires, that are groups of runners with some common characteristics that we might find relevant + Create the runner_labels, that are groups of runners with some common characteristics that we might find relevant to view them in a group instead of individually. """ - tag_dict = { + runner_labels_dict = { "github": set(), # provided by github "pet": set(), # managed as pet instances "dynamic": set(), # managed as auto-scaling instances @@ -276,8 +275,8 @@ def create_tag_categorires( "macos-14-arm64", "macos-14-xlarge", ) - tag_dict["github"].update(github_mac_runners) - tag_dict["macos"].update(github_mac_runners) + runner_labels_dict["github"].update(github_mac_runners) + runner_labels_dict["macos"].update(github_mac_runners) meta_pet_mac_runners = ( "macos-m1-12", @@ -288,53 +287,53 @@ def create_tag_categorires( "macos-m2-15", "macos-m2-max", ) - tag_dict["meta"].update(meta_pet_mac_runners) - tag_dict["macos"].update(meta_pet_mac_runners) - tag_dict["pet"].update(meta_pet_mac_runners) + runner_labels_dict["meta"].update(meta_pet_mac_runners) + runner_labels_dict["macos"].update(meta_pet_mac_runners) + runner_labels_dict["pet"].update(meta_pet_mac_runners) meta_pet_nvidia = ( "linux.aws.a100", "linux.aws.h100", ) - tag_dict["meta"].update(meta_pet_nvidia) - tag_dict["linux"].update(meta_pet_nvidia) - tag_dict["linux-meta"].update(meta_pet_nvidia) - tag_dict["pet"].update(meta_pet_nvidia) - tag_dict["multi-tenant"].update(meta_pet_nvidia) + runner_labels_dict["meta"].update(meta_pet_nvidia) + runner_labels_dict["linux"].update(meta_pet_nvidia) + runner_labels_dict["linux-meta"].update(meta_pet_nvidia) + runner_labels_dict["pet"].update(meta_pet_nvidia) + runner_labels_dict["multi-tenant"].update(meta_pet_nvidia) all_runners_configs = ( runner_configs["runner_types"] | lf_runner_configs["runner_types"] ) for runner, runner_config in all_runners_configs.items(): - tag_dict["dynamic"].add(runner) + runner_labels_dict["dynamic"].add(runner) if "is_ephemeral" in runner_config and runner_config["is_ephemeral"]: - tag_dict["ephemeral"].add(runner) + runner_labels_dict["ephemeral"].add(runner) else: - tag_dict["nonephemeral"].add(runner) + runner_labels_dict["nonephemeral"].add(runner) if runner_config["os"].lower() == "linux": - tag_dict["linux"].add(runner) + runner_labels_dict["linux"].add(runner) elif runner_config["os"].lower() == "windows": - tag_dict["windows"].add(runner) + runner_labels_dict["windows"].add(runner) for runner, runner_config in runner_configs["runner_types"].items(): - tag_dict["meta"].add(runner) + runner_labels_dict["meta"].add(runner) if runner_config["os"].lower() == "linux": - tag_dict["linux-meta"].add(runner) + runner_labels_dict["linux-meta"].add(runner) elif runner_config["os"].lower() == "windows": - tag_dict["windows-meta"].add(runner) + runner_labels_dict["windows-meta"].add(runner) for runner, runner_config in lf_runner_configs["runner_types"].items(): - tag_dict["lf"].add(runner) + runner_labels_dict["lf"].add(runner) if runner_config["os"].lower() == "linux": - tag_dict["linux-lf"].add(runner) + runner_labels_dict["linux-lf"].add(runner) elif runner_config["os"].lower() == "windows": - tag_dict["windows-lf"].add(runner) - return tag_dict + runner_labels_dict["windows-lf"].add(runner) + return runner_labels_dict def get_runner_config( @@ -452,7 +451,11 @@ class QueueTimeProcessor: """ def __init__( - self, clickhouse_client: Any, s3_client: Any, is_dry_run: bool = False, local_output: bool = False + self, + clickhouse_client: Any, + s3_client: 
Any, + is_dry_run: bool = False, + local_output: bool = False, ) -> None: self.clickhouse_client = clickhouse_client self.s3_client = s3_client @@ -526,19 +529,20 @@ def proceses_job_queue_times_historical( lf_runner_config = get_runner_config( old_lf_lf_runner_config_retriever, timestamp ) - tag_categories = create_tag_categorires( + runner_labels = create_runner_labels( get_runner_config(meta_runner_config_retriever, timestamp), lf_runner_config ) - update_tags(tag_categories, set([job["machine_type"] for job in snapshot])) + update_tags(runner_labels, set([job["machine_type"] for job in snapshot])) # iterate throught jobs, and update tags for each job for job in snapshot: - job_tags = [] - for tag in tag_categories: - if job["machine_type"] in tag_categories[tag]: - job_tags.append(tag) - job_tags.append(job["machine_type"]) - job["tags"] = job_tags + job_labels = [] + for tag in runner_labels: + if job["machine_type"] in runner_labels[tag]: + job_labels.append(tag) + # add job's own machine type to runner labels + job_labels.append(job["machine_type"]) + job["runner_labels"] = job_labels key = f"job_queue_times_historical/{repo}/{timestamp}.txt" if self.is_dry_run: @@ -552,7 +556,6 @@ def proceses_job_queue_times_historical( info(json.dumps(snapshot)) return - upload_to_s3_txt(self.s3_client, _bucket_name, key, snapshot) def process_in_queue_jobs( @@ -667,7 +670,9 @@ def main() -> None: # always run in dry-run mode in local environment, unless it's disabled. is_dry_run = not arguments.not_dry_run - QueueTimeProcessor(db_client, s3_client, is_dry_run=is_dry_run, local_output=arguments.local_output).process() + QueueTimeProcessor( + db_client, s3_client, is_dry_run=is_dry_run, local_output=arguments.local_output + ).process() if __name__ == "__main__": From 70542e15fd5a8f80db6ef0128c5294f606f71fa7 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 19 Mar 2025 00:23:19 -0700 Subject: [PATCH 23/38] add tags --- aws/lambda/tests/test_requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aws/lambda/tests/test_requirements.txt b/aws/lambda/tests/test_requirements.txt index 87c33c2e7f..b6abeffc7b 100644 --- a/aws/lambda/tests/test_requirements.txt +++ b/aws/lambda/tests/test_requirements.txt @@ -3,3 +3,5 @@ boto3==1.35.33 PyGithub==1.59.0 python-dateutil==2.8.2 PyYAML==6.0.1 +pip==23.0.1 +pytest==7.2.0 From b5d2a49d66c6dc53727cf31ac60c4e00591be70f Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 19 Mar 2025 00:27:40 -0700 Subject: [PATCH 24/38] add tags --- aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py index d19dd07205..de7bfd1a93 100644 --- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py +++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py @@ -187,8 +187,8 @@ def test_lambda_handler_when_lambda_happy_flow_then_success(self): mock_s3_resource_put(self.mock_s3_resource) mock_db_client(self.mock_get_client) - expected_r1 = b'{"queue_s": 60000, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-1", "job_name": "job-name-1", "html_url": "runs/1/job/1", "machine_type": "linux.aws.h100", "time": 1742262372, "tags": ["pet", "linux", "linux-meta", "all", "meta", "multi-tenant", "other", "linux.aws.h100"]}\n' - expected_r2 = b'{"queue_s": 1400, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-2", "job_name": "job-name-2", "html_url": "runs/2/job/2", "machine_type": 
"linux.rocm.gpu.2", "time": 1742262372, "tags": ["linux", "linux-amd", "all", "other", "linux.rocm.gpu.2"]}\n' + expected_r1 = b'{"queue_s": 60000, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-1", "job_name": "job-name-1", "html_url": "runs/1/job/1", "machine_type": "linux.aws.h100", "time": 1742262372, "runner_labels": ["pet", "linux", "linux-meta", "all", "meta", "multi-tenant", "other", "linux.aws.h100"]}\n' + expected_r2 = b'{"queue_s": 1400, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-2", "job_name": "job-name-2", "html_url": "runs/2/job/2", "machine_type": "linux.rocm.gpu.2", "time": 1742262372, "runner_labels": ["linux", "linux-amd", "all", "other", "linux.rocm.gpu.2"]}\n' expected_s3_body = expected_r1 + expected_r2 expect = gzip.compress(expected_s3_body) From c3a63527b9264fe4d33abd0708f5e56b195c70e8 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 19 Mar 2025 00:29:52 -0700 Subject: [PATCH 25/38] add tags --- .../schema.sql | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 clickhouse_db_schema/oss_ci_job_queue_time_histogram/schema.sql diff --git a/clickhouse_db_schema/oss_ci_job_queue_time_histogram/schema.sql b/clickhouse_db_schema/oss_ci_job_queue_time_histogram/schema.sql new file mode 100644 index 0000000000..a310278d35 --- /dev/null +++ b/clickhouse_db_schema/oss_ci_job_queue_time_histogram/schema.sql @@ -0,0 +1,30 @@ + -- This table is used to store queue time histogram +CREATE TABLE misc.oss_ci_queue_time_histogram( + -- the type of histogram, currently we store two types of histogram: + -- 'in-queue-histogram','completed-queue-histogram' + `type` String, + `repo` String DEFAULT 'pytorch/pytorch', + `workflow_name` String, + `job_name` String, + `machine_type` String, + `histogram_version` String, + `histogram` Array(UInt64), + `max_queue_time` UInt64, + `avg_queue_time` UInt64, + `total_count` UInt64, + `time` DateTime64(9), + `runner_labels` Array(String), + `extra_info` Map(String,String) +) +ENGINE = SharedMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}') +PARTITION BY toYYYYMM(time) +ORDER BY ( + type, + repo, + time, + machine_type, + job_name, + workflow_name, +) +TTL toDate(time) + toIntervalYear(5) +SETTINGS index_granularity = 8192 From 8c24747249167b5561076f9c27da719b21d36ce8 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 19 Mar 2025 00:33:05 -0700 Subject: [PATCH 26/38] ares --- .../schema.sql | 30 ------------------- 1 file changed, 30 deletions(-) delete mode 100644 clickhouse_db_schema/oss_ci_job_queue_time_histogram/schema.sql diff --git a/clickhouse_db_schema/oss_ci_job_queue_time_histogram/schema.sql b/clickhouse_db_schema/oss_ci_job_queue_time_histogram/schema.sql deleted file mode 100644 index a310278d35..0000000000 --- a/clickhouse_db_schema/oss_ci_job_queue_time_histogram/schema.sql +++ /dev/null @@ -1,30 +0,0 @@ - -- This table is used to store queue time histogram -CREATE TABLE misc.oss_ci_queue_time_histogram( - -- the type of histogram, currently we store two types of histogram: - -- 'in-queue-histogram','completed-queue-histogram' - `type` String, - `repo` String DEFAULT 'pytorch/pytorch', - `workflow_name` String, - `job_name` String, - `machine_type` String, - `histogram_version` String, - `histogram` Array(UInt64), - `max_queue_time` UInt64, - `avg_queue_time` UInt64, - `total_count` UInt64, - `time` DateTime64(9), - `runner_labels` Array(String), - `extra_info` Map(String,String) -) -ENGINE = SharedMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}') -PARTITION 
BY toYYYYMM(time) -ORDER BY ( - type, - repo, - time, - machine_type, - job_name, - workflow_name, -) -TTL toDate(time) + toIntervalYear(5) -SETTINGS index_granularity = 8192 From a60834e0ea58eaab6b38f02e07887aa5e0b3faa9 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 19 Mar 2025 00:33:32 -0700 Subject: [PATCH 27/38] typo --- torchci/pages/api/clickhouse/[queryName].ts | 1 - 1 file changed, 1 deletion(-) diff --git a/torchci/pages/api/clickhouse/[queryName].ts b/torchci/pages/api/clickhouse/[queryName].ts index 01c4f0f51d..e0461e5982 100644 --- a/torchci/pages/api/clickhouse/[queryName].ts +++ b/torchci/pages/api/clickhouse/[queryName].ts @@ -10,6 +10,5 @@ export default async function handler( queryName, JSON.parse(req.query.parameters as string) ); - res.status(200).json(response); } From 2f472982e14a411a2e7ad911cb447e5a5537a2dc Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 19 Mar 2025 02:07:29 -0700 Subject: [PATCH 28/38] typo --- .../oss_ci_job_queue_time/lambda_function.py | 58 +++++++++++-------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py index c61036ae20..b17b24f2e3 100644 --- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py +++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py @@ -472,13 +472,37 @@ def process(self) -> None: lf_runner_config_retriever, old_lf_lf_runner_config_retriever, ) = get_config_retrievers(github_access_token) - self.proceses_job_queue_times_historical( + + + # by default, we use current time as snapshot + timestamp = str(int(datetime.now().timestamp())) + + snapshot = self.get_jobs_in_queue_snapshot( meta_runner_config_retriever, lf_runner_config_retriever, old_lf_lf_runner_config_retriever, + timestamp, + "pytorch/pytorch", ) - def snapshot_jobs_in_queue( + # TODO(elainewy): add logic to generate histograms based on the snapshot + self.output(snapshot, timestamp, "pytorch/pytorch") + + def output(self, snapshot: List[Dict[str, Any]], timestamp: str, repo:str ='pytorch/pytorch') -> None: + # key = f"job_queue_times_histogram/{repo}/{timestamp}.txt" + if self.is_dry_run: + info(f"[Dry Run Mode]: {len(snapshot)} records to S3 {_bucket_name}/{key}") + if self.local_output: + file_name = f"job_queue_times_snapshot_{timestamp}.json" + info(f"[Dry Run Mode]: local output to {file_name}.json") + with open(file_name, "w") as f: + f.write(json.dumps(snapshot)) + + info(json.dumps(snapshot)) + return + # upload_to_s3_txt(self.s3_client, _bucket_name, key, snapshot) + + def query_queueing_jobs( self, timestamp: str = "", repo: str = "pytorch/pytorch" ) -> List[Dict[str, Any]]: # in given snapshot time, fetches jobs that were in queue but not being picked up by workers @@ -504,24 +528,20 @@ def snapshot_jobs_in_queue( result = jobs_in_queue + jobs_pick return result - def proceses_job_queue_times_historical( + def get_jobs_in_queue_snapshot( self, meta_runner_config_retriever, lf_runner_config_retriever, old_lf_lf_runner_config_retriever, - snapshot_time: str = "", + timestamp: str, repo: str = "pytorch/pytorch", - ) -> None: - # by default, we use current time as snapshot - timestamp = str(int(datetime.now().timestamp())) - if snapshot_time: - timestamp = snapshot_time + ) -> List[Dict[str, Any]]: - # fetch jobs in queue at given snapshot time - snapshot = self.snapshot_jobs_in_queue(timestamp, repo) + # fetches jobs in queue at given snapshot time from db + snapshot = self.query_queueing_jobs(timestamp, repo) if len(snapshot) 
== 0:
             info(f"No jobs in queue at time: {timestamp}")
-            return
+            return []
 
         # create dictionary of tags with set of targeting machine types
         lf_runner_config = get_runner_config(lf_runner_config_retriever, timestamp)
@@ -544,19 +564,7 @@ def proceses_job_queue_times_historical(
                 job_labels.append(job["machine_type"])
             job["runner_labels"] = job_labels
 
-        key = f"job_queue_times_historical/{repo}/{timestamp}.txt"
-        if self.is_dry_run:
-            info(f"[Dry Run Mode]: {len(snapshot)} records to S3 {_bucket_name}/{key}")
-            if self.local_output:
-                file_name = f"job_queue_times_historical_snapshot_{timestamp}.json"
-                info(f"[Dry Run Mode]: local output to {file_name}.json")
-                with open(file_name, "w") as f:
-                    f.write(json.dumps(snapshot))
-            else:
-                info(json.dumps(snapshot))
-            return
-
-        upload_to_s3_txt(self.s3_client, _bucket_name, key, snapshot)
+        return snapshot
 
     def process_in_queue_jobs(
         self, queryStr: str, parameters: Any

From a6b8113c9b21de880d1bb763bcdba666bef42f3e Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 02:18:57 -0700
Subject: [PATCH 29/38] typo

---
 .../oss_ci_job_queue_time/lambda_function.py  | 20 +++++++++----------
 .../test_lambda_oss_ci_job_queue_time.py      |  7 +------
 2 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index b17b24f2e3..69144e0bd8 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -473,7 +473,6 @@ def process(self) -> None:
         old_lf_lf_runner_config_retriever,
     ) = get_config_retrievers(github_access_token)
 
-
         # by default, we use current time as snapshot
         timestamp = str(int(datetime.now().timestamp()))
 
@@ -485,22 +484,24 @@ def process(self) -> None:
             "pytorch/pytorch",
         )
 
-        # TODO(elainewy): add logic to generate histograms based on the snapshot
-        self.output(snapshot, timestamp, "pytorch/pytorch")
+        self.output_snapshot(snapshot)
+        # TODO(elainewy): add logic to generate histograms based on the snapshot
 
-    def output(self, snapshot: List[Dict[str, Any]], timestamp: str, repo: str = 'pytorch/pytorch') -> None:
-        # key = f"job_queue_times_histogram/{repo}/{timestamp}.txt"
+    def output_snapshot(
+        self,
+        snapshot: List[Dict[str, Any]],
+    ) -> None:
         if self.is_dry_run:
-            info(f"[Dry Run Mode]: {len(snapshot)} records to S3 {_bucket_name}/{key}")
+            info(
+                f"[Dry Run Mode]: generated {len(snapshot)} records from get_jobs_in_queue_snapshot"
+            )
             if self.local_output:
                 file_name = f"job_queue_times_snapshot_{timestamp}.json"
                 info(f"[Dry Run Mode]: local output to {file_name}.json")
                 with open(file_name, "w") as f:
                     f.write(json.dumps(snapshot))
-            info(json.dumps(snapshot))
-            return
+            info(json.dumps(snapshot))
+            return
 
     def query_queueing_jobs(
         self, timestamp: str = "", repo: str = "pytorch/pytorch"
     ) -> List[Dict[str, Any]]:
@@ -523,7 +524,7 @@ def query_queueing_jobs(
         )
 
         info(
-            f"[Snapshot time:{datetime_str}]. Found {len(jobs_in_queue)} jobs in queue, and {len(jobs_pick)} jobs were in queue but were picked up by workers"
+            f"[Snapshot time:{datetime_str}]. 
Found {len(jobs_in_queue)} jobs in queue, and {len(jobs_pick)} jobs were in queue but were picked up by runners"
         )
         result = jobs_in_queue + jobs_pick
         return result
@@ -536,7 +537,6 @@ def get_jobs_in_queue_snapshot(
         timestamp: str,
         repo: str = "pytorch/pytorch",
     ) -> List[Dict[str, Any]]:
-        # fetches jobs in queue at given snapshot time from db
         snapshot = self.query_queueing_jobs(timestamp, repo)
         if len(snapshot) == 0:
diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
index de7bfd1a93..98711d35c0 100644
--- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
+++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
@@ -205,12 +205,7 @@ def test_lambda_handler_when_lambda_happy_flow_then_success(self):
         self.mock_s3_resource.assert_called_once()
         get_mock_s3_resource_object(
             self.mock_s3_resource
-        ).return_value.put.assert_called_once()
-        get_mock_s3_resource_object(
-            self.mock_s3_resource
-        ).return_value.put.assert_called_once_with(
-            Body=expect, ContentEncoding="gzip", ContentType="text/plain"
-        )
+        ).return_value.put.assert_not_called()
 
     def test_lambda_handler_when_missing_required_env_vars_then_throws_error(self):
         test_cases = [

From aa1d08c0136d0f5778c6277793eae9a77026c3d9 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 02:22:24 -0700
Subject: [PATCH 30/38] typo

---
 .../oss_ci_job_queue_time/lambda_function.py | 29 ++++++++++---------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index 69144e0bd8..e4ddfe25b9 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -484,24 +484,27 @@ def process(self) -> None:
             "pytorch/pytorch",
         )
 
-        self.output_snapshot(snapshot)
+        if self.is_dry_run:
+            self.output_snapshot(snapshot)
         # TODO(elainewy): add logic to generate histograms based on the snapshot results
 
     def output_snapshot(
         self,
         snapshot: List[Dict[str, Any]],
     ) -> None:
-        if self.is_dry_run:
-            info(
-                f"[Dry Run Mode]: generated {len(snapshot)} records from get_jobs_in_queue_snapshot"
-            )
-            if self.local_output:
-                file_name = f"job_queue_times_snapshot_{timestamp}.json"
-                info(f"[Dry Run Mode]: local output to {file_name}.json")
-                with open(file_name, "w") as f:
-                    f.write(json.dumps(snapshot))
-            info(json.dumps(snapshot))
-            return
+        """
+        print the snapshot to local file or terminal for local testing
+        """
+        info(
+            f"[Dry Run Mode]: generated {len(snapshot)} records from get_jobs_in_queue_snapshot"
+        )
+        if self.local_output:
+            file_name = f"job_queue_times_snapshot_{timestamp}.json"
+            info(f"[Dry Run Mode]: local output to {file_name}.json")
+            with open(file_name, "w") as f:
+                f.write(json.dumps(snapshot))
+        info(json.dumps(snapshot))
+        return

From b8a1086d87d175fb9641c927ec093a44f56ae4ab Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 02:24:09 -0700
Subject: [PATCH 31/38] typo

---
 aws/lambda/oss_ci_job_queue_time/lambda_function.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index e4ddfe25b9..dbd0e73738 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ 
b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -484,17 +484,20 @@ def process(self) -> None:
             "pytorch/pytorch",
         )
 
-        if self.is_dry_run:
-            self.output_snapshot(snapshot)
+        self.output_snapshot(snapshot,timestamp)
         # TODO(elainewy): add logic to generate histograms based on the snapshot results
 
     def output_snapshot(
         self,
         snapshot: List[Dict[str, Any]],
+        timestamp: str,
     ) -> None:
         """
-        print the snapshot to local file or terminal for local testing
+        print the snapshot to local file or terminal for local testing only
         """
+        if not self.is_dry_run:
+            return
+
         info(
             f"[Dry Run Mode]: generated {len(snapshot)} records from get_jobs_in_queue_snapshot"
         )

From e91f959b8f7369fc71efb29cc540991dcc8dc38c Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 02:25:36 -0700
Subject: [PATCH 32/38] typo

---
 aws/lambda/oss_ci_job_queue_time/lambda_function.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index dbd0e73738..8158ed6b91 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -473,7 +473,7 @@ def process(self) -> None:
         old_lf_lf_runner_config_retriever,
     ) = get_config_retrievers(github_access_token)
 
-        # by default, we use current time as snapshot
+        # use current time as snapshot time
         timestamp = str(int(datetime.now().timestamp()))
 
         snapshot = self.get_jobs_in_queue_snapshot(

From 6b3b8890552ccd69eff5d185891ef20931b4780d Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 02:27:23 -0700
Subject: [PATCH 33/38] typo

---
 aws/lambda/oss_ci_job_queue_time/lambda_function.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index 8158ed6b91..91d4d70fe9 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -418,7 +418,6 @@ def get_query_statement_for_queueing_jobs(time: str, repo: str = "pytorch/pytorc
     }
     return query, parameters
 
-
 def get_config_retrievers(github_access_token: str) -> Tuple[Any, Any, Any]:
     auth = Auth.Token(github_access_token)
     test_infra_repo = Github(auth=auth).get_repo("pytorch/test-infra")
@@ -440,7 +439,6 @@ def get_config_retrievers(github_access_token: str) -> Tuple[Any, Any, Any]:
         old_lf_lf_runner_config_retriever,
     )
 
-
 class QueueTimeProcessor:
     """
    this class is used to handle OSS CI queue time data aggregations. 
Currently it fetches in-queue jobs from ClickHouse at the current time

From 4d8440fd1efe1f941dfb7573b470c25a8106622f Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 02:41:35 -0700
Subject: [PATCH 34/38] reform code

---
 .../oss_ci_job_queue_time/lambda_function.py | 193 +++++++++---------
 1 file changed, 101 insertions(+), 92 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index 91d4d70fe9..5c9312caf8 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -22,8 +22,8 @@
 logging.basicConfig(level=logging.INFO)
 
-
 _bucket_name = "ossci-raw-job-status"
+
 _in_queue_job_select_statement = """
     SELECT
         DATE_DIFF(
@@ -53,9 +53,11 @@
 @lru_cache()
 def get_clickhouse_client(host: str, user: str, password: str) -> Any:
+    # for local testing only, disable SSL verification
+    # clickhouse_connect.get_client(host=host, user=user, password=password, secure=True, verify=False)
+
     return clickhouse_connect.get_client(
-        host=host, user=user, password=password, secure=True, verify=False
+        host=host, user=user, password=password, secure=True
     )
 
@@ -345,79 +347,6 @@ def get_runner_config(
     return {"runner_types": {}}
 
-def get_query_statement_for_picked_up_job(time: str, repo: str = "pytorch/pytorch"):
-    """
-    this query is used to get jobs that were in queue in given snapshot time, but were picked up by workers later
-    """
-    s1 = """
-    WITH possible_queued_jobs AS (
-        SELECT
-            id,
-            run_id,
-            started_at,
-            created_at
-        FROM default.workflow_job -- FINAL not needed since we just use this to filter a table that has already been FINALed
-        WHERE
-            started_at > ({timestamp:DateTime})
-            AND created_at < ({timestamp:DateTime} - INTERVAL 5 MINUTE)
-            AND created_at > ({timestamp:DateTime} - INTERVAL 1 WEEK)
-    )"""
-
-    s2 = """
-    WHERE
-        job.id IN (SELECT id FROM possible_queued_jobs)
-        AND workflow.id IN (SELECT run_id FROM possible_queued_jobs)
-        AND workflow.repository.'full_name' = {repo:String}
-        AND job.status = 'completed'
-        AND LENGTH(job.steps) != 0
-        AND workflow.status = 'completed'
-    ORDER BY
-        queue_s DESC
-    """
-    query = s1 + _in_queue_job_select_statement + s2
-    parameters = {
-        "timestamp": time,
-        "repo": repo,
-    }
-    return query, parameters
-
-
-def get_query_statement_for_queueing_jobs(time: str, repo: str = "pytorch/pytorch"):
-    """
-    this query is used to get jobs that were in queue in given snapshot time, and not being picked up by workers
-    """
-    s1 = """
-    WITH possible_queued_jobs AS (
-        SELECT
-            id,
-            run_id,
-            started_at,
-            created_at
-        FROM default.workflow_job -- FINAL not needed since we just use this to filter a table that has already been FINALed
-        WHERE
-            status = 'queued'
-            AND created_at < ({timestamp:DateTime} - INTERVAL 5 MINUTE)
-            AND created_at > ({timestamp:DateTime} - INTERVAL 1 WEEK)
-    )
-    """
-    s2 = """
-    WHERE
-        job.id IN (SELECT id FROM possible_queued_jobs)
-        AND workflow.id IN (SELECT run_id FROM possible_queued_jobs)
-        AND workflow.repository.'full_name' = {repo:String}
-        AND job.status = 'queued'
-        AND LENGTH(job.steps) = 0
-        AND workflow.status != 'completed'
-    ORDER BY
-        queue_s DESC
-    """
-    query = s1 + _in_queue_job_select_statement + s2
-    parameters = {
-        "timestamp": time,
-        "repo": repo,
-    }
-    return query, parameters
-
 def get_config_retrievers(github_access_token: str) -> Tuple[Any, Any, Any]:
     auth = Auth.Token(github_access_token)
     test_infra_repo = Github(auth=auth).get_repo("pytorch/test-infra")
@@ -439,6 
+368,7 @@ def get_config_retrievers(github_access_token: str) -> Tuple[Any, Any, Any]:
         old_lf_lf_runner_config_retriever,
     )
 
+
 class QueueTimeProcessor:
     """
    this class is used to handle OSS CI queue time data aggregations. Currently it fetches in-queue jobs from ClickHouse at the current time
@@ -465,6 +395,7 @@ def process(self) -> None:
         if not github_access_token:
             raise ValueError("Missing environment variable GITHUB_ACCESS_TOKEN")
 
+        # get runner config retrievers
         (
             meta_runner_config_retriever,
             lf_runner_config_retriever,
@@ -474,7 +405,7 @@ def process(self) -> None:
         # use current time as snapshot time
         timestamp = str(int(datetime.now().timestamp()))
 
-        snapshot = self.get_jobs_in_queue_snapshot(
+        snapshot = self.get_queueing_jobs_snapshot(
             meta_runner_config_retriever,
             lf_runner_config_retriever,
             old_lf_lf_runner_config_retriever,
@@ -482,7 +413,7 @@ def process(self) -> None:
             "pytorch/pytorch",
         )
 
-        self.output_snapshot(snapshot,timestamp)
+        self.output_snapshot(snapshot, timestamp)
         # TODO(elainewy): add logic to generate histograms based on the snapshot results
 
     def output_snapshot(
@@ -507,33 +438,33 @@ def output_snapshot(
         info(json.dumps(snapshot))
         return
 
-    def query_queueing_jobs(
+    def _fetch_snapshot_from_db(
         self, timestamp: str = "", repo: str = "pytorch/pytorch"
     ) -> List[Dict[str, Any]]:
         # in given snapshot time, fetches jobs that were in queue but not being picked up by workers
-        queued_query, queued_parameters = get_query_statement_for_queueing_jobs(
+        queued_query, queued_parameters = self.get_query_statement_for_queueing_jobs(
             timestamp, repo
         )
-        jobs_in_queue = self.process_in_queue_jobs(queued_query, queued_parameters)
+        jobs_in_queue = self._query_in_queue_jobs(queued_query, queued_parameters)
 
-        # in queue in given snapshot time, fetches jobs that were in queue but were picked up by workers after the given snapshot time
-        # this happens when the snapshot time is not the latest timestamp
-        picked_query, picked_params = get_query_statement_for_picked_up_job(
+        # in given snapshot time, fetches jobs that were in queue but were picked up by workers after the given snapshot time
+        # this happens when the snapshot time is not the latest timestamp
+        picked_query, picked_params = self.get_query_statement_for_picked_up_job(
             timestamp, repo
         )
-        jobs_pick = self.process_in_queue_jobs(picked_query, picked_params)
+        jobs_pick = self._query_in_queue_jobs(picked_query, picked_params)
 
         datetime_str = datetime.fromtimestamp(int(timestamp)).strftime(
             "%Y-%m-%d %H:%M:%S"
         )
         info(
-            f"[Snapshot time:{datetime_str}]. Found {len(jobs_in_queue)} jobs in queue, and {len(jobs_pick)} jobs were in queue but were picked up by runners"
+            f"[Snapshot time:{datetime_str}]. 
Found {len(jobs_in_queue)} jobs in queue, and {len(jobs_pick)} jobs were in queue but were picked up by runners later"
         )
         result = jobs_in_queue + jobs_pick
         return result
 
-    def get_jobs_in_queue_snapshot(
+    def get_queueing_jobs_snapshot(
         self,
         meta_runner_config_retriever,
         lf_runner_config_retriever,
@@ -541,8 +472,12 @@ def get_jobs_in_queue_snapshot(
         timestamp: str,
         repo: str = "pytorch/pytorch",
     ) -> List[Dict[str, Any]]:
-        # fetches jobs in queue at given snapshot time from db
-        snapshot = self.query_queueing_jobs(timestamp, repo)
+        """
+        this method is used to fetch jobs that were in queue at the given snapshot time
+        """
+
+        # fetches queued jobs at given snapshot time from db
+        snapshot = self._fetch_snapshot_from_db(timestamp, repo)
         if len(snapshot) == 0:
             info(f"No jobs in queue at time: {timestamp}")
             return []
@@ -558,19 +493,17 @@ def get_queueing_jobs_snapshot(
         )
         update_tags(runner_labels, set([job["machine_type"] for job in snapshot]))
 
-        # iterate through jobs, and update tags for each job
+        # iterates through jobs, and updates tags for each job
         for job in snapshot:
             job_labels = []
             for tag in runner_labels:
                 if job["machine_type"] in runner_labels[tag]:
                     job_labels.append(tag)
-            # add job's own machine type to runner labels
-            job_labels.append(job["machine_type"])
             job["runner_labels"] = job_labels
 
         return snapshot
 
-    def process_in_queue_jobs(
+    def _query_in_queue_jobs(
         self, queryStr: str, parameters: Any
     ) -> list[dict[str, Any]]:
         """
@@ -607,6 +540,82 @@ def _to_query_result_dict(
             li.append(record)
         return li
 
+    def get_query_statement_for_picked_up_job(
+        self, time: str, repo: str = "pytorch/pytorch"
+    ):
+        """
+        this query is used to get jobs that were in queue in given snapshot time, but were picked up by workers later
+        """
+        s1 = """
+        WITH possible_queued_jobs AS (
+            SELECT
+                id,
+                run_id,
+                started_at,
+                created_at
+            FROM default.workflow_job -- FINAL not needed since we just use this to filter a table that has already been FINALed
+            WHERE
+                started_at > ({timestamp:DateTime})
+                AND created_at < ({timestamp:DateTime} - INTERVAL 5 MINUTE)
+                AND created_at > ({timestamp:DateTime} - INTERVAL 1 WEEK)
+        )"""
+
+        s2 = """
+        WHERE
+            job.id IN (SELECT id FROM possible_queued_jobs)
+            AND workflow.id IN (SELECT run_id FROM possible_queued_jobs)
+            AND workflow.repository.'full_name' = {repo:String}
+            AND job.status = 'completed'
+            AND LENGTH(job.steps) != 0
+            AND workflow.status = 'completed'
+        ORDER BY
+            queue_s DESC
+        """
+        query = s1 + _in_queue_job_select_statement + s2
+        parameters = {
+            "timestamp": time,
+            "repo": repo,
+        }
+        return query, parameters
+
+    def get_query_statement_for_queueing_jobs(
+        self, time: str, repo: str = "pytorch/pytorch"
+    ):
+        """
+        this query is used to get jobs that were in queue in given snapshot time, and not being picked up by workers
+        """
+        s1 = """
+        WITH possible_queued_jobs AS (
+            SELECT
+                id,
+                run_id,
+                started_at,
+                created_at
+            FROM default.workflow_job -- FINAL not needed since we just use this to filter a table that has already been FINALed
+            WHERE
+                status = 'queued'
+                AND created_at < ({timestamp:DateTime} - INTERVAL 5 MINUTE)
+                AND created_at > ({timestamp:DateTime} - INTERVAL 1 WEEK)
+        )
+        """
+        s2 = """
+        WHERE
+            job.id IN (SELECT id FROM possible_queued_jobs)
+            AND workflow.id IN (SELECT run_id FROM possible_queued_jobs)
+            AND workflow.repository.'full_name' = {repo:String}
+            AND job.status = 'queued'
+            AND LENGTH(job.steps) = 0
+            AND workflow.status != 'completed'
+        ORDER BY
+            queue_s DESC
+        """
+        query = s1 + _in_queue_job_select_statement + s2
+        parameters = {
+            "timestamp": time,
+            "repo": repo,
+        }
+        return query, parameters
+
 
 def lambda_handler(event: Any, context: Any) -> None:
     """
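A note on the query construction that patch 34 moves into the class: clickhouse_connect binds the {timestamp:DateTime} and {repo:String} placeholders server-side, so the epoch string produced by str(int(datetime.now().timestamp())) can be passed through as a parameter rather than interpolated into the SQL. A minimal sketch of how the (query, parameters) pair returned above is consumed; the host, credentials, and processor instance here are hypothetical, not part of the patch:

    import clickhouse_connect

    # placeholder connection values, for illustration only
    client = clickhouse_connect.get_client(
        host="your-clickhouse-endpoint", user="default", password="...", secure=True
    )

    # processor is assumed to be an already-constructed QueueTimeProcessor
    query, parameters = processor.get_query_statement_for_queueing_jobs(
        "1742400000", "pytorch/pytorch"
    )
    # parameters is the {"timestamp": ..., "repo": ...} dict built above;
    # clickhouse_connect substitutes it into the {name:Type} placeholders
    result = client.query(query, parameters=parameters)
    print(result.column_names, len(result.result_rows))
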
From 3698aa62790f9744c2c4ce479a4087f2bc22f8d2 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 02:43:37 -0700
Subject: [PATCH 35/38] comment

---
 aws/lambda/oss_ci_job_queue_time/lambda_function.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index 5c9312caf8..df0da3f505 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -6,7 +6,6 @@
 import os
 import gzip
 import threading
-import dateutil.parser
 
 import yaml
 import boto3  # type: ignore[import]
@@ -54,7 +53,7 @@
 @lru_cache()
 def get_clickhouse_client(host: str, user: str, password: str) -> Any:
     # for local testing only, disable SSL verification
-    # clickhouse_connect.get_client(host=host, user=user, password=password, secure=True, verify=False)
+    # return clickhouse_connect.get_client(host=host, user=user, password=password, secure=True, verify=False)
 
     return clickhouse_connect.get_client(
         host=host, user=user, password=password, secure=True
@@ -182,7 +181,7 @@ def _fetch_until_timestamp(self, timestamp: datetime) -> Optional[str]:
 
         return self._find_earliest_after_in_cache(timestamp)
 
-    def _fetch_content_for_commit(self, commit: any) -> str:
+    def _fetch_content_for_commit(self, commit: Any) -> str:
         if commit.sha not in self._content_cache:
             print(
                 f"Fetching content for {self.repo} : {self.path} at {commit.commit.author.date} - {commit.sha}"

From 1d9c8475d47146b5005c33da3c51fbc54fb304ac Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 15:46:25 -0700
Subject: [PATCH 36/38] comment

---
 .../oss_ci_job_queue_time/lambda_function.py | 56 ++++++++++++-------
 .../test_lambda_oss_ci_job_queue_time.py     | 33 ++++++-----
 2 files changed, 55 insertions(+), 34 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index df0da3f505..9e435e5ebb 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -19,10 +19,16 @@
 from github import Github, Auth
 from dateutil.parser import parse
 
+ENVS = {
+    "GITHUB_ACCESS_TOKEN": os.getenv("GITHUB_ACCESS_TOKEN", ""),
+    "CLICKHOUSE_ENDPOINT": os.getenv("CLICKHOUSE_ENDPOINT", ""),
+    "CLICKHOUSE_PASSWORD": os.getenv("CLICKHOUSE_PASSWORD", ""),
+    "CLICKHOUSE_USERNAME": os.getenv("CLICKHOUSE_USERNAME", ""),
+}
+
 logging.basicConfig(level=logging.INFO)
 
 _bucket_name = "ossci-raw-job-status"
-
 _in_queue_job_select_statement = """
     SELECT
         DATE_DIFF(
@@ -66,14 +72,15 @@ def get_aws_s3_resource() -> Any:
 
 
 def get_clickhouse_client_environment() -> Any:
-    for env in ["CLICKHOUSE_ENDPOINT", "CLICKHOUSE_USERNAME", "CLICKHOUSE_PASSWORD"]:
-        if not os.getenv(env):
-            raise ValueError(f"Missing environment variable {env}")
+    info(f"Getting environment variables {list(ENVS.keys())}")
+    for name, env_val in ENVS.items():
+        if not env_val:
+            raise ValueError(f"Missing environment variable {name}")
 
     return get_clickhouse_client(
-        host=os.getenv("CLICKHOUSE_ENDPOINT"),
-        user=os.getenv("CLICKHOUSE_USERNAME"),
-        password=os.getenv("CLICKHOUSE_PASSWORD"),
+        host=ENVS["CLICKHOUSE_ENDPOINT"],
+        user=ENVS["CLICKHOUSE_USERNAME"],
+        password=ENVS["CLICKHOUSE_PASSWORD"],
     )
 
@@ -381,6 +388,7 @@ def __init__(
         self,
        clickhouse_client: Any,
         s3_client: Any,
+        github_access_token: str = "",
         is_dry_run: bool = False,
         local_output: bool = False,
     ) -> None:
@@ -389,17 +397,17 @@ def __init__(
         self.is_dry_run = is_dry_run
         self.local_output = local_output and is_dry_run
 
-    def process(self) -> None:
-        github_access_token = os.getenv("GITHUB_ACCESS_TOKEN", "")
         if not github_access_token:
             raise ValueError("Missing environment variable GITHUB_ACCESS_TOKEN")
+        self.github_access_token = github_access_token
 
+    def process(self) -> None:
         # get runner config retrievers
         (
             meta_runner_config_retriever,
             lf_runner_config_retriever,
             old_lf_lf_runner_config_retriever,
-        ) = get_config_retrievers(github_access_token)
+        ) = get_config_retrievers(self.github_access_token)
@@ -623,7 +631,9 @@ def lambda_handler(event: Any, context: Any) -> None:
     db_client = get_clickhouse_client_environment()
     s3_client = get_aws_s3_resource()
 
-    QueueTimeProcessor(db_client, s3_client).process()
+    QueueTimeProcessor(
+        db_client, s3_client, github_access_token=ENVS["GITHUB_ACCESS_TOKEN"]
+    ).process()
 
     return
 
@@ -635,26 +645,26 @@ def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--clickhouse-endpoint",
-        default=os.getenv("CLICKHOUSE_ENDPOINT", ""),
+        default=ENVS["CLICKHOUSE_ENDPOINT"],
         type=str,
         help="the ClickHouse endpoint; the full URL is https://{clickhouse_endpoint}:{port}",
     )
     parser.add_argument(
         "--clickhouse-username",
         type=str,
-        default=os.getenv("CLICKHOUSE_USERNAME", ""),
+        default=ENVS["CLICKHOUSE_USERNAME"],
         help="the ClickHouse username",
     )
     parser.add_argument(
         "--clickhouse-password",
         type=str,
-        default=os.getenv("CLICKHOUSE_PASSWORD", ""),
+        default=ENVS["CLICKHOUSE_PASSWORD"],
         help="the ClickHouse password for the username",
     )
     parser.add_argument(
         "--github-access-token",
         type=str,
-        default=os.getenv("GITHUB_ACCESS_TOKEN", ""),
+        default=ENVS["GITHUB_ACCESS_TOKEN"],
         help="the GitHub access token used to access the GitHub API",
     )
     parser.add_argument(
@@ -679,19 +689,23 @@ def main() -> None:
     arguments = parse_args()
 
     # update environment variables for input parameters
-    os.environ["CLICKHOUSE_ENDPOINT"] = arguments.clickhouse_endpoint
-    os.environ["CLICKHOUSE_USERNAME"] = arguments.clickhouse_username
-    os.environ["CLICKHOUSE_PASSWORD"] = arguments.clickhouse_password
-    os.environ["GITHUB_ACCESS_TOKEN"] = arguments.github_access_token
-    db_client = get_clickhouse_client_environment()
+    db_client = get_clickhouse_client(
+        host=arguments.clickhouse_endpoint,
+        user=arguments.clickhouse_username,
+        password=arguments.clickhouse_password,
+    )
     s3_client = get_aws_s3_resource()
 
     # always run in dry-run mode in local environment, unless it's disabled. 
is_dry_run = not arguments.not_dry_run
 
     QueueTimeProcessor(
-        db_client, s3_client, is_dry_run=is_dry_run, local_output=arguments.local_output
+        db_client,
+        s3_client,
+        arguments.github_access_token,
+        is_dry_run=is_dry_run,
+        local_output=arguments.local_output,
     ).process()

diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
index 98711d35c0..25121ea7fc 100644
--- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
+++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
@@ -138,11 +138,13 @@ def mock_db_client(
     )
 
-def set_default_env_variables():
-    os.environ["CLICKHOUSE_ENDPOINT"] = "https://clickhouse.test1"
-    os.environ["CLICKHOUSE_USERNAME"] = "user1"
-    os.environ["CLICKHOUSE_PASSWORD"] = "pwd1"
-    os.environ["GITHUB_ACCESS_TOKEN"] = "token1"
+def get_default_environment_variables():
+    return {
+        "CLICKHOUSE_ENDPOINT": "test",
+        "CLICKHOUSE_USERNAME": "test",
+        "CLICKHOUSE_PASSWORD": "test",
+        "GITHUB_ACCESS_TOKEN": "test",
+    }
 
 class Test(unittest.TestCase):
@@ -151,11 +153,16 @@ def setUp(self):
         patcher2 = patch("oss_ci_job_queue_time.lambda_function.get_clickhouse_client")
         patcher3 = patch("oss_ci_job_queue_time.lambda_function.get_runner_config")
         patcher4 = patch("oss_ci_job_queue_time.lambda_function.get_config_retrievers")
+        envs_patcher = patch(
+            "oss_ci_job_queue_time.lambda_function.ENVS",
+            new=get_default_environment_variables(),
+        )
 
         self.mock_s3_resource = patcher1.start()
         self.mock_get_client = patcher2.start()
         self.mock_get_runner_config = patcher3.start()
         self.mock_get_config_retrievers = patcher4.start()
+        self.mock_envs = envs_patcher.start()
 
         self.mock_get_runner_config.return_value = {"runner_types": {}}
         self.mock_get_config_retrievers.return_value = ({}, {}, {})
@@ -164,11 +171,11 @@
         self.addCleanup(patcher2.stop)
         self.addCleanup(patcher3.stop)
         self.addCleanup(patcher4.stop)
+        self.addCleanup(envs_patcher.stop)
 
     def test_lambda_handler_when_row_result_is_empty(self):
         print("test_lambda_handler_when_row_result_is_empty ")
         # prepare
-        set_default_env_variables()
         mock_s3_resource_put(self.mock_s3_resource)
         mock_db_client(self.mock_get_client, [], [])
 
@@ -183,7 +190,6 @@
 
     def test_lambda_handler_when_lambda_happy_flow_then_success(self):
         # prepare
-        set_default_env_variables()
         mock_s3_resource_put(self.mock_s3_resource)
         mock_db_client(self.mock_get_client)
 
@@ -214,15 +220,12 @@ def test_lambda_handler_when_missing_required_env_vars_then_throws_error(self):
             ("CLICKHOUSE_PASSWORD"),
             ("GITHUB_ACCESS_TOKEN"),
         ]
-
         for x in test_cases:
-            with self.subTest(x=x):
+            with self.subTest(f"Test Environment {x}", x=x):
                 # prepare
                 self.mock_get_client.reset_mock(return_value=True)
                 self.mock_s3_resource.reset_mock(return_value=True)
-
-                set_default_env_variables()
-                os.environ[x] = ""
+                self.mock_envs[x] = ""
 
                 # execute
                 with self.assertRaises(ValueError) as context:
@@ -235,11 +238,15 @@
                     self.mock_s3_resource
                 ).return_value.put.assert_not_called()
 
+
+                # reset
+                # manually reset the envs; TODO: find a better way to do this, maybe use parameterized
+                self.mock_envs[x] = get_default_environment_variables()[x]
 
     def test_local_run_with_dry_run_when_lambda_happy_flow_then_success_without_s3_write(
         self,
     ):
         # prepare
-        set_default_env_variables()
         mock_s3_resource_put(self.mock_s3_resource)
         mock_db_client(self.mock_get_client)
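A note on the ENVS refactor in this patch: the dict is populated once, at module import time, which matches Lambda's cold-start model but means later changes to os.environ are not seen by the handler; that is why the tests above patch the ENVS dict itself instead of setting os.environ. A minimal illustration of the difference (variable names here are hypothetical):

    import os

    # captured once, when the module is imported (Lambda cold start)
    ENVS_SNAPSHOT = {"CLICKHOUSE_ENDPOINT": os.getenv("CLICKHOUSE_ENDPOINT", "")}

    def read_live() -> str:
        # read on every call; sees later changes to os.environ
        return os.getenv("CLICKHOUSE_ENDPOINT", "")

    os.environ["CLICKHOUSE_ENDPOINT"] = "changed-after-import"
    print(ENVS_SNAPSHOT["CLICKHOUSE_ENDPOINT"])  # still the import-time value
    print(read_live())                           # "changed-after-import"
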
From dcc68fd365e232565c35e71670893c84f221675a Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 15:59:09 -0700
Subject: [PATCH 37/38] comment

---
 .../oss_ci_job_queue_time/lambda_function.py  | 59 ++++++++++---------
 .../test_lambda_oss_ci_job_queue_time.py      |  7 ++-
 2 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index 9e435e5ebb..e98898af52 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -5,6 +5,7 @@
 import logging
 import os
 import gzip
+import re
 import threading
 
 import yaml
@@ -59,7 +60,7 @@
 @lru_cache()
 def get_clickhouse_client(host: str, user: str, password: str) -> Any:
     # for local testing only, disable SSL verification
-    # return clickhouse_connect.get_client(host=host, user=user, password=password, secure=True, verify=False)
+    #return clickhouse_connect.get_client(host=host, user=user, password=password, secure=True, verify=False)
 
     return clickhouse_connect.get_client(
         host=host, user=user, password=password, secure=True
@@ -353,7 +354,7 @@ def get_runner_config(
     return {"runner_types": {}}
 
-def get_config_retrievers(github_access_token: str) -> Tuple[Any, Any, Any]:
+def get_config_retrievers(github_access_token: str) -> Dict[str, LazyFileHistory]:
     auth = Auth.Token(github_access_token)
     test_infra_repo = Github(auth=auth).get_repo("pytorch/test-infra")
     pytorch_repo = Github(auth=auth).get_repo("pytorch/pytorch")
@@ -368,11 +369,11 @@ def get_config_retrievers(github_access_token: str) -> Tuple[Any, Any, Any]:
         pytorch_repo, ".github/lf-scale-config.yml"
     )
 
-    return (
-        meta_runner_config_retriever,
-        lf_runner_config_retriever,
-        old_lf_lf_runner_config_retriever,
-    )
+    return {
+        "meta": meta_runner_config_retriever,
+        "lf": lf_runner_config_retriever,
+        "old_lf": old_lf_lf_runner_config_retriever,
+    }
 
 class QueueTimeProcessor:
@@ -403,24 +404,21 @@ def __init__(
 
     def process(self) -> None:
         # get runner config retrievers
-        (
-            meta_runner_config_retriever,
-            lf_runner_config_retriever,
-            old_lf_lf_runner_config_retriever,
-        ) = get_config_retrievers(self.github_access_token)
+        retrievers = get_config_retrievers(self.github_access_token)
 
         # use current time as snapshot time
         timestamp = str(int(datetime.now().timestamp()))
 
         snapshot = self.get_queueing_jobs_snapshot(
-            meta_runner_config_retriever,
-            lf_runner_config_retriever,
-            old_lf_lf_runner_config_retriever,
+            retrievers["meta"],
+            retrievers["lf"],
+            retrievers["old_lf"],
             timestamp,
             "pytorch/pytorch",
         )
 
-        self.output_snapshot(snapshot, timestamp)
+
+        if self.is_dry_run:
+            self.output_snapshot(snapshot, timestamp)
         # TODO(elainewy): add logic to generate histograms based on the snapshot results
 
     def output_snapshot(
@@ -431,9 +429,6 @@ def output_snapshot(
         """
         print the snapshot to local file or terminal for local testing only
         """
-        if not self.is_dry_run:
-            return
-
         info(
             f"[Dry Run Mode]: generated {len(snapshot)} records from get_jobs_in_queue_snapshot"
         )
@@ -442,24 +437,24 @@ def output_snapshot(
             info(f"[Dry Run Mode]: local output to {file_name}.json")
             with open(file_name, "w") as f:
                 f.write(json.dumps(snapshot))
+            return
 
         info(json.dumps(snapshot))
-        return
 
     def _fetch_snapshot_from_db(
         self, timestamp: str = "", repo: str = "pytorch/pytorch"
     ) -> List[Dict[str, Any]]:
         # in given snapshot time, fetches jobs that were in queue but not being picked up by workers
-        queued_query, queued_parameters = 
self.get_query_statement_for_queueing_jobs(
-            timestamp, repo
+        queued_query = self.get_query_statement_for_queueing_jobs(timestamp, repo)
+        jobs_in_queue = self._query_in_queue_jobs(
+            queued_query["query"], queued_query["parameters"]
         )
-        jobs_in_queue = self._query_in_queue_jobs(queued_query, queued_parameters)
 
         # in given snapshot time, fetches jobs that were in queue but were picked up by workers after the given snapshot time
         # this happens when the snapshot time is not the latest timestamp
-        picked_query, picked_params = self.get_query_statement_for_picked_up_job(
-            timestamp, repo
+        picked_query = self.get_query_statement_for_picked_up_job(timestamp, repo)
+        jobs_pick = self._query_in_queue_jobs(
+            picked_query["query"], picked_query["parameters"]
         )
-        jobs_pick = self._query_in_queue_jobs(picked_query, picked_params)
 
         datetime_str = datetime.fromtimestamp(int(timestamp)).strftime(
             "%Y-%m-%d %H:%M:%S"
@@ -583,11 +578,14 @@ def get_query_statement_for_picked_up_job(
             "timestamp": time,
             "repo": repo,
         }
-        return query, parameters
+        return {
+            "query": query,
+            "parameters": parameters,
+        }
 
     def get_query_statement_for_queueing_jobs(
         self, time: str, repo: str = "pytorch/pytorch"
-    ):
+    ) -> Dict[str, Any]:
         """
        this query is used to get jobs that were in queue in given snapshot time, and not being picked up by workers
         """
@@ -621,7 +619,10 @@ def get_query_statement_for_queueing_jobs(
             "timestamp": time,
             "repo": repo,
         }
-        return query, parameters
+        return {
+            "query": query,
+            "parameters": parameters,
+        }
 
 def lambda_handler(event: Any, context: Any) -> None:
 
diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
index 25121ea7fc..f55583f6fd 100644
--- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
+++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
@@ -165,7 +165,11 @@ def setUp(self):
 
         self.mock_get_runner_config.return_value = {"runner_types": {}}
-        self.mock_get_config_retrievers.return_value = ({}, {}, {})
+        self.mock_get_config_retrievers.return_value = {
+            "meta": MagicMock(),
+            "lf": MagicMock(),
+            "old_lf": MagicMock(),
+        }
 
         self.addCleanup(patcher1.stop)  # Ensure patchers stop after each test
         self.addCleanup(patcher2.stop)
@@ -238,7 +242,6 @@ def test_lambda_handler_when_missing_required_env_vars_then_throws_error(self):
                     self.mock_s3_resource
                 ).return_value.put.assert_not_called()
 
-
-                # reset
                 # manually reset the envs; TODO: find a better way to do this, maybe use parameterized
                 self.mock_envs[x] = get_default_environment_variables()[x]
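One possible answer to the TODO in the test above: unittest.mock.patch.dict accepts a dotted-string target and restores the mapping automatically when the with-block exits, so the manual per-subtest reset would no longer be needed. A sketch under that assumption, reusing the module path the tests already patch:

    from unittest.mock import patch

    for name in ["CLICKHOUSE_ENDPOINT", "CLICKHOUSE_USERNAME",
                 "CLICKHOUSE_PASSWORD", "GITHUB_ACCESS_TOKEN"]:
        with self.subTest(f"Test Environment {name}"):
            # patch.dict restores ENVS when the with-block exits,
            # so no manual reset between subtests is required
            with patch.dict(
                "oss_ci_job_queue_time.lambda_function.ENVS", {name: ""}
            ):
                with self.assertRaises(ValueError):
                    lambda_handler(None, None)
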
From f32924cf13e44137cf63c238485bfe6178fde84c Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 16:11:41 -0700
Subject: [PATCH 38/38] comment

---
 .../oss_ci_job_queue_time/lambda_function.py | 56 +++++++++++++++++--
 1 file changed, 51 insertions(+), 5 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index e98898af52..a94d1ba537 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -60,7 +60,7 @@
 @lru_cache()
 def get_clickhouse_client(host: str, user: str, password: str) -> Any:
     # for local testing only, disable SSL verification
-    #return clickhouse_connect.get_client(host=host, user=user, password=password, secure=True, verify=False)
+    # return clickhouse_connect.get_client(host=host, user=user, password=password, secure=True, verify=False)
 
     return clickhouse_connect.get_client(
         host=host, user=user, password=password, secure=True
@@ -85,6 +85,32 @@ def get_clickhouse_client_environment() -> Any:
     )
 
+def write_to_file(data: Any, filename="", path=""):
+    """
+    Writes data to a specified file. If no path is provided, writes to the current directory.
+
+    :param data: The content to write to the file.
+    :param filename: The name of the file (default: 'output_snapshot.json').
+    :param path: The directory where the file should be saved (default: current directory).
+    """
+
+    if not filename:
+        filename = "output_snapshot.json"
+    if not path:
+        path = "."
+
+    # Ensure the path exists
+    os.makedirs(path, exist_ok=True)
+
+    # Construct full file path
+    file_path = os.path.join(path, filename)
+
+    # Write data to file
+    with open(file_path, "w", encoding="utf-8") as file:
+        file.write(data)
+    print(f"File written to: {os.path.abspath(file_path)}")
+
 def upload_to_s3_txt(
     s3_client: Any,
     bucket_name: str,
@@ -392,12 +418,17 @@ def __init__(
         github_access_token: str = "",
         is_dry_run: bool = False,
         local_output: bool = False,
+        output_snapshot_file_name: str = "job_queue_times_snapshot",
+        output_snapshot_file_path: str = "",
     ) -> None:
         self.clickhouse_client = clickhouse_client
         self.s3_client = s3_client
         self.is_dry_run = is_dry_run
         self.local_output = local_output and is_dry_run
 
+        self.output_snapshot_file_name = output_snapshot_file_name
+        self.output_snapshot_file_path = output_snapshot_file_path
+
         if not github_access_token:
             raise ValueError("Missing environment variable GITHUB_ACCESS_TOKEN")
         self.github_access_token = github_access_token
@@ -433,10 +464,11 @@ def output_snapshot(
             f"[Dry Run Mode]: generated {len(snapshot)} records from get_jobs_in_queue_snapshot"
         )
         if self.local_output:
-            file_name = f"job_queue_times_snapshot_{timestamp}.json"
-            info(f"[Dry Run Mode]: local output to {file_name}.json")
-            with open(file_name, "w") as f:
-                f.write(json.dumps(snapshot))
+            write_to_file(
+                json.dumps(snapshot),
+                self.output_snapshot_file_name,
+                self.output_snapshot_file_path,
+            )
             return
 
         info(json.dumps(snapshot))
@@ -678,6 +710,18 @@ def parse_args() -> argparse.Namespace:
         action="store_true",
         help="when set, write results to S3 from the local environment. By default, we run in dry-run mode for the local environment",
     )
+    parser.add_argument(
+        "--output-file-name",
+        type=str,
+        default="job_queue_times_snapshot.json",
+        help="the name of the output file for local runs; only used when --local-output is enabled",
+    )
+    parser.add_argument(
+        "--output-file-path",
+        type=str,
+        default="",
+        help="the path of the output file for local runs; only used when --local-output is enabled",
+    )
     args, _ = parser.parse_known_args()
     return args
 
@@ -707,6 +751,8 @@ def main() -> None:
         arguments.github_access_token,
         is_dry_run=is_dry_run,
         local_output=arguments.local_output,
+        output_snapshot_file_name=arguments.output_file_name,
+        output_snapshot_file_path=arguments.output_file_path,
     ).process()
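With the full series applied, a local dry run can be exercised end to end. A usage sketch mirroring what main() does after patch 38; the endpoint, password, token, and output path below are placeholders, not values from the patches:

    # hypothetical local invocation; credentials are placeholders
    db_client = get_clickhouse_client(
        host="your-clickhouse-endpoint",
        user="default",
        password="...",
    )
    s3_client = get_aws_s3_resource()

    QueueTimeProcessor(
        db_client,
        s3_client,
        "your-github-access-token",
        is_dry_run=True,                  # default for local runs; no S3 writes
        local_output=True,                # route the snapshot to a local file
        output_snapshot_file_name="job_queue_times_snapshot.json",
        output_snapshot_file_path="./out",
    ).process()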