From 1da908892be2db1fbb19a25b6b0c84641c7868ad Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 20:57:44 -0700 Subject: [PATCH 01/38] add time --- aws/lambda/oss-ci-job-queue-time/.gitignore | 3 + aws/lambda/oss-ci-job-queue-time/Makefile | 19 ++ .../oss_ci_job_queue_time.py | 237 ++++++++++++++++++ .../oss-ci-job-queue-time/requirements.txt | 2 + .../test_oss_ci_job_queue_time.py | 231 +++++++++++++++++ 5 files changed, 492 insertions(+) create mode 100644 aws/lambda/oss-ci-job-queue-time/.gitignore create mode 100644 aws/lambda/oss-ci-job-queue-time/Makefile create mode 100644 aws/lambda/oss-ci-job-queue-time/oss_ci_job_queue_time.py create mode 100644 aws/lambda/oss-ci-job-queue-time/requirements.txt create mode 100644 aws/lambda/oss-ci-job-queue-time/test_oss_ci_job_queue_time.py diff --git a/aws/lambda/oss-ci-job-queue-time/.gitignore b/aws/lambda/oss-ci-job-queue-time/.gitignore new file mode 100644 index 0000000000..bd92f6376a --- /dev/null +++ b/aws/lambda/oss-ci-job-queue-time/.gitignore @@ -0,0 +1,3 @@ +*.zip +deployment/ +venv/ diff --git a/aws/lambda/oss-ci-job-queue-time/Makefile b/aws/lambda/oss-ci-job-queue-time/Makefile new file mode 100644 index 0000000000..ce75d870cc --- /dev/null +++ b/aws/lambda/oss-ci-job-queue-time/Makefile @@ -0,0 +1,19 @@ +all: run-local + +clean: + rm -rf deployment + rm -rf venv + rm -rf deployment.zip + +venv/bin/python: + virtualenv venv + venv/bin/pip install -r requirements.txt + +deployment.zip: + mkdir -p deployment + cp oss_ci_job_queue_time.py ./deployment/. + pip3.10 install -r requirements.txt -t ./deployment/. --platform manylinux2014_x86_64 --only-binary=:all: --implementation cp --python-version 3.10 --upgrade + cd ./deployment && zip -q -r ../deployment.zip . + +.PHONY: create-deployment-package +create-deployment-package: deployment.zip diff --git a/aws/lambda/oss-ci-job-queue-time/oss_ci_job_queue_time.py b/aws/lambda/oss-ci-job-queue-time/oss_ci_job_queue_time.py new file mode 100644 index 0000000000..3aed64ed5c --- /dev/null +++ b/aws/lambda/oss-ci-job-queue-time/oss_ci_job_queue_time.py @@ -0,0 +1,237 @@ +from functools import lru_cache +import json +from typing import Any +import clickhouse_connect +import os +import boto3 +import argparse +from logging import info +import logging +import io +import gzip + +logging.basicConfig(level=logging.INFO) + +CLICKHOUSE_ENDPOINT = os.getenv("CLICKHOUSE_ENDPOINT", "") +CLICKHOUSE_USERNAME = os.getenv("CLICKHOUSE_USERNAME", "default") +CLICKHOUSE_PASSWORD = os.getenv("CLICKHOUSE_PASSWORD", "") + + +@lru_cache() +def get_clickhouse_client(host: str, user: str, password: str) -> Any: + return clickhouse_connect.get_client( + host=host, user=user, password=password, secure=True + ) + + +@lru_cache() +def get_aws_s3_resource() -> Any: + return boto3.resource("s3") + + +def get_clickhouse_client_handler() -> Any: + for env in ["CLICKHOUSE_ENDPOINT", "CLICKHOUSE_USERNAME", "CLICKHOUSE_PASSWORD"]: + if not os.getenv(env): + raise ValueError(f"Missing environment variable {env}") + + return get_clickhouse_client( + host=CLICKHOUSE_ENDPOINT, user=CLICKHOUSE_USERNAME, password=CLICKHOUSE_PASSWORD + ) + + +def upload_to_s3_txt( + s3_client: Any, + bucket_name: str, + key: str, + records: list[dict[str, Any]], +) -> None: + info(f"Writing {len(records)} documents to S3 {bucket_name}/{key}") + body = io.StringIO() + for record in records: + json.dump(record, body) + body.write("\n") + + s3_client.Object( + f"{bucket_name}", + f"{key}", + ).put( + 
Body=gzip.compress(body.getvalue().encode()), + ContentEncoding="gzip", + ContentType="text/plain", + ) + info(f"Done! Finish writing document to S3 {bucket_name}/{key} ") + + +def query_in_queue_jobs_now() -> str: + query = """ + WITH possible_queued_jobs AS ( + SELECT + id, + run_id + FROM default.workflow_job -- FINAL not needed since we just use this to filter a table that has already been FINALed + WHERE + status = 'queued' + AND created_at < (CURRENT_TIMESTAMP() - INTERVAL 5 MINUTE) + AND created_at > (CURRENT_TIMESTAMP() - INTERVAL 1 WEEK) + ) + SELECT + DATE_DIFF( + 'second', + job.created_at, + CURRENT_TIMESTAMP() + ) AS queue_s, + workflow.repository.'full_name' AS repo, + workflow.name AS workflow_name, + job.name AS job_name, + job.html_url, + IF( + LENGTH(job.labels) = 0, + 'N/A', + IF( + LENGTH(job.labels) > 1, + job.labels[2], + job.labels[1] + ) + ) AS machine_type, + toUnixTimestamp(CURRENT_TIMESTAMP()) AS time + FROM + default.workflow_job job FINAL + JOIN default.workflow_run workflow FINAL ON workflow.id = job.run_id + WHERE + job.id IN (SELECT id FROM possible_queued_jobs) + AND workflow.id IN (SELECT run_id FROM possible_queued_jobs) + AND workflow.repository.'full_name' = 'pytorch/pytorch' + AND job.status = 'queued' + AND LENGTH(job.steps) = 0 + AND workflow.status != 'completed' + ORDER BY + queue_s DESC """ + return query + + +class QueueTimeProcessor: + """ + this class used to handle oss ci queue time data aggregations. Currently it fetches in-queue jobs from clickhouse at current time + + To run the main method: + processor = QueueTimeProcessor(clickhouse_client,s3_client) + processor.process() + """ + + def __init__( + self, clickhouse_client: Any, s3_client: Any, is_dry_run: bool = False + ) -> None: + self.clickhouse_client = clickhouse_client + self.s3_client = s3_client + self.is_dry_run = is_dry_run + + def process(self) -> None: + self.proceses_job_queue_times_historical() + + def proceses_job_queue_times_historical(self) -> None: + jobs_in_queue = self.get_jobs_in_queue_now() + + if len(jobs_in_queue) == 0: + info("No jobs in queue now, skipping writing to s3") + return + + info(f"Found {len(jobs_in_queue)} jobs in queue now") + info(f"Peeking data: {jobs_in_queue[0]}") + + bucket_name = "ossci-raw-job-status" + repo = jobs_in_queue[0]["repo"] + time = jobs_in_queue[0]["time"] + + key = f"job_queue_times_historical/{repo}/{time}.txt" + + if self.is_dry_run: + info( + f"[Dry Run Mode]: {len(jobs_in_queue)} records to S3 {bucket_name}/{key}" + ) + info(json.dumps(jobs_in_queue, indent=4)) + return + + upload_to_s3_txt(self.s3_client, bucket_name, key, jobs_in_queue) + + def get_jobs_in_queue_now(self) -> list[dict[str, Any]]: + reader = self.clickhouse_client.query(query_in_queue_jobs_now()) + # clickhouse returns a generator to return column names and rows + # see https://clickhouse.com/docs/integrations/python#the-queryresult-object + column_names = reader.column_names + rows = reader.result_rows + res = self._to_query_result_dict(rows, column_names) + return res + + def _to_query_result_dict( + self, rows: list[Any], column_names: list[str] + ) -> list[dict[str, Any]]: + li = [] + for row in rows: + record = {} + for idx, name in enumerate(column_names): + record[name] = row[idx] + li.append(record) + return li + + +def lambda_handler(event: Any, context: Any) -> None: + """ + Main method to run in aws lambda environment + """ + db_client = get_clickhouse_client_handler() + s3_client = get_aws_s3_resource() + + QueueTimeProcessor(db_client, 
s3_client).process() + + return + + +def parse_args() -> argparse.Namespace: + """ + Parse command line arguments, this is mainly used for local test environment. + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--clickhouse_endpoint", + type=str, + required=True, + help="the clickhouse endpoint, the clickhouse_endpoint name is https://{clickhouse_endpoint}:{port} for full url ", + ) + parser.add_argument( + "--clickhouse_username", type=str, required=True, help="the clickhouse username" + ) + parser.add_argument( + "--clickhouse_password", + type=str, + required=True, + help="the clickhouse password for the user name", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="when set true, only print instead of writing results to s3", + ) + return parser.parse_known_args()[0] + + +def main(*args, **kwargs) -> None: + """ + Main method to run in local test environment + """ + args = parse_args() + + db_client = get_clickhouse_client( + host=args.clickhouse_endpoint, + user=args.clickhouse_username, + password=args.clickhouse_password, + ) + s3_client = get_aws_s3_resource() + + # process the queue time events + QueueTimeProcessor(db_client, s3_client, is_dry_run=True).process() + + return + + +if __name__ == "__main__": + main() diff --git a/aws/lambda/oss-ci-job-queue-time/requirements.txt b/aws/lambda/oss-ci-job-queue-time/requirements.txt new file mode 100644 index 0000000000..7a4ec20f9f --- /dev/null +++ b/aws/lambda/oss-ci-job-queue-time/requirements.txt @@ -0,0 +1,2 @@ +clickhouse_connect==0.8.5 +pytest==7.4.0 diff --git a/aws/lambda/oss-ci-job-queue-time/test_oss_ci_job_queue_time.py b/aws/lambda/oss-ci-job-queue-time/test_oss_ci_job_queue_time.py new file mode 100644 index 0000000000..c50fae5a90 --- /dev/null +++ b/aws/lambda/oss-ci-job-queue-time/test_oss_ci_job_queue_time.py @@ -0,0 +1,231 @@ +import unittest +import os +import json +from re import M, T +from typing import Any, Dict, List, Tuple +from unittest import mock +from unittest.mock import MagicMock, patch +from oss_ci_job_queue_time import ( + lambda_handler, + get_aws_s3_resource, + get_clickhouse_client, +) +import gzip + + +def get_default_result_rows(test_sample: str = "0"): + """ + generate result rows for testing, this corrresponds to the following columns: + 'queue_s', 'repo', 'workflow_name', 'job_name', 'html_url', 'machine_type', 'time' + """ + match test_sample: + case "0": + return [ + ( + 60000, + "pytorch/pytorch", + "workflow-name-1", + "job-name-1", + "runs/1/job/1", + "linux.aws.h100", + 1742262372, + ), + ( + 1400, + "pytorch/pytorch", + "workflow-name-2", + "job-name-2", + "runs/2/job/2", + "linux.rocm.gpu.2", + 1742262372, + ), + ] + case "1": + return [ + ( + 60000, + "pytorch/pytorch", + "inductor-h100", + "test1 (h100, 5, 5, linux.aws.h100)", + "runs/1/job/1", + "linux.aws.h100", + 1742262372, + ), + ( + 50000, + "pytorch/pytorch", + "inductor-h100", + "test1 (h100, 5, 5, linux.aws.h100)", + "runs/1/job/2", + "linux.aws.h100", + 1742262372, + ), + ( + 55000, + "pytorch/pytorch", + "inductor-h100", + "test1 (h100, 2, 6, linux.aws.h100)", + "runs/1/job/3", + "linux.aws.h100", + 1742262372, + ), + ( + 1729, + "pytorch/pytorch", + "inductor-h100", + "test2 (h100, 1, 1, linux.aws.h100)", + "runs/2/job/1", + "linux.aws.h100", + 1742262372, + ), + ( + 1352, + "pytorch/pytorch", + "inductor-rocm", + "rocm-test1(1, 1, linux.rocm.gpu.2)", + "runs/3/job/1", + "linux.rocm.gpu.2", + 1742262372, + ), + ( + 1400, + "pytorch/pytorch", + "inductor-rocm", + "rocm-test1 (1, 1, 
linux.rocm.gpu.2)", + "runs/4/job/2", + "linux.rocm.gpu.2", + 1742262372, + ), + ] + case _: + return [] + + +def get_default_result_columns() -> Tuple: + return ( + "queue_s", + "repo", + "workflow_name", + "job_name", + "html_url", + "machine_type", + "time", + ) + + +def mock_s3_resource_put(mock_s3_resource: Any) -> None: + mock_s3 = mock_s3_resource.return_value + mock_object = mock_s3.Object.return_value + mock_object.put.return_value = {"ResponseMetadata": {"HTTPStatusCode": 200}} + + +def get_mock_s3_resource_object(mock_s3_resource: Any): + return mock_s3_resource.return_value.Object + + +def mock_db_client( + mock: Any, + result_rows: List[Tuple] = get_default_result_rows(), + result_columns: Tuple = get_default_result_columns(), +) -> None: + mock_client = mock.return_value + mock_client.query.return_value.result_rows = result_rows + mock_client.query.return_value.column_names = result_columns + + +def setEnvironmentVariables(): + os.environ["CLICKHOUSE_ENDPOINT"] = "https://clickhouse.test1" + os.environ["CLICKHOUSE_USERNAME"] = "user1" + os.environ["CLICKHOUSE_PASSWORD"] = "pwd1" + + +class Test(unittest.TestCase): + @patch("oss_ci_job_queue_time.get_aws_s3_resource") + @patch("oss_ci_job_queue_time.get_clickhouse_client") + def test_lambda_handler_when_row_result_is_empty( + self, mock_get_client, mock_s3_resource + ): + print("test_lambda_handler_when_row_result_is_empty ") + # prepare + setEnvironmentVariables() + mock_s3_resource_put(mock_s3_resource) + mock_db_client(mock_get_client, result_rows=[]) + + # execute + lambda_handler(None, None) + + # assert + mock_get_client.assert_called_once() + get_mock_s3_resource_object( + mock_s3_resource + ).return_value.put.assert_not_called() + + @patch("oss_ci_job_queue_time.get_aws_s3_resource") + @patch("oss_ci_job_queue_time.get_clickhouse_client") + def test_lambda_handler_when_lambda_happy_flow_then_success( + self, mock_get_client, mock_s3_resource + ): + # prepare + setEnvironmentVariables() + mock_s3_resource_put(mock_s3_resource) + mock_db_client(mock_get_client) + + expected_r1 = b'{"queue_s": 60000, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-1", "job_name": "job-name-1", "html_url": "runs/1/job/1", "machine_type": "linux.aws.h100", "time": 1742262372}\n' + expected_r2 = b'{"queue_s": 1400, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-2", "job_name": "job-name-2", "html_url": "runs/2/job/2", "machine_type": "linux.rocm.gpu.2", "time": 1742262372}\n' + expected_s3_body = expected_r1 + expected_r2 + expect = gzip.compress(expected_s3_body) + + # execute + lambda_handler(None, None) + + # assert + + # assert clickhouse client + mock_get_client.assert_called_once() + mock_get_client.return_value.query.assert_called_once() + + # assert s3 resource + mock_s3_resource.assert_called_once() + get_mock_s3_resource_object( + mock_s3_resource + ).return_value.put.assert_called_once() + get_mock_s3_resource_object( + mock_s3_resource + ).return_value.put.assert_called_once_with( + Body=expect, ContentEncoding="gzip", ContentType="text/plain" + ) + + @patch("boto3.resource") + @patch("clickhouse_connect.get_client") + def test_lambda_handler_when_missing_required_env_vars_then_throws_error( + self, mock_get_client, mock_s3_resource + ): + test_cases = [ + ("CLICKHOUSE_ENDPOINT"), + ("CLICKHOUSE_USERNAME"), + ("CLICKHOUSE_PASSWORD"), + ] + + for x in test_cases: + with self.subTest(x=x): + # prepare + mock_get_client.reset_mock(return_value=True) + mock_s3_resource.reset_mock(return_value=True) + + 
setEnvironmentVariables() + os.environ[x] = "" + + # execute + with self.assertRaises(ValueError) as context: + _ = lambda_handler(None, None) + + # assert + self.assertTrue(x in str(context.exception)) + mock_get_client.return_value.query.assert_not_called() + get_mock_s3_resource_object( + mock_s3_resource + ).return_value.put.assert_not_called() + + +if __name__ == "__main__": + unittest.main() From a774afb57ac213567b90167d0fe057b116ba9be1 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 21:10:10 -0700 Subject: [PATCH 02/38] rename function for consistency --- aws/lambda/oss-ci-job-queue-time/Makefile | 2 +- .../{oss_ci_job_queue_time.py => lambda_function.py} | 0 aws/lambda/oss-ci-job-queue-time/requirements.txt | 1 + .../{test_oss_ci_job_queue_time.py => test_lambda_function.py} | 2 +- 4 files changed, 3 insertions(+), 2 deletions(-) rename aws/lambda/oss-ci-job-queue-time/{oss_ci_job_queue_time.py => lambda_function.py} (100%) rename aws/lambda/oss-ci-job-queue-time/{test_oss_ci_job_queue_time.py => test_lambda_function.py} (99%) diff --git a/aws/lambda/oss-ci-job-queue-time/Makefile b/aws/lambda/oss-ci-job-queue-time/Makefile index ce75d870cc..478548770a 100644 --- a/aws/lambda/oss-ci-job-queue-time/Makefile +++ b/aws/lambda/oss-ci-job-queue-time/Makefile @@ -11,7 +11,7 @@ venv/bin/python: deployment.zip: mkdir -p deployment - cp oss_ci_job_queue_time.py ./deployment/. + cp lambda_function.py ./deployment/. pip3.10 install -r requirements.txt -t ./deployment/. --platform manylinux2014_x86_64 --only-binary=:all: --implementation cp --python-version 3.10 --upgrade cd ./deployment && zip -q -r ../deployment.zip . diff --git a/aws/lambda/oss-ci-job-queue-time/oss_ci_job_queue_time.py b/aws/lambda/oss-ci-job-queue-time/lambda_function.py similarity index 100% rename from aws/lambda/oss-ci-job-queue-time/oss_ci_job_queue_time.py rename to aws/lambda/oss-ci-job-queue-time/lambda_function.py diff --git a/aws/lambda/oss-ci-job-queue-time/requirements.txt b/aws/lambda/oss-ci-job-queue-time/requirements.txt index 7a4ec20f9f..800c7ac1cb 100644 --- a/aws/lambda/oss-ci-job-queue-time/requirements.txt +++ b/aws/lambda/oss-ci-job-queue-time/requirements.txt @@ -1,2 +1,3 @@ clickhouse_connect==0.8.5 +boto3==1.35.33 pytest==7.4.0 diff --git a/aws/lambda/oss-ci-job-queue-time/test_oss_ci_job_queue_time.py b/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py similarity index 99% rename from aws/lambda/oss-ci-job-queue-time/test_oss_ci_job_queue_time.py rename to aws/lambda/oss-ci-job-queue-time/test_lambda_function.py index c50fae5a90..926348b425 100644 --- a/aws/lambda/oss-ci-job-queue-time/test_oss_ci_job_queue_time.py +++ b/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py @@ -5,7 +5,7 @@ from typing import Any, Dict, List, Tuple from unittest import mock from unittest.mock import MagicMock, patch -from oss_ci_job_queue_time import ( +from lambda_function import ( lambda_handler, get_aws_s3_resource, get_clickhouse_client, From 32f77b8b6fda234e42d39836a61860a66092e822 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 21:11:12 -0700 Subject: [PATCH 03/38] replace mock patch --- aws/lambda/oss-ci-job-queue-time/test_lambda_function.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py b/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py index 926348b425..2c3f7dfdae 100644 --- a/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py +++ 
b/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py @@ -140,8 +140,8 @@ def setEnvironmentVariables(): class Test(unittest.TestCase): - @patch("oss_ci_job_queue_time.get_aws_s3_resource") - @patch("oss_ci_job_queue_time.get_clickhouse_client") + @patch("lambda_function.get_aws_s3_resource") + @patch("lambda_function.get_clickhouse_client") def test_lambda_handler_when_row_result_is_empty( self, mock_get_client, mock_s3_resource ): @@ -160,8 +160,8 @@ def test_lambda_handler_when_row_result_is_empty( mock_s3_resource ).return_value.put.assert_not_called() - @patch("oss_ci_job_queue_time.get_aws_s3_resource") - @patch("oss_ci_job_queue_time.get_clickhouse_client") + @patch("lambda_function.get_aws_s3_resource") + @patch("lambda_function.get_clickhouse_client") def test_lambda_handler_when_lambda_happy_flow_then_success( self, mock_get_client, mock_s3_resource ): From 8fcc0e792e741e48a36a7f72d83400596e93a35f Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 21:25:24 -0700 Subject: [PATCH 04/38] replace mock patch --- .lintrunner.toml | 1 + .../oss-ci-job-queue-time/lambda_function.py | 15 +- .../test_lambda_function.py | 172 ++++++++---------- 3 files changed, 89 insertions(+), 99 deletions(-) diff --git a/.lintrunner.toml b/.lintrunner.toml index 25b15bcf9d..71a7e7722c 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -43,6 +43,7 @@ include_patterns = [ 'torchci/**/*.py', 'torchci/**/*.pyi', '.github/scripts/*.py', + 'aws/lambda/oss-ci-job-queue-time/*.py', 'aws/lambda/whl_metadata_upload_pep658/**/*.py', ] command = [ diff --git a/aws/lambda/oss-ci-job-queue-time/lambda_function.py b/aws/lambda/oss-ci-job-queue-time/lambda_function.py index 3aed64ed5c..743419954e 100644 --- a/aws/lambda/oss-ci-job-queue-time/lambda_function.py +++ b/aws/lambda/oss-ci-job-queue-time/lambda_function.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python from functools import lru_cache import json from typing import Any @@ -211,19 +212,21 @@ def parse_args() -> argparse.Namespace: action="store_true", help="when set true, only print instead of writing results to s3", ) - return parser.parse_known_args()[0] + args,_ = parser.parse_known_args() + return args -def main(*args, **kwargs) -> None: +def main() -> None: """ Main method to run in local test environment """ - args = parse_args() + + arguments = parse_args() db_client = get_clickhouse_client( - host=args.clickhouse_endpoint, - user=args.clickhouse_username, - password=args.clickhouse_password, + host=arguments.clickhouse_endpoint, + user=arguments.clickhouse_username, + password=arguments.clickhouse_password, ) s3_client = get_aws_s3_resource() diff --git a/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py b/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py index 2c3f7dfdae..f819127307 100644 --- a/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py +++ b/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py @@ -18,99 +18,85 @@ def get_default_result_rows(test_sample: str = "0"): generate result rows for testing, this corrresponds to the following columns: 'queue_s', 'repo', 'workflow_name', 'job_name', 'html_url', 'machine_type', 'time' """ - match test_sample: - case "0": - return [ - ( - 60000, - "pytorch/pytorch", - "workflow-name-1", - "job-name-1", - "runs/1/job/1", - "linux.aws.h100", - 1742262372, - ), - ( - 1400, - "pytorch/pytorch", - "workflow-name-2", - "job-name-2", - "runs/2/job/2", - "linux.rocm.gpu.2", - 1742262372, - ), - ] - case "1": - return [ - ( - 60000, - "pytorch/pytorch", - "inductor-h100", - 
"test1 (h100, 5, 5, linux.aws.h100)", - "runs/1/job/1", - "linux.aws.h100", - 1742262372, - ), - ( - 50000, - "pytorch/pytorch", - "inductor-h100", - "test1 (h100, 5, 5, linux.aws.h100)", - "runs/1/job/2", - "linux.aws.h100", - 1742262372, - ), - ( - 55000, - "pytorch/pytorch", - "inductor-h100", - "test1 (h100, 2, 6, linux.aws.h100)", - "runs/1/job/3", - "linux.aws.h100", - 1742262372, - ), - ( - 1729, - "pytorch/pytorch", - "inductor-h100", - "test2 (h100, 1, 1, linux.aws.h100)", - "runs/2/job/1", - "linux.aws.h100", - 1742262372, - ), - ( - 1352, - "pytorch/pytorch", - "inductor-rocm", - "rocm-test1(1, 1, linux.rocm.gpu.2)", - "runs/3/job/1", - "linux.rocm.gpu.2", - 1742262372, - ), - ( - 1400, - "pytorch/pytorch", - "inductor-rocm", - "rocm-test1 (1, 1, linux.rocm.gpu.2)", - "runs/4/job/2", - "linux.rocm.gpu.2", - 1742262372, - ), - ] - case _: - return [] - - -def get_default_result_columns() -> Tuple: - return ( - "queue_s", - "repo", - "workflow_name", - "job_name", - "html_url", - "machine_type", - "time", - ) + if (test_sample == "0"): + return [ + ( + 60000, + "pytorch/pytorch", + "workflow-name-1", + "job-name-1", + "runs/1/job/1", + "linux.aws.h100", + 1742262372, + ), + ( + 1400, + "pytorch/pytorch", + "workflow-name-2", + "job-name-2", + "runs/2/job/2" + )] + + return [ + ( + 60000, + "pytorch/pytorch", + "inductor-h100", + "test1 (h100, 5, 5, linux.aws.h100)", + "runs/1/job/1", + "linux.aws.h100", + 1742262372, + ), + ( + 50000, + "pytorch/pytorch", + "inductor-h100", + "test1 (h100, 5, 5, linux.aws.h100)", + "runs/1/job/2", + "linux.aws.h100", + 1742262372, + ), + ( + 55000, + "pytorch/pytorch", + "inductor-h100", + "test1 (h100, 2, 6, linux.aws.h100)", + "runs/1/job/3", + "linux.aws.h100", + 1742262372, + ), + ( + 1729, + "pytorch/pytorch", + "inductor-h100", + "test2 (h100, 1, 1, linux.aws.h100)", + "runs/2/job/1", + "linux.aws.h100", + 1742262372, + ), + ( + 1352, + "pytorch/pytorch", + "inductor-rocm", + "rocm-test1(1, 1, linux.rocm.gpu.2)", + "runs/3/job/1", + "linux.rocm.gpu.2", + 1742262372, + ), + ( + 1400, + "pytorch/pytorch", + "inductor-rocm", + "rocm-test1 (1, 1, linux.rocm.gpu.2)", + "runs/4/job/2", + "linux.rocm.gpu.2", + 1742262372, + ), + ] + + +def get_default_result_columns(): + return "queue_s", "repo", "workflow_name", "job_name", "html_url", "machine_type","time" def mock_s3_resource_put(mock_s3_resource: Any) -> None: From 871a64678bea05c2891a5e6075fa741c2774fe33 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 21:28:19 -0700 Subject: [PATCH 05/38] replace mock patch --- aws/lambda/oss-ci-job-queue-time/lambda_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aws/lambda/oss-ci-job-queue-time/lambda_function.py b/aws/lambda/oss-ci-job-queue-time/lambda_function.py index 743419954e..f0fc532bac 100644 --- a/aws/lambda/oss-ci-job-queue-time/lambda_function.py +++ b/aws/lambda/oss-ci-job-queue-time/lambda_function.py @@ -4,7 +4,7 @@ from typing import Any import clickhouse_connect import os -import boto3 +import boto3 # type: ignore[import] import argparse from logging import info import logging From c2267bce66d134bfd2e5a9c99e31ab67d0856500 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 21:31:03 -0700 Subject: [PATCH 06/38] replace mock patch --- .../oss-ci-job-queue-time/lambda_function.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/aws/lambda/oss-ci-job-queue-time/lambda_function.py b/aws/lambda/oss-ci-job-queue-time/lambda_function.py index 
f0fc532bac..35814f2012 100644 --- a/aws/lambda/oss-ci-job-queue-time/lambda_function.py +++ b/aws/lambda/oss-ci-job-queue-time/lambda_function.py @@ -1,16 +1,19 @@ #!/usr/bin/env python -from functools import lru_cache -import json -from typing import Any -import clickhouse_connect -import os -import boto3 # type: ignore[import] import argparse -from logging import info -import logging import io +import json +import logging +import os import gzip +import boto3 # type: ignore[import] +import clickhouse_connect + +# Local imports +from functools import lru_cache +from logging import info +from typing import Any + logging.basicConfig(level=logging.INFO) CLICKHOUSE_ENDPOINT = os.getenv("CLICKHOUSE_ENDPOINT", "") From 35289aa85fa491c4dadbdda0a1843cc11ac57aa3 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 21:33:49 -0700 Subject: [PATCH 07/38] fix test --- .../test_lambda_function.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py b/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py index f819127307..f8dc4e6f83 100644 --- a/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py +++ b/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py @@ -1,16 +1,14 @@ import unittest import os -import json -from re import M, T -from typing import Any, Dict, List, Tuple -from unittest import mock -from unittest.mock import MagicMock, patch +import gzip + +from typing import Any, List, Tuple +from unittest.mock import patch from lambda_function import ( lambda_handler, get_aws_s3_resource, get_clickhouse_client, ) -import gzip def get_default_result_rows(test_sample: str = "0"): @@ -34,7 +32,9 @@ def get_default_result_rows(test_sample: str = "0"): "pytorch/pytorch", "workflow-name-2", "job-name-2", - "runs/2/job/2" + "runs/2/job/2", + "linux.rocm.gpu.2", + 1742262372, )] return [ @@ -95,8 +95,8 @@ def get_default_result_rows(test_sample: str = "0"): ] -def get_default_result_columns(): - return "queue_s", "repo", "workflow_name", "job_name", "html_url", "machine_type","time" +def get_default_result_columns() -> Tuple: + return ("queue_s", "repo", "workflow_name", "job_name", "html_url", "machine_type","time") def mock_s3_resource_put(mock_s3_resource: Any) -> None: From 52dcbced3d9945bb89b5796e8ebde34fdc1c7f7d Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 22:52:17 -0700 Subject: [PATCH 08/38] fix test --- .github/workflows/tests.yml | 20 +++++++++++++++++++ .../.gitignore | 0 .../Makefile | 0 .../lambda_function.py | 1 + .../requirements.txt | 0 aws/lambda/tests/__init__.py | 2 ++ .../test_lambda_oss_ci_job_queue_time.py} | 10 +++++----- 7 files changed, 28 insertions(+), 5 deletions(-) rename aws/lambda/{oss-ci-job-queue-time => oss_ci_job_queue_time}/.gitignore (100%) rename aws/lambda/{oss-ci-job-queue-time => oss_ci_job_queue_time}/Makefile (100%) rename aws/lambda/{oss-ci-job-queue-time => oss_ci_job_queue_time}/lambda_function.py (99%) rename aws/lambda/{oss-ci-job-queue-time => oss_ci_job_queue_time}/requirements.txt (100%) create mode 100644 aws/lambda/tests/__init__.py rename aws/lambda/{oss-ci-job-queue-time/test_lambda_function.py => tests/test_lambda_oss_ci_job_queue_time.py} (94%) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e2d8059c97..d9147b47a2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -39,6 +39,26 @@ jobs: # Test tools python3 -m unittest discover -vs tools/tests -p 'test_*.py' + 
test-aws-lambda: + name: Test github scripts + if: ${{ github.repository == 'pytorch/test-infra' }} + uses: ./.github/workflows/linux_job_v2.yml + with: + docker-image: python:3.10-slim-bullseye + runner: linux.large + script: | + # Environment setup + echo ::group::setup Python environment + python -m venv .venv/ + source .venv/bin/activate + pip install pip==23.0.1 pytest==7.2.0 \ + jsonschema==4.17.3 numpy==1.24.1 pandas==2.1.4 boto3==1.19.12 \ + clickhouse-connect==0.8.14 + echo ::endgroup:: + + # Test aws lambda + pytest -v aws/lambda/tests + test-github-scripts: name: Test github scripts if: ${{ github.repository == 'pytorch/test-infra' }} diff --git a/aws/lambda/oss-ci-job-queue-time/.gitignore b/aws/lambda/oss_ci_job_queue_time/.gitignore similarity index 100% rename from aws/lambda/oss-ci-job-queue-time/.gitignore rename to aws/lambda/oss_ci_job_queue_time/.gitignore diff --git a/aws/lambda/oss-ci-job-queue-time/Makefile b/aws/lambda/oss_ci_job_queue_time/Makefile similarity index 100% rename from aws/lambda/oss-ci-job-queue-time/Makefile rename to aws/lambda/oss_ci_job_queue_time/Makefile diff --git a/aws/lambda/oss-ci-job-queue-time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py similarity index 99% rename from aws/lambda/oss-ci-job-queue-time/lambda_function.py rename to aws/lambda/oss_ci_job_queue_time/lambda_function.py index 35814f2012..3c3150e15d 100644 --- a/aws/lambda/oss-ci-job-queue-time/lambda_function.py +++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py @@ -5,6 +5,7 @@ import logging import os import gzip +import sys import boto3 # type: ignore[import] import clickhouse_connect diff --git a/aws/lambda/oss-ci-job-queue-time/requirements.txt b/aws/lambda/oss_ci_job_queue_time/requirements.txt similarity index 100% rename from aws/lambda/oss-ci-job-queue-time/requirements.txt rename to aws/lambda/oss_ci_job_queue_time/requirements.txt diff --git a/aws/lambda/tests/__init__.py b/aws/lambda/tests/__init__.py new file mode 100644 index 0000000000..10abe4c5f4 --- /dev/null +++ b/aws/lambda/tests/__init__.py @@ -0,0 +1,2 @@ +# tests/__init__.py +# This file can be left empty or contain initialization code for the test suite. 
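# Illustrative note, not part of the patch: under pytest's default import mode,
# `aws/lambda/tests` is treated as a package because of this `__init__.py`; pytest
# walks up to the first directory without one (`aws/lambda`) and prepends that
# directory to sys.path, which is what makes `oss_ci_job_queue_time.lambda_function`
# importable by the tests. A minimal sketch of the same resolution outside pytest,
# assuming the repository root as the working directory:
import sys

sys.path.insert(0, "aws/lambda")  # stands in for what pytest's rootdir handling does
from oss_ci_job_queue_time.lambda_function import lambda_handler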
diff --git a/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py similarity index 94% rename from aws/lambda/oss-ci-job-queue-time/test_lambda_function.py rename to aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py index f8dc4e6f83..c3f394c162 100644 --- a/aws/lambda/oss-ci-job-queue-time/test_lambda_function.py +++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py @@ -4,7 +4,7 @@ from typing import Any, List, Tuple from unittest.mock import patch -from lambda_function import ( +from oss_ci_job_queue_time.lambda_function import ( lambda_handler, get_aws_s3_resource, get_clickhouse_client, @@ -126,8 +126,8 @@ def setEnvironmentVariables(): class Test(unittest.TestCase): - @patch("lambda_function.get_aws_s3_resource") - @patch("lambda_function.get_clickhouse_client") + @patch("oss_ci_job_queue_time.lambda_function.get_aws_s3_resource") + @patch("oss_ci_job_queue_time.lambda_function.get_clickhouse_client") def test_lambda_handler_when_row_result_is_empty( self, mock_get_client, mock_s3_resource ): @@ -146,8 +146,8 @@ def test_lambda_handler_when_row_result_is_empty( mock_s3_resource ).return_value.put.assert_not_called() - @patch("lambda_function.get_aws_s3_resource") - @patch("lambda_function.get_clickhouse_client") + @patch("oss_ci_job_queue_time.lambda_function.get_aws_s3_resource") + @patch("oss_ci_job_queue_time.lambda_function.get_clickhouse_client") def test_lambda_handler_when_lambda_happy_flow_then_success( self, mock_get_client, mock_s3_resource ): From b6022f379b8463268e43f9a55925b72caaaf53aa Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 22:53:10 -0700 Subject: [PATCH 09/38] fix test --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d9147b47a2..6676a822d3 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -40,7 +40,7 @@ jobs: python3 -m unittest discover -vs tools/tests -p 'test_*.py' test-aws-lambda: - name: Test github scripts + name: Test aws lambda if: ${{ github.repository == 'pytorch/test-infra' }} uses: ./.github/workflows/linux_job_v2.yml with: From b1064c71cdddc22c26197b5ee1047ce87e84251c Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 22:55:14 -0700 Subject: [PATCH 10/38] fix test --- .github/workflows/tests.yml | 5 ++--- aws/lambda/oss_ci_job_queue_time/requirements.txt | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6676a822d3..ca143a76ee 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -51,9 +51,8 @@ jobs: echo ::group::setup Python environment python -m venv .venv/ source .venv/bin/activate - pip install pip==23.0.1 pytest==7.2.0 \ - jsonschema==4.17.3 numpy==1.24.1 pandas==2.1.4 boto3==1.19.12 \ - clickhouse-connect==0.8.14 + pip install pip==23.0.1 pytest==7.2.0 boto3==1.35.33 \ + clickhouse-connect==0.8.5 echo ::endgroup:: # Test aws lambda diff --git a/aws/lambda/oss_ci_job_queue_time/requirements.txt b/aws/lambda/oss_ci_job_queue_time/requirements.txt index 800c7ac1cb..3e22fde96f 100644 --- a/aws/lambda/oss_ci_job_queue_time/requirements.txt +++ b/aws/lambda/oss_ci_job_queue_time/requirements.txt @@ -1,3 +1,2 @@ clickhouse_connect==0.8.5 boto3==1.35.33 -pytest==7.4.0 From 7a1b5aad3bc5fde4df8cb47f9ec8d62a534d8ff1 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 17 Mar 2025 22:55:42 -0700 Subject: [PATCH 11/38] 
fix test version --- .github/workflows/tests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ca143a76ee..819f028e41 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -51,8 +51,7 @@ jobs: echo ::group::setup Python environment python -m venv .venv/ source .venv/bin/activate - pip install pip==23.0.1 pytest==7.2.0 boto3==1.35.33 \ - clickhouse-connect==0.8.5 + pip install pip==23.0.1 pytest==7.2.0 boto3==1.35.33 clickhouse-connect==0.8.5 echo ::endgroup:: # Test aws lambda From 3baa9209323721d2c7ff12b2fb702c696ff7b7da Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 18 Mar 2025 13:51:58 -0700 Subject: [PATCH 12/38] fix test version --- .../oss_ci_job_queue_time/lambda_function.py | 53 ++++++++++--------- .../test_lambda_oss_ci_job_queue_time.py | 8 +-- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py index 3c3150e15d..16fb868552 100644 --- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py +++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py @@ -17,15 +17,11 @@ logging.basicConfig(level=logging.INFO) -CLICKHOUSE_ENDPOINT = os.getenv("CLICKHOUSE_ENDPOINT", "") -CLICKHOUSE_USERNAME = os.getenv("CLICKHOUSE_USERNAME", "default") -CLICKHOUSE_PASSWORD = os.getenv("CLICKHOUSE_PASSWORD", "") - @lru_cache() def get_clickhouse_client(host: str, user: str, password: str) -> Any: return clickhouse_connect.get_client( - host=host, user=user, password=password, secure=True + host=host, user=user, password=password, secure=True, verify=False ) @@ -34,13 +30,15 @@ def get_aws_s3_resource() -> Any: return boto3.resource("s3") -def get_clickhouse_client_handler() -> Any: +def get_clickhouse_client_environment() -> Any: for env in ["CLICKHOUSE_ENDPOINT", "CLICKHOUSE_USERNAME", "CLICKHOUSE_PASSWORD"]: if not os.getenv(env): raise ValueError(f"Missing environment variable {env}") return get_clickhouse_client( - host=CLICKHOUSE_ENDPOINT, user=CLICKHOUSE_USERNAME, password=CLICKHOUSE_PASSWORD + host=os.getenv("CLICKHOUSE_ENDPOINT"), + user=os.getenv("CLICKHOUSE_USERNAME"), + password=os.getenv("CLICKHOUSE_PASSWORD"), ) @@ -183,7 +181,7 @@ def lambda_handler(event: Any, context: Any) -> None: """ Main method to run in aws lambda environment """ - db_client = get_clickhouse_client_handler() + db_client = get_clickhouse_client_environment() s3_client = get_aws_s3_resource() QueueTimeProcessor(db_client, s3_client).process() @@ -197,47 +195,50 @@ def parse_args() -> argparse.Namespace: """ parser = argparse.ArgumentParser() parser.add_argument( - "--clickhouse_endpoint", + "--clickhouse-endpoint", + default=os.getenv("CLICKHOUSE_ENDPOINT", ""), type=str, - required=True, help="the clickhouse endpoint, the clickhouse_endpoint name is https://{clickhouse_endpoint}:{port} for full url ", ) parser.add_argument( - "--clickhouse_username", type=str, required=True, help="the clickhouse username" + "--clickhouse-username", + type=str, + default=os.getenv("CLICKHOUSE_USERNAME", ""), + help="the clickhouse username", ) parser.add_argument( - "--clickhouse_password", + "--clickhouse-password", type=str, - required=True, + default=os.getenv("CLICKHOUSE_PASSWORD", ""), help="the clickhouse password for the user name", ) parser.add_argument( - "--dry-run", + "--not-dry-run", action="store_true", - help="when set true, only print instead of writing results to s3", + help="when set true, 
writing results to s3 from local . By default, local run is dry run mode", ) - args,_ = parser.parse_known_args() + args, _ = parser.parse_known_args() return args def main() -> None: """ - Main method to run in local test environment + method to run in local test environment """ arguments = parse_args() - db_client = get_clickhouse_client( - host=arguments.clickhouse_endpoint, - user=arguments.clickhouse_username, - password=arguments.clickhouse_password, - ) - s3_client = get_aws_s3_resource() + # update environment variables for input parameters + os.environ["CLICKHOUSE_ENDPOINT"] = arguments.clickhouse_endpoint + os.environ["CLICKHOUSE_USERNAME"] = arguments.clickhouse_username + os.environ["CLICKHOUSE_PASSWORD"] = arguments.clickhouse_password - # process the queue time events - QueueTimeProcessor(db_client, s3_client, is_dry_run=True).process() + db_client = get_clickhouse_client_environment() + s3_client = get_aws_s3_resource() - return + # always run in dry run mode in local test environment, unless it's disabled. + is_dry_run = not arguments.not_dry_run + QueueTimeProcessor(db_client, s3_client, is_dry_run=is_dry_run).process() if __name__ == "__main__": diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py index c3f394c162..c1d7258b4a 100644 --- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py +++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py @@ -119,7 +119,7 @@ def mock_db_client( mock_client.query.return_value.column_names = result_columns -def setEnvironmentVariables(): +def set_default_env_variables(): os.environ["CLICKHOUSE_ENDPOINT"] = "https://clickhouse.test1" os.environ["CLICKHOUSE_USERNAME"] = "user1" os.environ["CLICKHOUSE_PASSWORD"] = "pwd1" @@ -133,7 +133,7 @@ def test_lambda_handler_when_row_result_is_empty( ): print("test_lambda_handler_when_row_result_is_empty ") # prepare - setEnvironmentVariables() + set_default_env_variables() mock_s3_resource_put(mock_s3_resource) mock_db_client(mock_get_client, result_rows=[]) @@ -152,7 +152,7 @@ def test_lambda_handler_when_lambda_happy_flow_then_success( self, mock_get_client, mock_s3_resource ): # prepare - setEnvironmentVariables() + set_default_env_variables() mock_s3_resource_put(mock_s3_resource) mock_db_client(mock_get_client) @@ -198,7 +198,7 @@ def test_lambda_handler_when_missing_required_env_vars_then_throws_error( mock_get_client.reset_mock(return_value=True) mock_s3_resource.reset_mock(return_value=True) - setEnvironmentVariables() + set_default_env_variables() os.environ[x] = "" # execute From ca243764f603cf8f27ee81a065adcb750d41d86b Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 18 Mar 2025 20:05:09 -0700 Subject: [PATCH 13/38] fix sync --- .../oss_ci_job_queue_time/lambda_function.py | 184 +++++++++++++----- torchci/pages/api/clickhouse/[queryName].ts | 1 + 2 files changed, 131 insertions(+), 54 deletions(-) diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py index 16fb868552..f36a016692 100644 --- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py +++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py @@ -5,10 +5,10 @@ import logging import os import gzip -import sys import boto3 # type: ignore[import] import clickhouse_connect +from datetime import datetime, time # Local imports from functools import lru_cache @@ -17,6 +17,34 @@ logging.basicConfig(level=logging.INFO) +_bucket_name = "ossci-raw-job-status" +_in_queue_job_select_statement = """ 
+SELECT + DATE_DIFF( + 'second', + job.created_at, + {timestamp:DateTime} + ) AS queue_s, + workflow.repository.'full_name' AS repo, + workflow.name AS workflow_name, + job.name AS job_name, + job.html_url, + IF( + LENGTH(job.labels) = 0, + 'N/A', + IF( + LENGTH(job.labels) > 1, + job.labels[2], + job.labels[1] + ) + ) AS machine_type, + toUnixTimestamp({timestamp:DateTime}) AS time, + toUnixTimestamp(job.started_at) as started_at, + toUnixTimestamp(job.created_at) as created_at +FROM + default.workflow_job job FINAL + JOIN default.workflow_run workflow FINAL ON workflow.id = job.run_id +""" @lru_cache() def get_clickhouse_client(host: str, user: str, password: str) -> Any: @@ -64,53 +92,78 @@ def upload_to_s3_txt( ) info(f"Done! Finish writing document to S3 {bucket_name}/{key} ") +def query_picked_up_job_for_given_snapshot(time:str,repo: str = 'pytorch/pytorch'): + """ + this query is used to get jobs that were in queue in given snapshot time, but were picked up by workers later + """ + s1 = """ + WITH possible_queued_jobs AS ( + SELECT + id, + run_id, + started_at, + created_at + FROM default.workflow_job -- FINAL not needed since we just use this to filter a table that has already been FINALed + WHERE + started_at > ({timestamp:DateTime}) + AND created_at < ({timestamp:DateTime} - INTERVAL 5 MINUTE) + AND created_at > ({timestamp:DateTime} - INTERVAL 1 WEEK) + )""" + + s2 = """ + WHERE + job.id IN (SELECT id FROM possible_queued_jobs) + AND workflow.id IN (SELECT run_id FROM possible_queued_jobs) + AND workflow.repository.'full_name' = {repo:String} + AND job.status = 'completed' + AND LENGTH(job.steps) != 0 + AND workflow.status = 'completed' + ORDER BY + queue_s DESC + """ + query = s1 + _in_queue_job_select_statement + s2 + + parameters={ + 'timestamp': time , + 'repo': repo, + } + return query,parameters -def query_in_queue_jobs_now() -> str: - query = """ +def query_in_queue_jobs_for_given_snapshot(time:str, repo:str = 'pytorch/pytorch'): + """ + this query is used to get jobs that werre in queue in given snapshot time, and not being picked up by workers + """ + s1 = """ WITH possible_queued_jobs AS ( SELECT id, - run_id + run_id, + started_at, + created_at FROM default.workflow_job -- FINAL not needed since we just use this to filter a table that has already been FINALed WHERE status = 'queued' - AND created_at < (CURRENT_TIMESTAMP() - INTERVAL 5 MINUTE) - AND created_at > (CURRENT_TIMESTAMP() - INTERVAL 1 WEEK) + AND created_at < ({timestamp:DateTime} - INTERVAL 5 MINUTE) + AND created_at > ({timestamp:DateTime} - INTERVAL 1 WEEK) ) - SELECT - DATE_DIFF( - 'second', - job.created_at, - CURRENT_TIMESTAMP() - ) AS queue_s, - workflow.repository.'full_name' AS repo, - workflow.name AS workflow_name, - job.name AS job_name, - job.html_url, - IF( - LENGTH(job.labels) = 0, - 'N/A', - IF( - LENGTH(job.labels) > 1, - job.labels[2], - job.labels[1] - ) - ) AS machine_type, - toUnixTimestamp(CURRENT_TIMESTAMP()) AS time - FROM - default.workflow_job job FINAL - JOIN default.workflow_run workflow FINAL ON workflow.id = job.run_id + """ + s2 =""" WHERE job.id IN (SELECT id FROM possible_queued_jobs) AND workflow.id IN (SELECT run_id FROM possible_queued_jobs) - AND workflow.repository.'full_name' = 'pytorch/pytorch' + AND workflow.repository.'full_name' = {repo:String} AND job.status = 'queued' AND LENGTH(job.steps) = 0 AND workflow.status != 'completed' ORDER BY - queue_s DESC """ - return query - + queue_s DESC + """ + query = s1 + _in_queue_job_select_statement + s2 + parameters={ + 
'timestamp': time , + 'repo': repo, + } + return query, parameters class QueueTimeProcessor: """ @@ -120,7 +173,6 @@ class QueueTimeProcessor: processor = QueueTimeProcessor(clickhouse_client,s3_client) processor.process() """ - def __init__( self, clickhouse_client: Any, s3_client: Any, is_dry_run: bool = False ) -> None: @@ -131,33 +183,57 @@ def __init__( def process(self) -> None: self.proceses_job_queue_times_historical() - def proceses_job_queue_times_historical(self) -> None: - jobs_in_queue = self.get_jobs_in_queue_now() + def proceses_job_queue_times_historical(self, timestamp:str = "", repo: str = 'pytorch/pytorch') -> None: + # by default, we use current time as snapshot + snapshot_time = str(int(datetime.now().timestamp())) + if timestamp: + snapshot_time = timestamp - if len(jobs_in_queue) == 0: - info("No jobs in queue now, skipping writing to s3") - return - info(f"Found {len(jobs_in_queue)} jobs in queue now") - info(f"Peeking data: {jobs_in_queue[0]}") + # fetches jobs that were in queue in given snapshot time, that are not being picked up by workers + queued_query, queued_parameters = query_in_queue_jobs_for_given_snapshot(timestamp,repo) + jobs_in_queue = self.process_in_queue_jobs(queued_query, queued_parameters) - bucket_name = "ossci-raw-job-status" - repo = jobs_in_queue[0]["repo"] - time = jobs_in_queue[0]["time"] + # fetches jobs that were in queue in given snapshot time, but were picked up by workers later of given snapshot time + picked_query, picked_params = query_picked_up_job_for_given_snapshot(timestamp,repo) + jobs_pick = self.process_in_queue_jobs(picked_query, picked_params) - key = f"job_queue_times_historical/{repo}/{time}.txt" + datetime_str = datetime.fromtimestamp(int(timestamp)).strftime('%Y-%m-%d %H:%M:%S') + print(datetime_str,timestamp,len(jobs_in_queue),len(jobs_pick),) + + info(f"Found {len(jobs_in_queue)} jobs in queue, and {len(jobs_pick)} jobs was in queue but picked up by workers later") + if len(jobs_in_queue) == 0 and len(jobs_pick) == 0: + info(f"No jobs in queue at time {datetime_str}, skipping mutation to S3") + return + key = f"job_queue_times_historical/{repo}/{timestamp}.txt" + result = jobs_in_queue + jobs_pick if self.is_dry_run: info( - f"[Dry Run Mode]: {len(jobs_in_queue)} records to S3 {bucket_name}/{key}" + f"[Dry Run Mode]: {len(result)} records to S3 {_bucket_name}/{key}" ) - info(json.dumps(jobs_in_queue, indent=4)) + print(json.dumps(result)) return - upload_to_s3_txt(self.s3_client, bucket_name, key, jobs_in_queue) - - def get_jobs_in_queue_now(self) -> list[dict[str, Any]]: - reader = self.clickhouse_client.query(query_in_queue_jobs_now()) + upload_to_s3_txt(self.s3_client, _bucket_name, key, result) + + def process_in_queue_jobs(self, queryStr:str, parameters:Any) -> list[dict[str, Any]]: + """ + post query process to remove duplicated jobs + this is bc clickhouse client returns duplicated jobs for some reason + """ + seen = set() + db_resp = self.query(queryStr, parameters) + result = [] + for record in db_resp: + if record['html_url']in seen: + continue + seen.add(record['html_url']) + result.append(record) + return result + + def query(self, query, params={}) -> list[dict[str, Any]]: + reader = self.clickhouse_client.query(query, params) # clickhouse returns a generator to return column names and rows # see https://clickhouse.com/docs/integrations/python#the-queryresult-object column_names = reader.column_names @@ -215,7 +291,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--not-dry-run", 
action="store_true", - help="when set true, writing results to s3 from local . By default, local run is dry run mode", + help="when set, writing results to s3 from local environment. By default, we run in dry-run mode for local environment", ) args, _ = parser.parse_known_args() return args @@ -236,10 +312,10 @@ def main() -> None: db_client = get_clickhouse_client_environment() s3_client = get_aws_s3_resource() - # always run in dry run mode in local test environment, unless it's disabled. + # always run in dry-run mode in local environment, unless it's disabled. is_dry_run = not arguments.not_dry_run - QueueTimeProcessor(db_client, s3_client, is_dry_run=is_dry_run).process() + QueueTimeProcessor(db_client, s3_client, is_dry_run=is_dry_run).process() if __name__ == "__main__": main() diff --git a/torchci/pages/api/clickhouse/[queryName].ts b/torchci/pages/api/clickhouse/[queryName].ts index e0461e5982..01c4f0f51d 100644 --- a/torchci/pages/api/clickhouse/[queryName].ts +++ b/torchci/pages/api/clickhouse/[queryName].ts @@ -10,5 +10,6 @@ export default async function handler( queryName, JSON.parse(req.query.parameters as string) ); + res.status(200).json(response); } From c64762f4531f1ddcdff4419fac232ea4665012c2 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 18 Mar 2025 20:59:40 -0700 Subject: [PATCH 14/38] fix sync --- .../oss_ci_job_queue_time/lambda_function.py | 76 +++++++++++-------- .../test_lambda_oss_ci_job_queue_time.py | 30 +++++--- 2 files changed, 62 insertions(+), 44 deletions(-) diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py index f36a016692..e26e3fc9b7 100644 --- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py +++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py @@ -39,13 +39,12 @@ ) ) AS machine_type, toUnixTimestamp({timestamp:DateTime}) AS time, - toUnixTimestamp(job.started_at) as started_at, - toUnixTimestamp(job.created_at) as created_at FROM default.workflow_job job FINAL JOIN default.workflow_run workflow FINAL ON workflow.id = job.run_id """ + @lru_cache() def get_clickhouse_client(host: str, user: str, password: str) -> Any: return clickhouse_connect.get_client( @@ -92,9 +91,10 @@ def upload_to_s3_txt( ) info(f"Done! 
Finished writing document to S3 {bucket_name}/{key} ")


-def query_picked_up_job_for_given_snapshot(time:str,repo: str = 'pytorch/pytorch'):
+def query_picked_up_job_for_given_snapshot(time: str, repo: str = "pytorch/pytorch"):
     """
-    this query is used to get jobs that were in queue in given snapshot time, but were picked up by workers later
+    this query is used to get jobs that were in queue at the given snapshot time, but were picked up by workers later
     """
     s1 = """
     WITH possible_queued_jobs AS (
@@ -123,15 +123,16 @@ def query_picked_up_job_for_given_snapshot(time:str,repo: str = 'pytorch/pytorch'):
     """
     query = s1 + _in_queue_job_select_statement + s2

-    parameters={
-        'timestamp': time ,
-        'repo': repo,
+    parameters = {
+        "timestamp": time,
+        "repo": repo,
     }
-    return query,parameters
+    return query, parameters


-def query_in_queue_jobs_for_given_snapshot(time:str, repo:str = 'pytorch/pytorch'):
+def query_in_queue_jobs_for_given_snapshot(time: str, repo: str = "pytorch/pytorch"):
     """
-    this query is used to get jobs that werre in queue in given snapshot time, and not being picked up by workers
+    this query is used to get jobs that were in queue at the given snapshot time, and were not picked up by workers
     """
     s1 = """
     WITH possible_queued_jobs AS (
@@ -147,7 +148,7 @@ def query_in_queue_jobs_for_given_snapshot(time:str, repo:str = 'pytorch/pytorch'):
         AND created_at > ({timestamp:DateTime} - INTERVAL 1 WEEK)
     )
     """
-    s2 ="""
+    s2 = """
     WHERE
         job.id IN (SELECT id FROM possible_queued_jobs)

     query = s1 + _in_queue_job_select_statement + s2
-    parameters={
-        'timestamp': time ,
-        'repo': repo,
+    parameters = {
+        "timestamp": time,
+        "repo": repo,
     }
     return query, parameters
+
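# Illustrative sketch, not part of the patch: the {timestamp:DateTime} and
# {repo:String} placeholders in the builders above are bound by clickhouse_connect
# from the returned parameters dict, so a caller would use them roughly like this
# (credentials are placeholders; the literal timestamp is a sample value borrowed
# from the tests):
example_client = get_clickhouse_client(host="...", user="...", password="...")
example_query, example_params = query_in_queue_jobs_for_given_snapshot("1742262372")
example_reader = example_client.query(example_query, example_params)
example_rows = example_reader.result_rows  # tuples ordered like example_reader.column_names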
class QueueTimeProcessor:
     """
     this class is used to handle oss ci queue time data aggregation. Currently it fetches in-queue jobs from clickhouse at a given snapshot time
@@ -173,6 +175,7 @@ class QueueTimeProcessor:
     To run the main method:
     processor = QueueTimeProcessor(clickhouse_client,s3_client)
     processor.process()
     """
+
     def __init__(
         self, clickhouse_client: Any, s3_client: Any, is_dry_run: bool = False
     ) -> None:

     def process(self) -> None:
         self.proceses_job_queue_times_historical()

-    def proceses_job_queue_times_historical(self, timestamp:str = "", repo: str = 'pytorch/pytorch') -> None:
+    def proceses_job_queue_times_historical(
+        self, snapshot_time: str = "", repo: str = "pytorch/pytorch"
+    ) -> None:
         # by default, we use the current time as the snapshot
-        snapshot_time = str(int(datetime.now().timestamp()))
-        if timestamp:
-            snapshot_time = timestamp
-
+        timestamp = str(int(datetime.now().timestamp()))
+        if snapshot_time:
+            timestamp = snapshot_time
         # fetch jobs that were in queue at the given snapshot time and had not yet been picked up by workers
-        queued_query, queued_parameters = query_in_queue_jobs_for_given_snapshot(timestamp,repo)
+        queued_query, queued_parameters = query_in_queue_jobs_for_given_snapshot(
+            timestamp, repo
+        )
         jobs_in_queue = self.process_in_queue_jobs(queued_query, queued_parameters)

         # fetch jobs that were in queue at the given snapshot time but were picked up by workers after it
-        picked_query, picked_params = query_picked_up_job_for_given_snapshot(timestamp,repo)
+        picked_query, picked_params = query_picked_up_job_for_given_snapshot(
+            timestamp, repo
+        )
         jobs_pick = self.process_in_queue_jobs(picked_query, picked_params)

-        datetime_str = datetime.fromtimestamp(int(timestamp)).strftime('%Y-%m-%d %H:%M:%S')
-        print(datetime_str,timestamp,len(jobs_in_queue),len(jobs_pick),)
-
+        datetime_str = datetime.fromtimestamp(int(timestamp)).strftime(
+            "%Y-%m-%d %H:%M:%S"
+        )
         info(
             f"[Snapshot time:{datetime_str}]. Found {len(jobs_in_queue)} jobs in queue, and {len(jobs_pick)} jobs were in queue but picked up by workers later"
         )
         if len(jobs_in_queue) == 0 and len(jobs_pick) == 0:
             info(f"No jobs in queue at time {datetime_str}, skipping mutation to S3")
             return

         key = f"job_queue_times_historical/{repo}/{timestamp}.txt"
         result = jobs_in_queue + jobs_pick
         if self.is_dry_run:
-            info(
-                f"[Dry Run Mode]: {len(result)} records to S3 {_bucket_name}/{key}"
-            )
+            info(f"[Dry Run Mode]: {len(result)} records to S3 {_bucket_name}/{key}")
             print(json.dumps(result))
             return

         upload_to_s3_txt(self.s3_client, _bucket_name, key, result)
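# Illustrative sketch, not part of the patch: each snapshot is uploaded as one
# gzip-compressed file of newline-delimited JSON, so a consumer could read a
# snapshot back with standard boto3/gzip calls (the key below is an example
# built from the layout above):
import gzip
import json

import boto3

snapshot = boto3.resource("s3").Object(
    "ossci-raw-job-status",
    "job_queue_times_historical/pytorch/pytorch/1742262372.txt",  # example key
)
snapshot_body = gzip.decompress(snapshot.get()["Body"].read()).decode()
snapshot_records = [json.loads(line) for line in snapshot_body.splitlines()]  # one dict per job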
-    def process_in_queue_jobs(self, queryStr:str, parameters:Any) -> list[dict[str, Any]]:
+    def process_in_queue_jobs(
+        self, queryStr: str, parameters: Any
+    ) -> list[dict[str, Any]]:
         """
         post-query processing to remove duplicated jobs;
         this is because the clickhouse client returns duplicated jobs for some reason
         """
         seen = set()
         db_resp = self.query(queryStr, parameters)
         result = []
+
         for record in db_resp:
-            if record['html_url']in seen:
+            if record["html_url"] in seen:
                 continue
-            seen.add(record['html_url'])
+            seen.add(record["html_url"])
             result.append(record)
         return result

diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
--- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
+++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
 import unittest
 import os
 import gzip

-from typing import Any, List, Tuple
-from unittest.mock import patch
+from typing import Any, List, Tuple, Dict
+from unittest.mock import patch, MagicMock
 from oss_ci_job_queue_time.lambda_function import (
     lambda_handler,
-    get_aws_s3_resource,
-    get_clickhouse_client,
 )

+def mock_query_result(query: str, parameters: str, rows_in_queue: List[Tuple], rows_picked: List[Tuple]) -> Any:
+    result = MagicMock()
+    if "LENGTH(job.steps) = 0" in query:
+        result.column_names = get_default_result_columns()
+        result.result_rows = rows_in_queue
+    if "LENGTH(job.steps) != 0" in query:
+        result.column_names = get_default_result_columns()
+        result.result_rows = rows_picked
+    return result

 def mock_db_client(
     mock: Any,
-    result_rows: List[Tuple] = get_default_result_rows(),
-    result_columns: Tuple = get_default_result_columns(),
+    rows_in_queue: List[Tuple] = get_default_result_rows(),
+    rows_picked: List[Tuple] = [],
 ) -> None:
     mock_client = mock.return_value
-    mock_client.query.return_value.result_rows = result_rows
-    mock_client.query.return_value.column_names = result_columns
+    mock_client.query.side_effect = (
+        lambda query, parameters: mock_query_result(query, parameters, rows_in_queue, rows_picked)
+    )

         # prepare
         set_default_env_variables()
         mock_s3_resource_put(mock_s3_resource)
-        mock_db_client(mock_get_client, result_rows=[])
+        mock_db_client(mock_get_client, [], [])

         # assert clickhouse client
         mock_get_client.assert_called_once()
-        mock_get_client.return_value.query.assert_called_once()
+        self.assertEqual(mock_get_client.return_value.query.call_count, 2)

From 2814b4b7025b38cc2f061577416902b660247ab5 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Tue, 18 Mar 2025 21:02:16 -0700
Subject: [PATCH 15/38] fix sync
---
 .../tests/test_lambda_oss_ci_job_queue_time.py | 31 ++++++++++++-------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
--- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
+++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
 import unittest
 import os
 import gzip

 from typing import Any, List, Tuple, Dict
-from unittest.mock import patch,MagicMock
+from unittest.mock import patch, MagicMock
 from oss_ci_job_queue_time.lambda_function import (
     lambda_handler,
 )

 def get_default_result_rows(test_sample: str = "0"):
     """
     generate result rows for testing, this corresponds to the following columns:
     'queue_s', 'repo', 'workflow_name', 'job_name', 'html_url', 'machine_type', 'time'
     """
-    if (test_sample == "0"):
+    if test_sample == "0":
         return [
             (
                 60000,
                 "pytorch/pytorch",
                 "workflow-name-1",
                 "job-name-1",
                 "runs/1/job/1",
                 "linux.aws.h100",
                 1742262372,
             ),
             (
                 1400,
                 "pytorch/pytorch",
                 "workflow-name-2",
                 "job-name-2",
                 "runs/2/job/2",
                 "linux.rocm.gpu.2",
                 1742262372,
-            )]
+            ),
+        ]

     return [
         (
             60000,
             "pytorch/pytorch",
             "inductor-h100",
             "test1 (h100, 5, 5, linux.aws.h100)",
             "runs/1/job/1",
             "linux.aws.h100",
             1742262372,
         ),
         (
             50000,
             "pytorch/pytorch",
             "inductor-h100",
             "test1 (h100, 5, 5, linux.aws.h100)",
             "runs/1/job/2",
             "linux.aws.h100",
             1742262372,
         ),
         (
             55000,
             "pytorch/pytorch",
             "inductor-h100",
             "test1 (h100, 2, 6, linux.aws.h100)",
             "runs/1/job/3",
             "linux.aws.h100",
             1742262372,
         ),
         (
             1729,
             "pytorch/pytorch",
             "inductor-h100",
             "test2 (h100, 1, 1, linux.aws.h100)",
             "runs/2/job/1",
             "linux.aws.h100",
             1742262372,
         ),
         (
             1352,
             "pytorch/pytorch",
             "inductor-rocm",
             "rocm-test1(1, 1, linux.rocm.gpu.2)",
             "runs/3/job/1",
             "linux.rocm.gpu.2",
             1742262372,
         ),
         (
             1400,
             "pytorch/pytorch",
             "inductor-rocm",
             "rocm-test1 (1, 1, linux.rocm.gpu.2)",
             "runs/4/job/2",
             "linux.rocm.gpu.2",
             1742262372,
         ),
     ]

-def get_default_result_columns() -> Tuple:
-    return ("queue_s", "repo", "workflow_name", "job_name", "html_url", "machine_type","time")
+def get_default_result_columns() -> Tuple:
+    return (
+        "queue_s",
+        "repo",
+        "workflow_name",
+        "job_name",
+        "html_url",
+        "machine_type",
+        "time",
+    )

-def mock_query_result(query: str, parameters: str, rows_in_queue: List[Tuple], rows_picked: List[Tuple]) -> Any:
+def mock_query_result(
+    query: str, parameters: str, rows_in_queue: List[Tuple], rows_picked: List[Tuple]
+) -> Any:
     result = MagicMock()
     if "LENGTH(job.steps) = 0" in query:
         result.column_names = get_default_result_columns()
         result.result_rows = rows_in_queue
     if "LENGTH(job.steps) != 0" in query:
         result.column_names = get_default_result_columns()
         result.result_rows = rows_picked
     return result

 def mock_db_client(
     mock: Any,
     rows_in_queue: List[Tuple] = get_default_result_rows(),
     rows_picked: List[Tuple] = [],
 ) -> None:
     mock_client = mock.return_value
-    mock_client.query.side_effect = (
-        lambda query, parameters: mock_query_result(query,parameters, rows_in_queue, rows_picked)
-    )
+    mock_client.query.side_effect = lambda query, parameters: mock_query_result(
+        query, parameters, rows_in_queue, rows_picked
+    )

 def set_default_env_variables():
     os.environ["CLICKHOUSE_ENDPOINT"] = "https://clickhouse.test1"
     os.environ["CLICKHOUSE_USERNAME"] = "user1"
     os.environ["CLICKHOUSE_PASSWORD"] = "pwd1"

 class Test(unittest.TestCase):
     @patch("oss_ci_job_queue_time.lambda_function.get_aws_s3_resource")
     @patch("oss_ci_job_queue_time.lambda_function.get_clickhouse_client")
     def 
From 2814b4b7025b38cc2f061577416902b660247ab5 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Tue, 18 Mar 2025 21:02:16 -0700
Subject: [PATCH 15/38] fix sync

---
 .../test_lambda_oss_ci_job_queue_time.py      | 31 ++++++++++++++-----
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
index 3de3e066d8..8ee60e0a1c 100644
--- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
+++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
@@ -3,7 +3,7 @@
 import gzip
 
 from typing import Any, List, Tuple, Dict
-from unittest.mock import patch,MagicMock
+from unittest.mock import patch, MagicMock
 from oss_ci_job_queue_time.lambda_function import (
     lambda_handler,
 )
@@ -14,7 +14,7 @@ def get_default_result_rows(test_sample: str = "0"):
     generate result rows for testing, this corresponds to the following columns:
     'queue_s', 'repo', 'workflow_name', 'job_name', 'html_url', 'machine_type', 'time'
     """
-    if (test_sample == "0"):
+    if test_sample == "0":
         return [
             (
                 60000,
@@ -33,7 +33,8 @@ def get_default_result_rows(test_sample: str = "0"):
                 "runs/2/job/2",
                 "linux.rocm.gpu.2",
                 1742262372,
-            )]
+            ),
+        ]
 
     return [
         (
@@ -92,10 +93,22 @@ def get_default_result_rows(test_sample: str = "0"):
         ),
     ]
 
+
 def get_default_result_columns() -> Tuple:
-    return ("queue_s", "repo", "workflow_name", "job_name", "html_url", "machine_type","time")
+    return (
+        "queue_s",
+        "repo",
+        "workflow_name",
+        "job_name",
+        "html_url",
+        "machine_type",
+        "time",
+    )
 
-def mock_query_result(query: str, parameters:str, rows_in_queue: List[Tuple], rows_picked: List[Tuple]) ->Any:
+
+def mock_query_result(
+    query: str, parameters: str, rows_in_queue: List[Tuple], rows_picked: List[Tuple]
+) -> Any:
     result = MagicMock()
     if "LENGTH(job.steps) = 0" in query:
         result.column_names = get_default_result_columns()
         result.result_rows = rows_in_queue
@@ -105,6 +118,7 @@ def mock_query_result(query: str, parameters:str, ro
         result.result_rows = rows_picked
     return result
 
+
 def mock_s3_resource_put(mock_s3_resource: Any) -> None:
     mock_s3 = mock_s3_resource.return_value
     mock_object = mock_s3.Object.return_value
@@ -129,10 +143,11 @@ def mock_db_client(
     rows_picked: List[Tuple] = [],
 ) -> None:
     mock_client = mock.return_value
-    mock_client.query.side_effect = (
-        lambda query, parameters: mock_query_result(query,parameters, rows_in_queue, rows_picked)
-    )
+    mock_client.query.side_effect = lambda query, parameters: mock_query_result(
+        query, parameters, rows_in_queue, rows_picked
+    )
 
+
 def set_default_env_variables():
     os.environ["CLICKHOUSE_ENDPOINT"] = "https://clickhouse.test1"
     os.environ["CLICKHOUSE_USERNAME"] = "user1"
     os.environ["CLICKHOUSE_PASSWORD"] = "pwd1"
@@ -150,7 +165,7 @@ def test_lambda_handler_when_row_result_is_empty(
         # prepare
         set_default_env_variables()
         mock_s3_resource_put(mock_s3_resource)
-        mock_db_client(mock_get_client,[],[])
+        mock_db_client(mock_get_client, [], [])
 
         # execute
         lambda_handler(None, None)
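The side_effect wiring above replaces a single canned query result with per-query routing: mock_query_result inspects the SQL text and returns queued rows for the queued-jobs query (LENGTH(job.steps) = 0) and picked rows for the picked-up query (LENGTH(job.steps) != 0). A minimal self-contained sketch of that routing using only unittest.mock — names and row values here are illustrative, not from the test file:

    # Sketch of routing a mocked clickhouse client by query substring.
    from unittest.mock import MagicMock


    def route(query: str, rows_in_queue, rows_picked):
        result = MagicMock()
        if "LENGTH(job.steps) = 0" in query:
            result.result_rows = rows_in_queue  # jobs still waiting for a runner
        if "LENGTH(job.steps) != 0" in query:
            result.result_rows = rows_picked  # jobs picked up after the snapshot
        return result


    client = MagicMock()
    client.query.side_effect = lambda query, parameters: route(query, ["queued"], ["picked"])
    assert client.query("... LENGTH(job.steps) = 0 ...", {}).result_rows == ["queued"]
    assert client.query("... LENGTH(job.steps) != 0 ...", {}).result_rows == ["picked"]
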
From ec39f93bbbd7b8350da11b9e870c951e26f70b8c Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Tue, 18 Mar 2025 21:10:40 -0700
Subject: [PATCH 16/38] fix sync

---
 .../oss_ci_job_queue_time/lambda_function.py  |  5 +++-
 .../test_lambda_oss_ci_job_queue_time.py      | 29 +++++++++++++++++--
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index e26e3fc9b7..eb00b37f54 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -18,6 +18,7 @@
 logging.basicConfig(level=logging.INFO)
 
 _bucket_name = "ossci-raw-job-status"
+# common query statement for in_queue jobs
 _in_queue_job_select_statement = """
 SELECT
     DATE_DIFF(
@@ -209,11 +210,13 @@ def proceses_job_queue_times_historical(
         datetime_str = datetime.fromtimestamp(int(timestamp)).strftime(
             "%Y-%m-%d %H:%M:%S"
         )
+
         info(
             f"[Snapshot time:{datetime_str}]. Found {len(jobs_in_queue)} jobs in queue, and {len(jobs_pick)} jobs were in queue but picked up by workers later"
         )
+
         if len(jobs_in_queue) == 0 and len(jobs_pick) == 0:
-            info(f"No jobs in queue at time {datetime_str}, skipping mutation to S3")
+            info(f"No jobs were in queue at time {datetime_str}, skipping")
             return
 
         key = f"job_queue_times_historical/{repo}/{timestamp}.txt"
diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
index 8ee60e0a1c..6fbcbc4f74 100644
--- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
+++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
@@ -4,9 +4,7 @@
 
 from typing import Any, List, Tuple, Dict
 from unittest.mock import patch, MagicMock
-from oss_ci_job_queue_time.lambda_function import (
-    lambda_handler,
-)
+from oss_ci_job_queue_time.lambda_function import lambda_handler, main
 
 
 def get_default_result_rows(test_sample: str = "0"):
@@ -233,6 +231,31 @@ def test_lambda_handler_when_missing_required_env_vars_then_throws_error(
                 mock_s3_resource
             ).return_value.put.assert_not_called()
 
+    @patch("oss_ci_job_queue_time.lambda_function.get_aws_s3_resource")
+    @patch("oss_ci_job_queue_time.lambda_function.get_clickhouse_client")
+    def test_local_run_with_dry_run_when_lambda_happy_flow_then_success_without_s3_write(
+        self, mock_get_client, mock_s3_resource
+    ):
+        # prepare
+        set_default_env_variables()
+        mock_s3_resource_put(mock_s3_resource)
+        mock_db_client(mock_get_client)
+
+        # execute
+        main()
+
+        # assert
+
+        # assert clickhouse client
+        mock_get_client.assert_called_once()
+        self.assertEqual(mock_get_client.return_value.query.call_count, 2)
+
+        # assert s3 resource
+        mock_s3_resource.assert_called_once()
+        get_mock_s3_resource_object(
+            mock_s3_resource
+        ).return_value.put.assert_not_called()
+
 
 if __name__ == "__main__":
     unittest.main()
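The two queries these patches keep refining implement one idea: a snapshot at time T covers every job created before T that had not started running by T, split into jobs still queued at T and jobs a runner picked up only after T. A small pure-Python sketch of that classification, with hypothetical job dicts and unix-second timestamps:

    # Sketch of the snapshot classification behind the queued/picked-up queries.
    def classify(jobs, t):
        in_queue = [
            j for j in jobs if j["created_at"] <= t and j["started_at"] is None
        ]
        picked_later = [
            j
            for j in jobs
            if j["created_at"] <= t
            and j["started_at"] is not None
            and j["started_at"] > t
        ]
        return in_queue, picked_later


    jobs = [
        {"name": "a", "created_at": 100, "started_at": None},  # still queued at t=200
        {"name": "b", "created_at": 100, "started_at": 300},  # picked up after t=200
        {"name": "c", "created_at": 100, "started_at": 150},  # already running at t=200
    ]
    in_queue, picked_later = classify(jobs, 200)
    assert [j["name"] for j in in_queue] == ["a"]
    assert [j["name"] for j in picked_later] == ["b"]
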
From 893e7191f999a1ea73af16527de9144ee50efd58 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Tue, 18 Mar 2025 23:34:40 -0700
Subject: [PATCH 17/38] fix sync

---
 .../oss_ci_job_queue_time/lambda_function.py  | 365 ++++++++++++++++--
 .../oss_ci_job_queue_time/requirements.txt    |   3 +
 .../test_lambda_oss_ci_job_queue_time.py      |  89 +++--
 3 files changed, 390 insertions(+), 67 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index eb00b37f54..019bada68f 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -5,6 +5,9 @@
 import logging
 import os
 import gzip
+import threading
+import dateutil.parser
+import yaml
 
 import boto3  # type: ignore[import]
 import clickhouse_connect
@@ -13,12 +16,14 @@
 # Local imports
 from functools import lru_cache
 from logging import info
-from typing import Any
+from typing import Any, Optional, Dict, Set, Iterable, List, Tuple
+from github import Github, Auth
+from dateutil.parser import parse
+
 
 logging.basicConfig(level=logging.INFO)
 
 _bucket_name = "ossci-raw-job-status"
 
 _in_queue_job_select_statement = """
 SELECT
     DATE_DIFF(
@@ -39,7 +44,7 @@
             job.labels[1]
         )
     ) AS machine_type,
-    toUnixTimestamp({timestamp:DateTime}) AS time,
+    toUnixTimestamp({timestamp:DateTime}) AS time
 FROM
     default.workflow_job job FINAL
     JOIN default.workflow_run workflow FINAL ON workflow.id = job.run_id
@@ -93,7 +98,248 @@ def upload_to_s3_txt(
     info(f"Done! Finish writing document to S3 {bucket_name}/{key} ")
 
 
+class LazyFileHistory:
+    """
+    Reads the content of a file from a GitHub repository as of the version at a given date and time. It caches the fetched commits and file contents to avoid unnecessary requests to the GitHub API.
+    All public methods are thread-safe.
+    """
+
+    def __init__(self, repo: Any, path: str) -> None:
+        self.repo = repo
+        self.path = path
+        self._commits_cache = []
+        self._content_cache = {}
+        self._fetched_all_commits = False
+        self._lock = threading.RLock()
+
+    def is_unix_timestamp(self, value: str) -> bool:
+        """Check if the string is a valid Unix timestamp."""
+        if value.isdigit():  # Ensure it's numeric
+            try:
+                timestamp = int(value)
+                # Check that it can be interpreted as a datetime
+                datetime.fromtimestamp(timestamp)
+                return True
+            except (ValueError, OSError):
+                return False
+        return False
+
+    def get_version_after_timestamp(self, timestamp: str | datetime) -> Optional[str]:
+        try:
+            with self._lock:
+                if not isinstance(timestamp, datetime):
+                    if self.is_unix_timestamp(timestamp):
+                        timestamp = datetime.fromtimestamp(
+                            float(timestamp)
+                        ).astimezone()
+                    else:
+                        timestamp = parse(timestamp)
+                commit = self._find_earliest_after_in_cache(timestamp)
+                if commit:
+                    return self._fetch_content_for_commit(commit)
+
+                if not self._fetched_all_commits:
+                    commit = self._fetch_until_timestamp(timestamp)
+                    if commit:
+                        return self._fetch_content_for_commit(commit)
+        except Exception as e:
+            print(
+                f"Error fetching content for {self.repo} : {self.path} at {timestamp}: {e}"
+            )
+
+        return None
+
+    def _find_earliest_after_in_cache(self, timestamp: datetime) -> Optional[str]:
+        commits_after = [
+            c for c in self._commits_cache if c.commit.author.date > timestamp
+        ]
+        if not commits_after:
+            return None
+        return commits_after[-1]
+
+    def _fetch_until_timestamp(self, timestamp: datetime) -> Optional[str]:
+        all_commits = self.repo.get_commits(path=self.path)
+        known_shas = {c.sha for c in self._commits_cache}
+
+        newly_fetched = []
+
+        for commit in all_commits:
+            if commit.sha in known_shas:
+                break
+            newly_fetched.append(commit)
+
+            if commit.commit.author.date <= timestamp:
+                break
+
+        self._commits_cache.extend(newly_fetched)
+        self._commits_cache.sort(key=lambda c: c.commit.author.date, reverse=True)
+
+        if not newly_fetched:
+            self._fetched_all_commits = True
+
+        return self._find_earliest_after_in_cache(timestamp)
+
+    def _fetch_content_for_commit(self, commit: Any) -> str:
+        if commit.sha not in self._content_cache:
+            print(
f"Fetching content for {self.repo} : {self.path} at {commit.commit.author.date} - {commit.sha}" + ) + # We can retrieve the file content at a specific commit + file_content = self.repo.get_contents( + self.path, ref=commit.sha + ).decoded_content.decode() + self._content_cache[commit.sha] = file_content + return self._content_cache[commit.sha] + + +def explode_runner_variants( + runner_configs: Dict[str, Dict[str, Any]] +) -> Dict[str, Dict[str, Any]]: + runner_types_list = [i for i in runner_configs["runner_types"].items()] + + for runner, runner_config in runner_types_list: + if "variants" in runner_config: + for variant, variant_config in runner_config["variants"].items(): + if runner.startswith("lf."): + runner_without_lf = runner[3:] + variant_name = f"lf.{variant}.{runner_without_lf}" + else: + variant_name = f"{variant}.{runner}" + runner_configs["runner_types"][variant_name] = { + **runner_config, + **variant_config, + } + return runner_configs + + +def update_tags( + tag_categories: Dict[str, Set[str]], machine_types: Iterable[str] +) -> None: + """ + iterate through machine types from jobs, and update potential tags that it belongs to + """ + for machine_type in machine_types: + if not machine_type: + continue + tag_categories["all"].add(machine_type) + if machine_type not in tag_categories["dynamic"]: + if "ubuntu" in machine_type.lower(): + tag_categories["linux"].add(machine_type) + tag_categories["github"].add(machine_type) + else: + tag_categories["other"].add(machine_type) + + +def create_tag_categorires( + runner_configs: Dict[str, Dict[str, Any]], + lf_runner_configs: Dict[str, Dict[str, Any]], +) -> Dict[str, Set[str]]: + """ + Create the tag_categorires, that are groups of runners with some common characteristics that we might find relevant + to view them in a group instead of individually. 
+ """ + breakdowns = { + "github": set(), # provided by github + "pet": set(), # managed as pet instances + "dynamic": set(), # managed as auto-scaling instances + "ephemeral": set(), # auto-scaling instances that are ephemeral + "nonephemeral": set(), # auto-scaling instances that are not ephemeral + "linux": set(), # linux instances + "linux-meta": set(), # linux instances provided by meta + "linux-lf": set(), # linux instances provided by Linux Foundation + "macos": set(), # macos instances + "macos-meta": set(), # macos instances provided by meta + "windows": set(), # windows instances + "windows-meta": set(), # windows instances provided by meta + "windows-lf": set(), # windows instances provided by Linux Foundation + "all": set(), # all instances + "lf": set(), # instances managed by Linux Foundation + "meta": set(), # instances managed by meta + "multi-tenant": set(), # instances that are multi-tenant + "other": set(), # other instances + } + + github_mac_runners = ( + "macos-12", + "macos-12-xl", + "macos-13-large", + "macos-13-xl", + "macos-13-xlarge", + "macos-14-arm64", + "macos-14-xlarge", + ) + breakdowns["github"].update(github_mac_runners) + breakdowns["macos"].update(github_mac_runners) + + meta_pet_mac_runners = ( + "macos-m1-12", + "macos-m1-13", + "macos-m1-14", + "macos-m1-stable", + "macos-m2-14", + "macos-m2-15", + "macos-m2-max", + ) + breakdowns["meta"].update(meta_pet_mac_runners) + breakdowns["macos"].update(meta_pet_mac_runners) + breakdowns["pet"].update(meta_pet_mac_runners) + + meta_pet_nvidia = ( + "linux.aws.a100", + "linux.aws.h100", + ) + breakdowns["meta"].update(meta_pet_nvidia) + breakdowns["linux"].update(meta_pet_nvidia) + breakdowns["linux-meta"].update(meta_pet_nvidia) + breakdowns["pet"].update(meta_pet_nvidia) + breakdowns["multi-tenant"].update(meta_pet_nvidia) + + all_runners_configs = ( + runner_configs["runner_types"] | lf_runner_configs["runner_types"] + ) + + for runner, runner_config in all_runners_configs.items(): + breakdowns["dynamic"].add(runner) + + if "is_ephemeral" in runner_config and runner_config["is_ephemeral"]: + breakdowns["ephemeral"].add(runner) + else: + breakdowns["nonephemeral"].add(runner) + + if runner_config["os"].lower() == "linux": + breakdowns["linux"].add(runner) + elif runner_config["os"].lower() == "windows": + breakdowns["windows"].add(runner) + + for runner, runner_config in runner_configs["runner_types"].items(): + breakdowns["meta"].add(runner) + + if runner_config["os"].lower() == "linux": + breakdowns["linux-meta"].add(runner) + elif runner_config["os"].lower() == "windows": + breakdowns["windows-meta"].add(runner) + + for runner, runner_config in lf_runner_configs["runner_types"].items(): + breakdowns["lf"].add(runner) + + if runner_config["os"].lower() == "linux": + breakdowns["linux-lf"].add(runner) + elif runner_config["os"].lower() == "windows": + breakdowns["windows-lf"].add(runner) + + return breakdowns + + +def get_runner_config( + retriever: LazyFileHistory, start_time: str | datetime +) -> Dict[str, Dict[str, Any]]: + contents = retriever.get_version_after_timestamp(start_time) + if contents: + return explode_runner_variants(yaml.safe_load(contents)) + return {"runner_types": {}} + + +def get_query_statement_for_picked_up_job(time: str, repo: str = "pytorch/pytorch"): """ this query is used to get jobs that were in queue in given snapshot time, but were picked up by workers later """ @@ -123,7 +369,6 @@ def query_picked_up_job_for_given_snapshot(time: str, repo: str = "pytorch/pytor queue_s DESC 
""" query = s1 + _in_queue_job_select_statement + s2 - parameters = { "timestamp": time, "repo": repo, @@ -131,7 +376,7 @@ def query_picked_up_job_for_given_snapshot(time: str, repo: str = "pytorch/pytor return query, parameters -def query_in_queue_jobs_for_given_snapshot(time: str, repo: str = "pytorch/pytorch"): +def get_query_statement_for_queueing_jobs(time: str, repo: str = "pytorch/pytorch"): """ this query is used to get jobs that werre in queue in given snapshot time, and not being picked up by workers """ @@ -168,6 +413,28 @@ def query_in_queue_jobs_for_given_snapshot(time: str, repo: str = "pytorch/pytor return query, parameters +def get_config_retrievers(github_access_token: str) -> Tuple[Any, Any, Any]: + auth = Auth.Token(github_access_token) + test_infra_repo = Github(auth=auth).get_repo("pytorch/test-infra") + pytorch_repo = Github(auth=auth).get_repo("pytorch/pytorch") + + meta_runner_config_retriever = LazyFileHistory( + test_infra_repo, ".github/scale-config.yml" + ) + lf_runner_config_retriever = LazyFileHistory( + test_infra_repo, ".github/lf-scale-config.yml" + ) + old_lf_lf_runner_config_retriever = LazyFileHistory( + pytorch_repo, ".github/lf-scale-config.yml" + ) + + return ( + meta_runner_config_retriever, + lf_runner_config_retriever, + old_lf_lf_runner_config_retriever, + ) + + class QueueTimeProcessor: """ this class used to handle oss ci queue time data aggregations. Currently it fetches in-queue jobs from clickhouse at current time @@ -185,24 +452,33 @@ def __init__( self.is_dry_run = is_dry_run def process(self) -> None: - self.proceses_job_queue_times_historical() - - def proceses_job_queue_times_historical( - self, snapshot_time: str = "", repo: str = "pytorch/pytorch" - ) -> None: - # by default, we use current time as snapshot - timestamp = str(int(datetime.now().timestamp())) - if snapshot_time: - timestamp = snapshot_time + github_access_token = os.getenv("GITHUB_ACCESS_TOKEN", "") + if not github_access_token: + raise ValueError("Missing environment variable GITHUB_ACCESS_TOKEN") + + ( + meta_runner_config_retriever, + lf_runner_config_retriever, + old_lf_lf_runner_config_retriever, + ) = get_config_retrievers(github_access_token) + self.proceses_job_queue_times_historical( + meta_runner_config_retriever, + lf_runner_config_retriever, + old_lf_lf_runner_config_retriever, + ) - # fetches jobs that were in queue in given snapshot time, that are not being picked up by workers - queued_query, queued_parameters = query_in_queue_jobs_for_given_snapshot( + def snapshot_jobs_in_queue( + self, timestamp: str = "", repo: str = "pytorch/pytorch" + ) -> List[Dict[str, Any]]: + # in given snapshot time, fetches jobs that were in queue but not being picked up by workers + queued_query, queued_parameters = get_query_statement_for_queueing_jobs( timestamp, repo ) jobs_in_queue = self.process_in_queue_jobs(queued_query, queued_parameters) - # fetches jobs that were in queue in given snapshot time, but were picked up by workers later of given snapshot time - picked_query, picked_params = query_picked_up_job_for_given_snapshot( + # in queue in given snapshot time, fetches jobs that were in queue but were picked up by workers later of given snapshot time + # this happens when the snapshot time is not latest timestamp + picked_query, picked_params = get_query_statement_for_picked_up_job( timestamp, repo ) jobs_pick = self.process_in_queue_jobs(picked_query, picked_params) @@ -214,19 +490,57 @@ def proceses_job_queue_times_historical( info( f"[Snapshot time:{datetime_str}]. 
Found {len(jobs_in_queue)} jobs in queue, and {len(jobs_pick)} jobs were in queue but picked up by workers later"
         )
+        result = jobs_in_queue + jobs_pick
+        return result
 
-        if len(jobs_in_queue) == 0 and len(jobs_pick) == 0:
-            info(f"No jobs were in queue at time {datetime_str}, skipping")
+    def proceses_job_queue_times_historical(
+        self,
+        meta_runner_config_retriever,
+        lf_runner_config_retriever,
+        old_lf_lf_runner_config_retriever,
+        snapshot_time: str = "",
+        repo: str = "pytorch/pytorch",
+    ) -> None:
+        # by default, we use current time as snapshot
+        timestamp = str(int(datetime.now().timestamp()))
+        if snapshot_time:
+            timestamp = snapshot_time
+
+        snapshot = self.snapshot_jobs_in_queue(timestamp, repo)
+        if len(snapshot) == 0:
+            info(f"No jobs in queue at time: {timestamp}")
             return
 
+        lf_runner_config = get_runner_config(lf_runner_config_retriever, timestamp)
+
+        if not lf_runner_config or not lf_runner_config["runner_types"]:
+            lf_runner_config = get_runner_config(
+                old_lf_lf_runner_config_retriever, timestamp
+            )
+
+        # create dictionary of tags with set of targeting machine types
+        tag_categories = create_tag_categorires(
+            get_runner_config(meta_runner_config_retriever, timestamp), lf_runner_config
+        )
+        update_tags(tag_categories, set([job["machine_type"] for job in snapshot]))
+
+        # iterate throught jobs, and update tags for each job
+        for job in snapshot:
+            job_tags = []
+            for tag in tag_categories:
+                if job["machine_type"] in tag_categories[tag]:
+                    job_tags.append(tag)
+            job["tags"] = job_tags
+
         key = f"job_queue_times_historical/{repo}/{timestamp}.txt"
-        result = jobs_in_queue + jobs_pick
         if self.is_dry_run:
-            info(f"[Dry Run Mode]: {len(result)} records to S3 {_bucket_name}/{key}")
-            print(json.dumps(result))
+            info(f"[Dry Run Mode]: {len(snapshot)} records to S3 {_bucket_name}/{key}")
+            info(json.dumps(snapshot))
             return
 
-        upload_to_s3_txt(self.s3_client, _bucket_name, key, result)
+        print("Yang", snapshot)
+
+        upload_to_s3_txt(self.s3_client, _bucket_name, key, snapshot)
 
     def process_in_queue_jobs(
         self, queryStr: str, parameters: Any
@@ -238,7 +552,6 @@ def process_in_queue_jobs(
         """
         post query process to remove duplicated jobs
         this is because the clickhouse client returns duplicated jobs for some reason
         """
         seen = set()
         db_resp = self.query(queryStr, parameters)
         result = []
-
         for record in db_resp:
             if record["html_url"] in seen:
                 continue
diff --git a/aws/lambda/oss_ci_job_queue_time/requirements.txt b/aws/lambda/oss_ci_job_queue_time/requirements.txt
index 3e22fde96f..87c33c2e7f 100644
--- a/aws/lambda/oss_ci_job_queue_time/requirements.txt
+++ b/aws/lambda/oss_ci_job_queue_time/requirements.txt
@@ -1,2 +1,5 @@
 clickhouse_connect==0.8.5
 boto3==1.35.33
+PyGithub==1.59.0
+python-dateutil==2.8.2
+PyYAML==6.0.1
diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
index 6fbcbc4f74..46412d15a8 100644
--- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
+++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
@@ -142,41 +142,53 @@ def set_default_env_variables():
     os.environ["CLICKHOUSE_ENDPOINT"] = "https://clickhouse.test1"
     os.environ["CLICKHOUSE_USERNAME"] = "user1"
     os.environ["CLICKHOUSE_PASSWORD"] = "pwd1"
+    os.environ["GITHUB_ACCESS_TOKEN"] = "token1"
 
 
 class Test(unittest.TestCase):
-    @patch("oss_ci_job_queue_time.lambda_function.get_aws_s3_resource")
-    @patch("oss_ci_job_queue_time.lambda_function.get_clickhouse_client")
-    def test_lambda_handler_when_row_result_is_empty(
-        self, mock_get_client, mock_s3_resource
-    ):
+    def setUp(self):
+        patcher1 = patch("oss_ci_job_queue_time.lambda_function.get_aws_s3_resource")
patcher2 = patch("oss_ci_job_queue_time.lambda_function.get_clickhouse_client") + patcher3 = patch("oss_ci_job_queue_time.lambda_function.get_runner_config") + patcher4 = patch("oss_ci_job_queue_time.lambda_function.get_config_retrievers") + + self.mock_s3_resource = patcher1.start() + self.mock_get_client = patcher2.start() + self.mock_get_runner_config = patcher3.start() + self.mock_get_config_retrievers = patcher4.start() + + self.mock_get_runner_config.return_value = {"runner_types": {}} + self.mock_get_config_retrievers.return_value = ({}, {}, {}) + + self.addCleanup(patcher1.stop) # Ensure patchers stop after each test + self.addCleanup(patcher2.stop) + self.addCleanup(patcher3.stop) + self.addCleanup(patcher4.stop) + + def test_lambda_handler_when_row_result_is_empty(self): print("test_lambda_handler_when_row_result_is_empty ") # prepare set_default_env_variables() - mock_s3_resource_put(mock_s3_resource) - mock_db_client(mock_get_client, [], []) + mock_s3_resource_put(self.mock_s3_resource) + mock_db_client(self.mock_get_client, [], []) # execute lambda_handler(None, None) # assert - mock_get_client.assert_called_once() + self.mock_get_client.assert_called_once() get_mock_s3_resource_object( - mock_s3_resource + self.mock_s3_resource ).return_value.put.assert_not_called() - @patch("oss_ci_job_queue_time.lambda_function.get_aws_s3_resource") - @patch("oss_ci_job_queue_time.lambda_function.get_clickhouse_client") - def test_lambda_handler_when_lambda_happy_flow_then_success( - self, mock_get_client, mock_s3_resource - ): + def test_lambda_handler_when_lambda_happy_flow_then_success(self): # prepare set_default_env_variables() - mock_s3_resource_put(mock_s3_resource) - mock_db_client(mock_get_client) + mock_s3_resource_put(self.mock_s3_resource) + mock_db_client(self.mock_get_client) - expected_r1 = b'{"queue_s": 60000, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-1", "job_name": "job-name-1", "html_url": "runs/1/job/1", "machine_type": "linux.aws.h100", "time": 1742262372}\n' - expected_r2 = b'{"queue_s": 1400, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-2", "job_name": "job-name-2", "html_url": "runs/2/job/2", "machine_type": "linux.rocm.gpu.2", "time": 1742262372}\n' + expected_r1 = b'{"queue_s": 60000, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-1", "job_name": "job-name-1", "html_url": "runs/1/job/1", "machine_type": "linux.aws.h100", "time": 1742262372, "tags": ["pet", "linux", "linux-meta", "all", "meta", "multi-tenant", "other"]}\n' + expected_r2 = b'{"queue_s": 1400, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-2", "job_name": "job-name-2", "html_url": "runs/2/job/2", "machine_type": "linux.rocm.gpu.2", "time": 1742262372, "tags": ["all", "other"]}\n' expected_s3_body = expected_r1 + expected_r2 expect = gzip.compress(expected_s3_body) @@ -186,36 +198,33 @@ def test_lambda_handler_when_lambda_happy_flow_then_success( # assert # assert clickhouse client - mock_get_client.assert_called_once() - self.assertEqual(mock_get_client.return_value.query.call_count, 2) + self.mock_get_client.assert_called_once() + self.assertEqual(self.mock_get_client.return_value.query.call_count, 2) # assert s3 resource - mock_s3_resource.assert_called_once() + self.mock_s3_resource.assert_called_once() get_mock_s3_resource_object( - mock_s3_resource + self.mock_s3_resource ).return_value.put.assert_called_once() get_mock_s3_resource_object( - mock_s3_resource + self.mock_s3_resource ).return_value.put.assert_called_once_with( Body=expect, 
ContentEncoding="gzip", ContentType="text/plain" ) - @patch("boto3.resource") - @patch("clickhouse_connect.get_client") - def test_lambda_handler_when_missing_required_env_vars_then_throws_error( - self, mock_get_client, mock_s3_resource - ): + def test_lambda_handler_when_missing_required_env_vars_then_throws_error(self): test_cases = [ ("CLICKHOUSE_ENDPOINT"), ("CLICKHOUSE_USERNAME"), ("CLICKHOUSE_PASSWORD"), + ("GITHUB_ACCESS_TOKEN"), ] for x in test_cases: with self.subTest(x=x): # prepare - mock_get_client.reset_mock(return_value=True) - mock_s3_resource.reset_mock(return_value=True) + self.mock_get_client.reset_mock(return_value=True) + self.mock_s3_resource.reset_mock(return_value=True) set_default_env_variables() os.environ[x] = "" @@ -226,20 +235,18 @@ def test_lambda_handler_when_missing_required_env_vars_then_throws_error( # assert self.assertTrue(x in str(context.exception)) - mock_get_client.return_value.query.assert_not_called() + self.mock_get_client.return_value.query.assert_not_called() get_mock_s3_resource_object( - mock_s3_resource + self.mock_s3_resource ).return_value.put.assert_not_called() - @patch("oss_ci_job_queue_time.lambda_function.get_aws_s3_resource") - @patch("oss_ci_job_queue_time.lambda_function.get_clickhouse_client") def test_local_run_with_dry_run_when_lambda_happy_flow_then_success_without_s3_write( - self, mock_get_client, mock_s3_resource + self, ): # prepare set_default_env_variables() - mock_s3_resource_put(mock_s3_resource) - mock_db_client(mock_get_client) + mock_s3_resource_put(self.mock_s3_resource) + mock_db_client(self.mock_get_client) # execute main() @@ -247,13 +254,13 @@ def test_local_run_with_dry_run_when_lambda_happy_flow_then_success_without_s3_w # assert # assert clickhouse client - mock_get_client.assert_called_once() - self.assertEqual(mock_get_client.return_value.query.call_count, 2) + self.mock_get_client.assert_called_once() + self.assertEqual(self.mock_get_client.return_value.query.call_count, 2) # assert s3 resource - mock_s3_resource.assert_called_once() + self.mock_s3_resource.assert_called_once() get_mock_s3_resource_object( - mock_s3_resource + self.mock_s3_resource ).return_value.put.assert_not_called() From 02881756418f63ecc508709b9b4fd9336aa0c0aa Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 18 Mar 2025 23:53:04 -0700 Subject: [PATCH 18/38] add tags --- .../oss_ci_job_queue_time/lambda_function.py | 54 ++++++++++--------- .../test_lambda_oss_ci_job_queue_time.py | 4 +- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py index 019bada68f..cd774f9fcd 100644 --- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py +++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py @@ -222,6 +222,11 @@ def update_tags( if not machine_type: continue tag_categories["all"].add(machine_type) + + if machine_type.startswith("linux.rocm.gpu"): + tag_categories["linux"].add(machine_type) + tag_categories["linux-amd"].add(machine_type) + if machine_type not in tag_categories["dynamic"]: if "ubuntu" in machine_type.lower(): tag_categories["linux"].add(machine_type) @@ -238,7 +243,7 @@ def create_tag_categorires( Create the tag_categorires, that are groups of runners with some common characteristics that we might find relevant to view them in a group instead of individually. 
""" - breakdowns = { + tag_dict = { "github": set(), # provided by github "pet": set(), # managed as pet instances "dynamic": set(), # managed as auto-scaling instances @@ -247,6 +252,7 @@ def create_tag_categorires( "linux": set(), # linux instances "linux-meta": set(), # linux instances provided by meta "linux-lf": set(), # linux instances provided by Linux Foundation + "linux-amd": set(), # linux instances provided by amd. for instance linux.rocm.gpu.2 "macos": set(), # macos instances "macos-meta": set(), # macos instances provided by meta "windows": set(), # windows instances @@ -268,8 +274,8 @@ def create_tag_categorires( "macos-14-arm64", "macos-14-xlarge", ) - breakdowns["github"].update(github_mac_runners) - breakdowns["macos"].update(github_mac_runners) + tag_dict["github"].update(github_mac_runners) + tag_dict["macos"].update(github_mac_runners) meta_pet_mac_runners = ( "macos-m1-12", @@ -280,54 +286,53 @@ def create_tag_categorires( "macos-m2-15", "macos-m2-max", ) - breakdowns["meta"].update(meta_pet_mac_runners) - breakdowns["macos"].update(meta_pet_mac_runners) - breakdowns["pet"].update(meta_pet_mac_runners) + tag_dict["meta"].update(meta_pet_mac_runners) + tag_dict["macos"].update(meta_pet_mac_runners) + tag_dict["pet"].update(meta_pet_mac_runners) meta_pet_nvidia = ( "linux.aws.a100", "linux.aws.h100", ) - breakdowns["meta"].update(meta_pet_nvidia) - breakdowns["linux"].update(meta_pet_nvidia) - breakdowns["linux-meta"].update(meta_pet_nvidia) - breakdowns["pet"].update(meta_pet_nvidia) - breakdowns["multi-tenant"].update(meta_pet_nvidia) + tag_dict["meta"].update(meta_pet_nvidia) + tag_dict["linux"].update(meta_pet_nvidia) + tag_dict["linux-meta"].update(meta_pet_nvidia) + tag_dict["pet"].update(meta_pet_nvidia) + tag_dict["multi-tenant"].update(meta_pet_nvidia) all_runners_configs = ( runner_configs["runner_types"] | lf_runner_configs["runner_types"] ) for runner, runner_config in all_runners_configs.items(): - breakdowns["dynamic"].add(runner) + tag_dict["dynamic"].add(runner) if "is_ephemeral" in runner_config and runner_config["is_ephemeral"]: - breakdowns["ephemeral"].add(runner) + tag_dict["ephemeral"].add(runner) else: - breakdowns["nonephemeral"].add(runner) + tag_dict["nonephemeral"].add(runner) if runner_config["os"].lower() == "linux": - breakdowns["linux"].add(runner) + tag_dict["linux"].add(runner) elif runner_config["os"].lower() == "windows": - breakdowns["windows"].add(runner) + tag_dict["windows"].add(runner) for runner, runner_config in runner_configs["runner_types"].items(): - breakdowns["meta"].add(runner) + tag_dict["meta"].add(runner) if runner_config["os"].lower() == "linux": - breakdowns["linux-meta"].add(runner) + tag_dict["linux-meta"].add(runner) elif runner_config["os"].lower() == "windows": - breakdowns["windows-meta"].add(runner) + tag_dict["windows-meta"].add(runner) for runner, runner_config in lf_runner_configs["runner_types"].items(): - breakdowns["lf"].add(runner) + tag_dict["lf"].add(runner) if runner_config["os"].lower() == "linux": - breakdowns["linux-lf"].add(runner) + tag_dict["linux-lf"].add(runner) elif runner_config["os"].lower() == "windows": - breakdowns["windows-lf"].add(runner) - - return breakdowns + tag_dict["windows-lf"].add(runner) + return tag_dict def get_runner_config( @@ -530,6 +535,7 @@ def proceses_job_queue_times_historical( for tag in tag_categories: if job["machine_type"] in tag_categories[tag]: job_tags.append(tag) + job_tags.append(job["machine_type"]) job["tags"] = job_tags key = 
f"job_queue_times_historical/{repo}/{timestamp}.txt" diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py index 46412d15a8..d19dd07205 100644 --- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py +++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py @@ -187,8 +187,8 @@ def test_lambda_handler_when_lambda_happy_flow_then_success(self): mock_s3_resource_put(self.mock_s3_resource) mock_db_client(self.mock_get_client) - expected_r1 = b'{"queue_s": 60000, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-1", "job_name": "job-name-1", "html_url": "runs/1/job/1", "machine_type": "linux.aws.h100", "time": 1742262372, "tags": ["pet", "linux", "linux-meta", "all", "meta", "multi-tenant", "other"]}\n' - expected_r2 = b'{"queue_s": 1400, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-2", "job_name": "job-name-2", "html_url": "runs/2/job/2", "machine_type": "linux.rocm.gpu.2", "time": 1742262372, "tags": ["all", "other"]}\n' + expected_r1 = b'{"queue_s": 60000, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-1", "job_name": "job-name-1", "html_url": "runs/1/job/1", "machine_type": "linux.aws.h100", "time": 1742262372, "tags": ["pet", "linux", "linux-meta", "all", "meta", "multi-tenant", "other", "linux.aws.h100"]}\n' + expected_r2 = b'{"queue_s": 1400, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-2", "job_name": "job-name-2", "html_url": "runs/2/job/2", "machine_type": "linux.rocm.gpu.2", "time": 1742262372, "tags": ["linux", "linux-amd", "all", "other", "linux.rocm.gpu.2"]}\n' expected_s3_body = expected_r1 + expected_r2 expect = gzip.compress(expected_s3_body) From d3f4cf7cab10fca81b5cf402b325b8d80441be65 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 18 Mar 2025 23:57:16 -0700 Subject: [PATCH 19/38] add tags --- aws/lambda/oss_ci_job_queue_time/lambda_function.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py index cd774f9fcd..b2feef02fa 100644 --- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py +++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py @@ -511,19 +511,18 @@ def proceses_job_queue_times_historical( if snapshot_time: timestamp = snapshot_time + # fetch jobs in queue at given snapshot time snapshot = self.snapshot_jobs_in_queue(timestamp, repo) if len(snapshot) == 0: info(f"No jobs in queue at time: {timestamp}") return + # create dictionary of tags with set of targeting machine types lf_runner_config = get_runner_config(lf_runner_config_retriever, timestamp) - if not lf_runner_config or not lf_runner_config["runner_types"]: lf_runner_config = get_runner_config( old_lf_lf_runner_config_retriever, timestamp ) - - # create dictionary of tags with set of targeting machine types tag_categories = create_tag_categorires( get_runner_config(meta_runner_config_retriever, timestamp), lf_runner_config ) From c80241a78822aba015417f993fbae8804907bb95 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 19 Mar 2025 00:09:45 -0700 Subject: [PATCH 20/38] add tags --- .../oss_ci_job_queue_time/lambda_function.py | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py index b2feef02fa..987e4fa5e1 100644 --- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py +++ 
b/aws/lambda/oss_ci_job_queue_time/lambda_function.py @@ -53,6 +53,8 @@ @lru_cache() def get_clickhouse_client(host: str, user: str, password: str) -> Any: + + # clickhouse_connect.get_client(host=host, user=user, password=password, secure=True, verify=False) return clickhouse_connect.get_client( host=host, user=user, password=password, secure=True, verify=False ) @@ -450,11 +452,12 @@ class QueueTimeProcessor: """ def __init__( - self, clickhouse_client: Any, s3_client: Any, is_dry_run: bool = False + self, clickhouse_client: Any, s3_client: Any, is_dry_run: bool = False, local_output: bool = False ) -> None: self.clickhouse_client = clickhouse_client self.s3_client = s3_client self.is_dry_run = is_dry_run + self.local_output = local_output and is_dry_run def process(self) -> None: github_access_token = os.getenv("GITHUB_ACCESS_TOKEN", "") @@ -540,10 +543,15 @@ def proceses_job_queue_times_historical( key = f"job_queue_times_historical/{repo}/{timestamp}.txt" if self.is_dry_run: info(f"[Dry Run Mode]: {len(snapshot)} records to S3 {_bucket_name}/{key}") - info(json.dumps(snapshot)) + if self.local_output: + file_name = f"job_queue_times_historical_snapshot_{timestamp}.json" + info(f"[Dry Run Mode]: local output to {file_name}.json") + with open(file_name, "w") as f: + f.write(json.dumps(snapshot)) + else: + info(json.dumps(snapshot)) return - print("Yang", snapshot) upload_to_s3_txt(self.s3_client, _bucket_name, key, snapshot) @@ -620,6 +628,17 @@ def parse_args() -> argparse.Namespace: default=os.getenv("CLICKHOUSE_PASSWORD", ""), help="the clickhouse password for the user name", ) + parser.add_argument( + "--github-access-token", + type=str, + default=os.getenv("GITHUB_ACCESS_TOKEN", ""), + help="the github access token to access github api", + ) + parser.add_argument( + "--local-output", + action="store_true", + help="when set, generate json result in local environment. this is only used for local test environment when dry-run is enabled", + ) parser.add_argument( "--not-dry-run", action="store_true", @@ -640,6 +659,7 @@ def main() -> None: os.environ["CLICKHOUSE_ENDPOINT"] = arguments.clickhouse_endpoint os.environ["CLICKHOUSE_USERNAME"] = arguments.clickhouse_username os.environ["CLICKHOUSE_PASSWORD"] = arguments.clickhouse_password + os.environ["GITHUB_ACCESS_TOKEN"] = arguments.github_access_token db_client = get_clickhouse_client_environment() s3_client = get_aws_s3_resource() @@ -647,7 +667,7 @@ def main() -> None: # always run in dry-run mode in local environment, unless it's disabled. 
is_dry_run = not arguments.not_dry_run - QueueTimeProcessor(db_client, s3_client, is_dry_run=is_dry_run).process() + QueueTimeProcessor(db_client, s3_client, is_dry_run=is_dry_run, local_output=arguments.local_output).process() if __name__ == "__main__": From ce3b8a55b6770b886ee49cfc053ad225ce98ef40 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 19 Mar 2025 00:12:45 -0700 Subject: [PATCH 21/38] add tags --- .github/workflows/tests.yml | 2 +- aws/lambda/tests/test_requirements.txt | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 aws/lambda/tests/test_requirements.txt diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 819f028e41..310db1a554 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -51,7 +51,7 @@ jobs: echo ::group::setup Python environment python -m venv .venv/ source .venv/bin/activate - pip install pip==23.0.1 pytest==7.2.0 boto3==1.35.33 clickhouse-connect==0.8.5 + pip install -r aws/lambda/tests/test_requirements.txt echo ::endgroup:: # Test aws lambda diff --git a/aws/lambda/tests/test_requirements.txt b/aws/lambda/tests/test_requirements.txt new file mode 100644 index 0000000000..87c33c2e7f --- /dev/null +++ b/aws/lambda/tests/test_requirements.txt @@ -0,0 +1,5 @@ +clickhouse_connect==0.8.5 +boto3==1.35.33 +PyGithub==1.59.0 +python-dateutil==2.8.2 +PyYAML==6.0.1 From 7eefe8de3de2bf86a05a38510a423b8e0312a1d6 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 19 Mar 2025 00:20:01 -0700 Subject: [PATCH 22/38] add tags --- .../oss_ci_job_queue_time/lambda_function.py | 95 ++++++++++--------- 1 file changed, 50 insertions(+), 45 deletions(-) diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py index 987e4fa5e1..c61036ae20 100644 --- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py +++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py @@ -53,7 +53,6 @@ @lru_cache() def get_clickhouse_client(host: str, user: str, password: str) -> Any: - # clickhouse_connect.get_client(host=host, user=user, password=password, secure=True, verify=False) return clickhouse_connect.get_client( host=host, user=user, password=password, secure=True, verify=False @@ -215,7 +214,7 @@ def explode_runner_variants( def update_tags( - tag_categories: Dict[str, Set[str]], machine_types: Iterable[str] + runner_labels: Dict[str, Set[str]], machine_types: Iterable[str] ) -> None: """ iterate through machine types from jobs, and update potential tags that it belongs to @@ -223,29 +222,29 @@ def update_tags( for machine_type in machine_types: if not machine_type: continue - tag_categories["all"].add(machine_type) + runner_labels["all"].add(machine_type) if machine_type.startswith("linux.rocm.gpu"): - tag_categories["linux"].add(machine_type) - tag_categories["linux-amd"].add(machine_type) + runner_labels["linux"].add(machine_type) + runner_labels["linux-amd"].add(machine_type) - if machine_type not in tag_categories["dynamic"]: + if machine_type not in runner_labels["dynamic"]: if "ubuntu" in machine_type.lower(): - tag_categories["linux"].add(machine_type) - tag_categories["github"].add(machine_type) + runner_labels["linux"].add(machine_type) + runner_labels["github"].add(machine_type) else: - tag_categories["other"].add(machine_type) + runner_labels["other"].add(machine_type) -def create_tag_categorires( +def create_runner_labels( runner_configs: Dict[str, Dict[str, Any]], lf_runner_configs: Dict[str, Dict[str, Any]], ) -> Dict[str, Set[str]]: """ - Create 
the tag_categorires, that are groups of runners with some common characteristics that we might find relevant + Create the runner_labels, that are groups of runners with some common characteristics that we might find relevant to view them in a group instead of individually. """ - tag_dict = { + runner_labels_dict = { "github": set(), # provided by github "pet": set(), # managed as pet instances "dynamic": set(), # managed as auto-scaling instances @@ -276,8 +275,8 @@ def create_tag_categorires( "macos-14-arm64", "macos-14-xlarge", ) - tag_dict["github"].update(github_mac_runners) - tag_dict["macos"].update(github_mac_runners) + runner_labels_dict["github"].update(github_mac_runners) + runner_labels_dict["macos"].update(github_mac_runners) meta_pet_mac_runners = ( "macos-m1-12", @@ -288,53 +287,53 @@ def create_tag_categorires( "macos-m2-15", "macos-m2-max", ) - tag_dict["meta"].update(meta_pet_mac_runners) - tag_dict["macos"].update(meta_pet_mac_runners) - tag_dict["pet"].update(meta_pet_mac_runners) + runner_labels_dict["meta"].update(meta_pet_mac_runners) + runner_labels_dict["macos"].update(meta_pet_mac_runners) + runner_labels_dict["pet"].update(meta_pet_mac_runners) meta_pet_nvidia = ( "linux.aws.a100", "linux.aws.h100", ) - tag_dict["meta"].update(meta_pet_nvidia) - tag_dict["linux"].update(meta_pet_nvidia) - tag_dict["linux-meta"].update(meta_pet_nvidia) - tag_dict["pet"].update(meta_pet_nvidia) - tag_dict["multi-tenant"].update(meta_pet_nvidia) + runner_labels_dict["meta"].update(meta_pet_nvidia) + runner_labels_dict["linux"].update(meta_pet_nvidia) + runner_labels_dict["linux-meta"].update(meta_pet_nvidia) + runner_labels_dict["pet"].update(meta_pet_nvidia) + runner_labels_dict["multi-tenant"].update(meta_pet_nvidia) all_runners_configs = ( runner_configs["runner_types"] | lf_runner_configs["runner_types"] ) for runner, runner_config in all_runners_configs.items(): - tag_dict["dynamic"].add(runner) + runner_labels_dict["dynamic"].add(runner) if "is_ephemeral" in runner_config and runner_config["is_ephemeral"]: - tag_dict["ephemeral"].add(runner) + runner_labels_dict["ephemeral"].add(runner) else: - tag_dict["nonephemeral"].add(runner) + runner_labels_dict["nonephemeral"].add(runner) if runner_config["os"].lower() == "linux": - tag_dict["linux"].add(runner) + runner_labels_dict["linux"].add(runner) elif runner_config["os"].lower() == "windows": - tag_dict["windows"].add(runner) + runner_labels_dict["windows"].add(runner) for runner, runner_config in runner_configs["runner_types"].items(): - tag_dict["meta"].add(runner) + runner_labels_dict["meta"].add(runner) if runner_config["os"].lower() == "linux": - tag_dict["linux-meta"].add(runner) + runner_labels_dict["linux-meta"].add(runner) elif runner_config["os"].lower() == "windows": - tag_dict["windows-meta"].add(runner) + runner_labels_dict["windows-meta"].add(runner) for runner, runner_config in lf_runner_configs["runner_types"].items(): - tag_dict["lf"].add(runner) + runner_labels_dict["lf"].add(runner) if runner_config["os"].lower() == "linux": - tag_dict["linux-lf"].add(runner) + runner_labels_dict["linux-lf"].add(runner) elif runner_config["os"].lower() == "windows": - tag_dict["windows-lf"].add(runner) - return tag_dict + runner_labels_dict["windows-lf"].add(runner) + return runner_labels_dict def get_runner_config( @@ -452,7 +451,11 @@ class QueueTimeProcessor: """ def __init__( - self, clickhouse_client: Any, s3_client: Any, is_dry_run: bool = False, local_output: bool = False + self, + clickhouse_client: Any, + s3_client: 
Any, + is_dry_run: bool = False, + local_output: bool = False, ) -> None: self.clickhouse_client = clickhouse_client self.s3_client = s3_client @@ -526,19 +529,20 @@ def proceses_job_queue_times_historical( lf_runner_config = get_runner_config( old_lf_lf_runner_config_retriever, timestamp ) - tag_categories = create_tag_categorires( + runner_labels = create_runner_labels( get_runner_config(meta_runner_config_retriever, timestamp), lf_runner_config ) - update_tags(tag_categories, set([job["machine_type"] for job in snapshot])) + update_tags(runner_labels, set([job["machine_type"] for job in snapshot])) # iterate throught jobs, and update tags for each job for job in snapshot: - job_tags = [] - for tag in tag_categories: - if job["machine_type"] in tag_categories[tag]: - job_tags.append(tag) - job_tags.append(job["machine_type"]) - job["tags"] = job_tags + job_labels = [] + for tag in runner_labels: + if job["machine_type"] in runner_labels[tag]: + job_labels.append(tag) + # add job's own machine type to runner labels + job_labels.append(job["machine_type"]) + job["runner_labels"] = job_labels key = f"job_queue_times_historical/{repo}/{timestamp}.txt" if self.is_dry_run: @@ -552,7 +556,6 @@ def proceses_job_queue_times_historical( info(json.dumps(snapshot)) return - upload_to_s3_txt(self.s3_client, _bucket_name, key, snapshot) def process_in_queue_jobs( @@ -667,7 +670,9 @@ def main() -> None: # always run in dry-run mode in local environment, unless it's disabled. is_dry_run = not arguments.not_dry_run - QueueTimeProcessor(db_client, s3_client, is_dry_run=is_dry_run, local_output=arguments.local_output).process() + QueueTimeProcessor( + db_client, s3_client, is_dry_run=is_dry_run, local_output=arguments.local_output + ).process() if __name__ == "__main__": From 70542e15fd5a8f80db6ef0128c5294f606f71fa7 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 19 Mar 2025 00:23:19 -0700 Subject: [PATCH 23/38] add tags --- aws/lambda/tests/test_requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aws/lambda/tests/test_requirements.txt b/aws/lambda/tests/test_requirements.txt index 87c33c2e7f..b6abeffc7b 100644 --- a/aws/lambda/tests/test_requirements.txt +++ b/aws/lambda/tests/test_requirements.txt @@ -3,3 +3,5 @@ boto3==1.35.33 PyGithub==1.59.0 python-dateutil==2.8.2 PyYAML==6.0.1 +pip==23.0.1 +pytest==7.2.0 From b5d2a49d66c6dc53727cf31ac60c4e00591be70f Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 19 Mar 2025 00:27:40 -0700 Subject: [PATCH 24/38] add tags --- aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py index d19dd07205..de7bfd1a93 100644 --- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py +++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py @@ -187,8 +187,8 @@ def test_lambda_handler_when_lambda_happy_flow_then_success(self): mock_s3_resource_put(self.mock_s3_resource) mock_db_client(self.mock_get_client) - expected_r1 = b'{"queue_s": 60000, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-1", "job_name": "job-name-1", "html_url": "runs/1/job/1", "machine_type": "linux.aws.h100", "time": 1742262372, "tags": ["pet", "linux", "linux-meta", "all", "meta", "multi-tenant", "other", "linux.aws.h100"]}\n' - expected_r2 = b'{"queue_s": 1400, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-2", "job_name": "job-name-2", "html_url": "runs/2/job/2", "machine_type": 
"linux.rocm.gpu.2", "time": 1742262372, "tags": ["linux", "linux-amd", "all", "other", "linux.rocm.gpu.2"]}\n' + expected_r1 = b'{"queue_s": 60000, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-1", "job_name": "job-name-1", "html_url": "runs/1/job/1", "machine_type": "linux.aws.h100", "time": 1742262372, "runner_labels": ["pet", "linux", "linux-meta", "all", "meta", "multi-tenant", "other", "linux.aws.h100"]}\n' + expected_r2 = b'{"queue_s": 1400, "repo": "pytorch/pytorch", "workflow_name": "workflow-name-2", "job_name": "job-name-2", "html_url": "runs/2/job/2", "machine_type": "linux.rocm.gpu.2", "time": 1742262372, "runner_labels": ["linux", "linux-amd", "all", "other", "linux.rocm.gpu.2"]}\n' expected_s3_body = expected_r1 + expected_r2 expect = gzip.compress(expected_s3_body) From c3a63527b9264fe4d33abd0708f5e56b195c70e8 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 19 Mar 2025 00:29:52 -0700 Subject: [PATCH 25/38] add tags --- .../schema.sql | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 clickhouse_db_schema/oss_ci_job_queue_time_histogram/schema.sql diff --git a/clickhouse_db_schema/oss_ci_job_queue_time_histogram/schema.sql b/clickhouse_db_schema/oss_ci_job_queue_time_histogram/schema.sql new file mode 100644 index 0000000000..a310278d35 --- /dev/null +++ b/clickhouse_db_schema/oss_ci_job_queue_time_histogram/schema.sql @@ -0,0 +1,30 @@ + -- This table is used to store queue time histogram +CREATE TABLE misc.oss_ci_queue_time_histogram( + -- the type of histogram, currently we store two types of histogram: + -- 'in-queue-histogram','completed-queue-histogram' + `type` String, + `repo` String DEFAULT 'pytorch/pytorch', + `workflow_name` String, + `job_name` String, + `machine_type` String, + `histogram_version` String, + `histogram` Array(UInt64), + `max_queue_time` UInt64, + `avg_queue_time` UInt64, + `total_count` UInt64, + `time` DateTime64(9), + `runner_labels` Array(String), + `extra_info` Map(String,String) +) +ENGINE = SharedMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}') +PARTITION BY toYYYYMM(time) +ORDER BY ( + type, + repo, + time, + machine_type, + job_name, + workflow_name, +) +TTL toDate(time) + toIntervalYear(5) +SETTINGS index_granularity = 8192 From 8c24747249167b5561076f9c27da719b21d36ce8 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 19 Mar 2025 00:33:05 -0700 Subject: [PATCH 26/38] ares --- .../schema.sql | 30 ------------------- 1 file changed, 30 deletions(-) delete mode 100644 clickhouse_db_schema/oss_ci_job_queue_time_histogram/schema.sql diff --git a/clickhouse_db_schema/oss_ci_job_queue_time_histogram/schema.sql b/clickhouse_db_schema/oss_ci_job_queue_time_histogram/schema.sql deleted file mode 100644 index a310278d35..0000000000 --- a/clickhouse_db_schema/oss_ci_job_queue_time_histogram/schema.sql +++ /dev/null @@ -1,30 +0,0 @@ - -- This table is used to store queue time histogram -CREATE TABLE misc.oss_ci_queue_time_histogram( - -- the type of histogram, currently we store two types of histogram: - -- 'in-queue-histogram','completed-queue-histogram' - `type` String, - `repo` String DEFAULT 'pytorch/pytorch', - `workflow_name` String, - `job_name` String, - `machine_type` String, - `histogram_version` String, - `histogram` Array(UInt64), - `max_queue_time` UInt64, - `avg_queue_time` UInt64, - `total_count` UInt64, - `time` DateTime64(9), - `runner_labels` Array(String), - `extra_info` Map(String,String) -) -ENGINE = SharedMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}') -PARTITION 
BY toYYYYMM(time) -ORDER BY ( - type, - repo, - time, - machine_type, - job_name, - workflow_name, -) -TTL toDate(time) + toIntervalYear(5) -SETTINGS index_granularity = 8192 From a60834e0ea58eaab6b38f02e07887aa5e0b3faa9 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 19 Mar 2025 00:33:32 -0700 Subject: [PATCH 27/38] typo --- torchci/pages/api/clickhouse/[queryName].ts | 1 - 1 file changed, 1 deletion(-) diff --git a/torchci/pages/api/clickhouse/[queryName].ts b/torchci/pages/api/clickhouse/[queryName].ts index 01c4f0f51d..e0461e5982 100644 --- a/torchci/pages/api/clickhouse/[queryName].ts +++ b/torchci/pages/api/clickhouse/[queryName].ts @@ -10,6 +10,5 @@ export default async function handler( queryName, JSON.parse(req.query.parameters as string) ); - res.status(200).json(response); } From 2f472982e14a411a2e7ad911cb447e5a5537a2dc Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 19 Mar 2025 02:07:29 -0700 Subject: [PATCH 28/38] typo --- .../oss_ci_job_queue_time/lambda_function.py | 58 +++++++++++-------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py index c61036ae20..b17b24f2e3 100644 --- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py +++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py @@ -472,13 +472,37 @@ def process(self) -> None: lf_runner_config_retriever, old_lf_lf_runner_config_retriever, ) = get_config_retrievers(github_access_token) - self.proceses_job_queue_times_historical( + + + # by default, we use current time as snapshot + timestamp = str(int(datetime.now().timestamp())) + + snapshot = self.get_jobs_in_queue_snapshot( meta_runner_config_retriever, lf_runner_config_retriever, old_lf_lf_runner_config_retriever, + timestamp, + "pytorch/pytorch", ) - def snapshot_jobs_in_queue( + # TODO(elainewy): add logic to generate histograms based on the snapshot + self.output(snapshot, timestamp, "pytorch/pytorch") + + def output(self, snapshot: List[Dict[str, Any]], timestamp: str, repo:str ='pytorch/pytorch') -> None: + # key = f"job_queue_times_histogram/{repo}/{timestamp}.txt" + if self.is_dry_run: + info(f"[Dry Run Mode]: {len(snapshot)} records to S3 {_bucket_name}/{key}") + if self.local_output: + file_name = f"job_queue_times_snapshot_{timestamp}.json" + info(f"[Dry Run Mode]: local output to {file_name}.json") + with open(file_name, "w") as f: + f.write(json.dumps(snapshot)) + + info(json.dumps(snapshot)) + return + # upload_to_s3_txt(self.s3_client, _bucket_name, key, snapshot) + + def query_queueing_jobs( self, timestamp: str = "", repo: str = "pytorch/pytorch" ) -> List[Dict[str, Any]]: # in given snapshot time, fetches jobs that were in queue but not being picked up by workers @@ -504,24 +528,20 @@ def snapshot_jobs_in_queue( result = jobs_in_queue + jobs_pick return result - def proceses_job_queue_times_historical( + def get_jobs_in_queue_snapshot( self, meta_runner_config_retriever, lf_runner_config_retriever, old_lf_lf_runner_config_retriever, - snapshot_time: str = "", + timestamp: str, repo: str = "pytorch/pytorch", - ) -> None: - # by default, we use current time as snapshot - timestamp = str(int(datetime.now().timestamp())) - if snapshot_time: - timestamp = snapshot_time + ) -> List[Dict[str, Any]]: - # fetch jobs in queue at given snapshot time - snapshot = self.snapshot_jobs_in_queue(timestamp, repo) + # fetches jobs in queue at given snapshot time from db + snapshot = self.query_queueing_jobs(timestamp, repo) if len(snapshot) 
== 0:
             info(f"No jobs in queue at time: {timestamp}")
-            return
+            return []
 
         # create dictionary of tags with set of targeting machine types
         lf_runner_config = get_runner_config(lf_runner_config_retriever, timestamp)
@@ -544,19 +564,7 @@ def proceses_job_queue_times_historical(
                 job_labels.append(job["machine_type"])
             job["runner_labels"] = job_labels
 
-        key = f"job_queue_times_historical/{repo}/{timestamp}.txt"
-        if self.is_dry_run:
-            info(f"[Dry Run Mode]: {len(snapshot)} records to S3 {_bucket_name}/{key}")
-            if self.local_output:
-                file_name = f"job_queue_times_historical_snapshot_{timestamp}.json"
-                info(f"[Dry Run Mode]: local output to {file_name}.json")
-                with open(file_name, "w") as f:
-                    f.write(json.dumps(snapshot))
-            else:
-                info(json.dumps(snapshot))
-            return
-
-        upload_to_s3_txt(self.s3_client, _bucket_name, key, snapshot)
+        return snapshot
 
     def process_in_queue_jobs(
         self, queryStr: str, parameters: Any

From a6b8113c9b21de880d1bb763bcdba666bef42f3e Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 02:18:57 -0700
Subject: [PATCH 29/38] typo

---
 .../oss_ci_job_queue_time/lambda_function.py  | 20 +++++++++----------
 .../test_lambda_oss_ci_job_queue_time.py      |  7 +------
 2 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index b17b24f2e3..69144e0bd8 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -473,7 +473,6 @@ def process(self) -> None:
         old_lf_lf_runner_config_retriever,
     ) = get_config_retrievers(github_access_token)
 
-
         # by default, we use current time as snapshot
         timestamp = str(int(datetime.now().timestamp()))
 
@@ -485,22 +484,24 @@ def process(self) -> None:
             "pytorch/pytorch",
         )
 
-        # TODO(elainewy): add logic to generate histograms based on the snapshot
-        self.output(snapshot, timestamp, "pytorch/pytorch")
+        self.output_snapshot(snapshot)
+        # TODO(elainewy): add logic to generate histograms based on the snapshot
 
-    def output(self, snapshot: List[Dict[str, Any]], timestamp: str, repo: str = 'pytorch/pytorch') -> None:
-        # key = f"job_queue_times_histogram/{repo}/{timestamp}.txt"
+    def output_snapshot(
+        self,
+        snapshot: List[Dict[str, Any]],
+    ) -> None:
         if self.is_dry_run:
-            info(f"[Dry Run Mode]: {len(snapshot)} records to S3 {_bucket_name}/{key}")
+            info(
+                f"[Dry Run Mode]: generated {len(snapshot)} records from get_jobs_in_queue_snapshot"
+            )
             if self.local_output:
                 file_name = f"job_queue_times_snapshot_{timestamp}.json"
                 info(f"[Dry Run Mode]: local output to {file_name}.json")
                 with open(file_name, "w") as f:
                     f.write(json.dumps(snapshot))
-            info(json.dumps(snapshot))
-            return
+            info(json.dumps(snapshot))
+            return
 
     def query_queueing_jobs(
         self, timestamp: str = "", repo: str = "pytorch/pytorch"
     ) -> List[Dict[str, Any]]:
@@ -523,7 +524,7 @@ def query_queueing_jobs(
         )
 
         info(
-            f"[Snapshot time:{datetime_str}]. Found {len(jobs_in_queue)} jobs in queue, and {len(jobs_pick)} jobs were in queue but were picked up by workers"
+            f"[Snapshot time:{datetime_str}]. 
Found {len(jobs_in_queue)} jobs in queue, and {len(jobs_pick)} jobs were in queue but were picked up by runners"
         )
         result = jobs_in_queue + jobs_pick
         return result
@@ -536,7 +537,6 @@ def get_jobs_in_queue_snapshot(
         timestamp: str,
         repo: str = "pytorch/pytorch",
     ) -> List[Dict[str, Any]]:
-        # fetches jobs in queue at given snapshot time from db
         snapshot = self.query_queueing_jobs(timestamp, repo)
         if len(snapshot) == 0:
diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
index de7bfd1a93..98711d35c0 100644
--- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
+++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
@@ -205,12 +205,7 @@ def test_lambda_handler_when_lambda_happy_flow_then_success(self):
         self.mock_s3_resource.assert_called_once()
         get_mock_s3_resource_object(
             self.mock_s3_resource
-        ).return_value.put.assert_called_once()
-        get_mock_s3_resource_object(
-            self.mock_s3_resource
-        ).return_value.put.assert_called_once_with(
-            Body=expect, ContentEncoding="gzip", ContentType="text/plain"
-        )
+        ).return_value.put.assert_not_called()
 
     def test_lambda_handler_when_missing_required_env_vars_then_throws_error(self):
         test_cases = [

From aa1d08c0136d0f5778c6277793eae9a77026c3d9 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 02:22:24 -0700
Subject: [PATCH 30/38] typo

---
 .../oss_ci_job_queue_time/lambda_function.py | 29 ++++++++++---------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index 69144e0bd8..e4ddfe25b9 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -484,24 +484,27 @@ def process(self) -> None:
             "pytorch/pytorch",
         )
 
-        self.output_snapshot(snapshot)
+        if self.is_dry_run:
+            self.output_snapshot(snapshot)
         # TODO(elainewy): add logic to generate histograms based on the snapshot results
 
     def output_snapshot(
         self,
         snapshot: List[Dict[str, Any]],
     ) -> None:
-        if self.is_dry_run:
-            info(
-                f"[Dry Run Mode]: generated {len(snapshot)} records from get_jobs_in_queue_snapshot"
-            )
-            if self.local_output:
-                file_name = f"job_queue_times_snapshot_{timestamp}.json"
-                info(f"[Dry Run Mode]: local output to {file_name}.json")
-                with open(file_name, "w") as f:
-                    f.write(json.dumps(snapshot))
-            info(json.dumps(snapshot))
-            return
+        """
+        print the snapshot to local file or terminal for local testing
+        """
+        info(
+            f"[Dry Run Mode]: generated {len(snapshot)} records from get_jobs_in_queue_snapshot"
+        )
+        if self.local_output:
+            file_name = f"job_queue_times_snapshot_{timestamp}.json"
+            info(f"[Dry Run Mode]: local output to {file_name}.json")
+            with open(file_name, "w") as f:
+                f.write(json.dumps(snapshot))
+        info(json.dumps(snapshot))
+        return

From b8a1086d87d175fb9641c927ec093a44f56ae4ab Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 02:24:09 -0700
Subject: [PATCH 31/38] typo

---
 aws/lambda/oss_ci_job_queue_time/lambda_function.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index e4ddfe25b9..dbd0e73738 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ 
b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -484,17 +484,20 @@ def process(self) -> None:
             "pytorch/pytorch",
         )
 
-        if self.is_dry_run:
-            self.output_snapshot(snapshot)
+        self.output_snapshot(snapshot,timestamp)
         # TODO(elainewy): add logic to generate histograms based on the snapshot results
 
     def output_snapshot(
         self,
         snapshot: List[Dict[str, Any]],
+        timestamp: str,
     ) -> None:
         """
-        print the snapshot to local file or terminal for local testing
+        print the snapshot to local file or terminal for local testing only
         """
+        if not self.is_dry_run:
+            return
+
         info(
             f"[Dry Run Mode]: generated {len(snapshot)} records from get_jobs_in_queue_snapshot"
         )

From e91f959b8f7369fc71efb29cc540991dcc8dc38c Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 02:25:36 -0700
Subject: [PATCH 32/38] typo

---
 aws/lambda/oss_ci_job_queue_time/lambda_function.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index dbd0e73738..8158ed6b91 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -473,7 +473,7 @@ def process(self) -> None:
         old_lf_lf_runner_config_retriever,
     ) = get_config_retrievers(github_access_token)
 
-        # by default, we use current time as snapshot
+        # use current time as snapshot time
         timestamp = str(int(datetime.now().timestamp()))
 
         snapshot = self.get_jobs_in_queue_snapshot(

From 6b3b8890552ccd69eff5d185891ef20931b4780d Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 02:27:23 -0700
Subject: [PATCH 33/38] typo

---
 aws/lambda/oss_ci_job_queue_time/lambda_function.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index 8158ed6b91..91d4d70fe9 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -418,7 +418,6 @@ def get_query_statement_for_queueing_jobs(time: str, repo: str = "pytorch/pytorc
     }
     return query, parameters
 
-
 def get_config_retrievers(github_access_token: str) -> Tuple[Any, Any, Any]:
     auth = Auth.Token(github_access_token)
     test_infra_repo = Github(auth=auth).get_repo("pytorch/test-infra")
@@ -440,7 +439,6 @@ def get_config_retrievers(github_access_token: str) -> Tuple[Any, Any, Any]:
         old_lf_lf_runner_config_retriever,
     )
 
-
 class QueueTimeProcessor:
     """
    this class is used to handle OSS CI queue time data aggregations. 
Currently it fetches in-queue jobs from ClickHouse at the current time

From 4d8440fd1efe1f941dfb7573b470c25a8106622f Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 02:41:35 -0700
Subject: [PATCH 34/38] reform code

---
 .../oss_ci_job_queue_time/lambda_function.py | 193 +++++++++---------
 1 file changed, 101 insertions(+), 92 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index 91d4d70fe9..5c9312caf8 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -22,8 +22,8 @@
 logging.basicConfig(level=logging.INFO)
 
-
 _bucket_name = "ossci-raw-job-status"
+
 _in_queue_job_select_statement = """
     SELECT
         DATE_DIFF(
@@ -53,9 +53,11 @@
 @lru_cache()
 def get_clickhouse_client(host: str, user: str, password: str) -> Any:
+    # for local testing only, disable SSL verification
+    # clickhouse_connect.get_client(host=host, user=user, password=password, secure=True, verify=False)
+
     return clickhouse_connect.get_client(
-        host=host, user=user, password=password, secure=True, verify=False
+        host=host, user=user, password=password, secure=True
     )
 
@@ -345,79 +347,6 @@ def get_runner_config(
     return {"runner_types": {}}
 
-def get_query_statement_for_picked_up_job(time: str, repo: str = "pytorch/pytorch"):
-    """
-    this query is used to get jobs that were in queue in given snapshot time, but were picked up by workers later
-    """
-    s1 = """
-    WITH possible_queued_jobs AS (
-        SELECT
-            id,
-            run_id,
-            started_at,
-            created_at
-        FROM default.workflow_job -- FINAL not needed since we just use this to filter a table that has already been FINALed
-        WHERE
-            started_at > ({timestamp:DateTime})
-            AND created_at < ({timestamp:DateTime} - INTERVAL 5 MINUTE)
-            AND created_at > ({timestamp:DateTime} - INTERVAL 1 WEEK)
-    )"""
-
-    s2 = """
-    WHERE
-        job.id IN (SELECT id FROM possible_queued_jobs)
-        AND workflow.id IN (SELECT run_id FROM possible_queued_jobs)
-        AND workflow.repository.'full_name' = {repo:String}
-        AND job.status = 'completed'
-        AND LENGTH(job.steps) != 0
-        AND workflow.status = 'completed'
-    ORDER BY
-        queue_s DESC
-    """
-    query = s1 + _in_queue_job_select_statement + s2
-    parameters = {
-        "timestamp": time,
-        "repo": repo,
-    }
-    return query, parameters
-
-
-def get_query_statement_for_queueing_jobs(time: str, repo: str = "pytorch/pytorch"):
-    """
-    this query is used to get jobs that were in queue in given snapshot time, and not being picked up by workers
-    """
-    s1 = """
-    WITH possible_queued_jobs AS (
-        SELECT
-            id,
-            run_id,
-            started_at,
-            created_at
-        FROM default.workflow_job -- FINAL not needed since we just use this to filter a table that has already been FINALed
-        WHERE
-            status = 'queued'
-            AND created_at < ({timestamp:DateTime} - INTERVAL 5 MINUTE)
-            AND created_at > ({timestamp:DateTime} - INTERVAL 1 WEEK)
-    )
-    """
-    s2 = """
-    WHERE
-        job.id IN (SELECT id FROM possible_queued_jobs)
-        AND workflow.id IN (SELECT run_id FROM possible_queued_jobs)
-        AND workflow.repository.'full_name' = {repo:String}
-        AND job.status = 'queued'
-        AND LENGTH(job.steps) = 0
-        AND workflow.status != 'completed'
-    ORDER BY
-        queue_s DESC
-    """
-    query = s1 + _in_queue_job_select_statement + s2
-    parameters = {
-        "timestamp": time,
-        "repo": repo,
-    }
-    return query, parameters
-
 def get_config_retrievers(github_access_token: str) -> Tuple[Any, Any, Any]:
     auth = Auth.Token(github_access_token)
     test_infra_repo = Github(auth=auth).get_repo("pytorch/test-infra")
@@ -439,6 
+368,7 @@ def get_config_retrievers(github_access_token: str) -> Tuple[Any, Any, Any]:
         old_lf_lf_runner_config_retriever,
     )
 
+
 class QueueTimeProcessor:
     """
    this class is used to handle OSS CI queue time data aggregations. Currently it fetches in-queue jobs from ClickHouse at the current time
@@ -465,6 +395,7 @@ def process(self) -> None:
         if not github_access_token:
             raise ValueError("Missing environment variable GITHUB_ACCESS_TOKEN")
 
+        # get runner config retrievers
         (
             meta_runner_config_retriever,
             lf_runner_config_retriever,
@@ -474,7 +405,7 @@ def process(self) -> None:
         # use current time as snapshot time
         timestamp = str(int(datetime.now().timestamp()))
 
-        snapshot = self.get_jobs_in_queue_snapshot(
+        snapshot = self.get_queueing_jobs_snapshot(
             meta_runner_config_retriever,
             lf_runner_config_retriever,
             old_lf_lf_runner_config_retriever,
@@ -482,7 +413,7 @@ def process(self) -> None:
             "pytorch/pytorch",
         )
 
-        self.output_snapshot(snapshot,timestamp)
+        self.output_snapshot(snapshot, timestamp)
         # TODO(elainewy): add logic to generate histograms based on the snapshot results
 
     def output_snapshot(
@@ -507,33 +438,33 @@ def output_snapshot(
         info(json.dumps(snapshot))
         return
 
-    def query_queueing_jobs(
+    def _fetch_snapshot_from_db(
         self, timestamp: str = "", repo: str = "pytorch/pytorch"
     ) -> List[Dict[str, Any]]:
         # in given snapshot time, fetches jobs that were in queue but not being picked up by workers
-        queued_query, queued_parameters = get_query_statement_for_queueing_jobs(
+        queued_query, queued_parameters = self.get_query_statement_for_queueing_jobs(
             timestamp, repo
         )
-        jobs_in_queue = self.process_in_queue_jobs(queued_query, queued_parameters)
+        jobs_in_queue = self._query_in_queue_jobs(queued_query, queued_parameters)
 
-        # in queue in given snapshot time, fetches jobs that were in queue but were picked up by workers after the given snapshot time
-        # this happens when the snapshot time is not the latest timestamp
-        picked_query, picked_params = get_query_statement_for_picked_up_job(
+        # in given snapshot time, fetches jobs that were in queue but were picked up by workers after the given snapshot time
+        # this happens when the snapshot time is not the latest timestamp
+        picked_query, picked_params = self.get_query_statement_for_picked_up_job(
             timestamp, repo
         )
-        jobs_pick = self.process_in_queue_jobs(picked_query, picked_params)
+        jobs_pick = self._query_in_queue_jobs(picked_query, picked_params)
 
         datetime_str = datetime.fromtimestamp(int(timestamp)).strftime(
             "%Y-%m-%d %H:%M:%S"
         )
         info(
-            f"[Snapshot time:{datetime_str}]. Found {len(jobs_in_queue)} jobs in queue, and {len(jobs_pick)} jobs were in queue but were picked up by runners"
+            f"[Snapshot time:{datetime_str}]. 
Found {len(jobs_in_queue)} jobs in queue, and {len(jobs_pick)} jobs were in queue but were picked up by runners later"
         )
         result = jobs_in_queue + jobs_pick
         return result
 
-    def get_jobs_in_queue_snapshot(
+    def get_queueing_jobs_snapshot(
         self,
         meta_runner_config_retriever,
         lf_runner_config_retriever,
@@ -541,8 +472,12 @@ def get_jobs_in_queue_snapshot(
         timestamp: str,
         repo: str = "pytorch/pytorch",
     ) -> List[Dict[str, Any]]:
-        # fetches jobs in queue at given snapshot time from db
-        snapshot = self.query_queueing_jobs(timestamp, repo)
+        """
+        this method is used to fetch jobs that were in queue at the given snapshot time
+        """
+
+        # fetches queued jobs at given snapshot time from db
+        snapshot = self._fetch_snapshot_from_db(timestamp, repo)
         if len(snapshot) == 0:
             info(f"No jobs in queue at time: {timestamp}")
             return []
@@ -558,19 +493,17 @@ def get_queueing_jobs_snapshot(
         )
         update_tags(runner_labels, set([job["machine_type"] for job in snapshot]))
 
-        # iterate through jobs, and update tags for each job
+        # iterates through jobs, and updates tags for each job
         for job in snapshot:
             job_labels = []
             for tag in runner_labels:
                 if job["machine_type"] in runner_labels[tag]:
                     job_labels.append(tag)
-            # add job's own machine type to runner labels
-            job_labels.append(job["machine_type"])
             job["runner_labels"] = job_labels
 
         return snapshot
 
-    def process_in_queue_jobs(
+    def _query_in_queue_jobs(
         self, queryStr: str, parameters: Any
     ) -> list[dict[str, Any]]:
         """
@@ -607,6 +540,82 @@ def _to_query_result_dict(
             li.append(record)
         return li
 
+    def get_query_statement_for_picked_up_job(
+        self, time: str, repo: str = "pytorch/pytorch"
+    ):
+        """
+        this query is used to get jobs that were in queue in given snapshot time, but were picked up by workers later
+        """
+        s1 = """
+        WITH possible_queued_jobs AS (
+            SELECT
+                id,
+                run_id,
+                started_at,
+                created_at
+            FROM default.workflow_job -- FINAL not needed since we just use this to filter a table that has already been FINALed
+            WHERE
+                started_at > ({timestamp:DateTime})
+                AND created_at < ({timestamp:DateTime} - INTERVAL 5 MINUTE)
+                AND created_at > ({timestamp:DateTime} - INTERVAL 1 WEEK)
+        )"""
+
+        s2 = """
+        WHERE
+            job.id IN (SELECT id FROM possible_queued_jobs)
+            AND workflow.id IN (SELECT run_id FROM possible_queued_jobs)
+            AND workflow.repository.'full_name' = {repo:String}
+            AND job.status = 'completed'
+            AND LENGTH(job.steps) != 0
+            AND workflow.status = 'completed'
+        ORDER BY
+            queue_s DESC
+        """
+        query = s1 + _in_queue_job_select_statement + s2
+        parameters = {
+            "timestamp": time,
+            "repo": repo,
+        }
+        return query, parameters
+
+    def get_query_statement_for_queueing_jobs(
+        self, time: str, repo: str = "pytorch/pytorch"
+    ):
+        """
+        this query is used to get jobs that were in queue in given snapshot time, and not being picked up by workers
+        """
+        s1 = """
+        WITH possible_queued_jobs AS (
+            SELECT
+                id,
+                run_id,
+                started_at,
+                created_at
+            FROM default.workflow_job -- FINAL not needed since we just use this to filter a table that has already been FINALed
+            WHERE
+                status = 'queued'
+                AND created_at < ({timestamp:DateTime} - INTERVAL 5 MINUTE)
+                AND created_at > ({timestamp:DateTime} - INTERVAL 1 WEEK)
+        )
+        """
+        s2 = """
+        WHERE
+            job.id IN (SELECT id FROM possible_queued_jobs)
+            AND workflow.id IN (SELECT run_id FROM possible_queued_jobs)
+            AND workflow.repository.'full_name' = {repo:String}
+            AND job.status = 'queued'
+            AND LENGTH(job.steps) = 0
+            AND workflow.status != 'completed'
+        ORDER BY
+            queue_s DESC
+        """
+        query = s1 + _in_queue_job_select_statement + s2
+        parameters = {
+            "timestamp": time,
+            "repo": repo,
+        }
+        return query, parameters
+
 
 def lambda_handler(event: Any, context: Any) -> None:
     """
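A note on the query construction that patch 34 moves into the class: clickhouse_connect binds the {timestamp:DateTime} and {repo:String} placeholders server-side, so the epoch string produced by str(int(datetime.now().timestamp())) can be passed through as a parameter rather than interpolated into the SQL. A minimal sketch of how the (query, parameters) pair returned above is consumed; the host, credentials, and processor instance here are hypothetical, not part of the patch:

    import clickhouse_connect

    # placeholder connection values, for illustration only
    client = clickhouse_connect.get_client(
        host="your-clickhouse-endpoint", user="default", password="...", secure=True
    )

    # processor is assumed to be an already-constructed QueueTimeProcessor
    query, parameters = processor.get_query_statement_for_queueing_jobs(
        "1742400000", "pytorch/pytorch"
    )
    # parameters is the {"timestamp": ..., "repo": ...} dict built above;
    # clickhouse_connect substitutes it into the {name:Type} placeholders
    result = client.query(query, parameters=parameters)
    print(result.column_names, len(result.result_rows))
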
From 3698aa62790f9744c2c4ce479a4087f2bc22f8d2 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 02:43:37 -0700
Subject: [PATCH 35/38] comment

---
 aws/lambda/oss_ci_job_queue_time/lambda_function.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index 5c9312caf8..df0da3f505 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -6,7 +6,6 @@
 import os
 import gzip
 import threading
-import dateutil.parser
 
 import yaml
 import boto3  # type: ignore[import]
@@ -54,7 +53,7 @@
 @lru_cache()
 def get_clickhouse_client(host: str, user: str, password: str) -> Any:
     # for local testing only, disable SSL verification
-    # clickhouse_connect.get_client(host=host, user=user, password=password, secure=True, verify=False)
+    # return clickhouse_connect.get_client(host=host, user=user, password=password, secure=True, verify=False)
 
     return clickhouse_connect.get_client(
         host=host, user=user, password=password, secure=True
@@ -182,7 +181,7 @@ def _fetch_until_timestamp(self, timestamp: datetime) -> Optional[str]:
 
         return self._find_earliest_after_in_cache(timestamp)
 
-    def _fetch_content_for_commit(self, commit: any) -> str:
+    def _fetch_content_for_commit(self, commit: Any) -> str:
         if commit.sha not in self._content_cache:
             print(
                 f"Fetching content for {self.repo} : {self.path} at {commit.commit.author.date} - {commit.sha}"

From 1d9c8475d47146b5005c33da3c51fbc54fb304ac Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 15:46:25 -0700
Subject: [PATCH 36/38] comment

---
 .../oss_ci_job_queue_time/lambda_function.py | 56 ++++++++++++-------
 .../test_lambda_oss_ci_job_queue_time.py     | 33 ++++++-----
 2 files changed, 55 insertions(+), 34 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index df0da3f505..9e435e5ebb 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -19,10 +19,16 @@
 from github import Github, Auth
 from dateutil.parser import parse
 
+ENVS = {
+    "GITHUB_ACCESS_TOKEN": os.getenv("GITHUB_ACCESS_TOKEN", ""),
+    "CLICKHOUSE_ENDPOINT": os.getenv("CLICKHOUSE_ENDPOINT", ""),
+    "CLICKHOUSE_PASSWORD": os.getenv("CLICKHOUSE_PASSWORD", ""),
+    "CLICKHOUSE_USERNAME": os.getenv("CLICKHOUSE_USERNAME", ""),
+}
+
 logging.basicConfig(level=logging.INFO)
 
 _bucket_name = "ossci-raw-job-status"
-
 _in_queue_job_select_statement = """
     SELECT
         DATE_DIFF(
@@ -66,14 +72,15 @@ def get_aws_s3_resource() -> Any:
 
 
 def get_clickhouse_client_environment() -> Any:
-    for env in ["CLICKHOUSE_ENDPOINT", "CLICKHOUSE_USERNAME", "CLICKHOUSE_PASSWORD"]:
-        if not os.getenv(env):
-            raise ValueError(f"Missing environment variable {env}")
+    info(f"Getting environment variables {list(ENVS.keys())}")
+    for name, env_val in ENVS.items():
+        if not env_val:
+            raise ValueError(f"Missing environment variable {name}")
 
     return get_clickhouse_client(
-        host=os.getenv("CLICKHOUSE_ENDPOINT"),
-        user=os.getenv("CLICKHOUSE_USERNAME"),
-        password=os.getenv("CLICKHOUSE_PASSWORD"),
+        host=ENVS["CLICKHOUSE_ENDPOINT"],
+        user=ENVS["CLICKHOUSE_USERNAME"],
+        password=ENVS["CLICKHOUSE_PASSWORD"],
     )
 
@@ -381,6 +388,7 @@ def __init__(
         self,
        clickhouse_client: Any,
         s3_client: Any,
+        github_access_token: str = "",
         is_dry_run: bool = False,
         local_output: bool = False,
     ) -> None:
@@ -389,17 +397,17 @@ def __init__(
         self.is_dry_run = is_dry_run
         self.local_output = local_output and is_dry_run
 
-    def process(self) -> None:
-        github_access_token = os.getenv("GITHUB_ACCESS_TOKEN", "")
         if not github_access_token:
             raise ValueError("Missing environment variable GITHUB_ACCESS_TOKEN")
+        self.github_access_token = github_access_token
 
+    def process(self) -> None:
         # get runner config retrievers
         (
             meta_runner_config_retriever,
             lf_runner_config_retriever,
             old_lf_lf_runner_config_retriever,
-        ) = get_config_retrievers(github_access_token)
+        ) = get_config_retrievers(self.github_access_token)
@@ -623,7 +631,9 @@ def lambda_handler(event: Any, context: Any) -> None:
     db_client = get_clickhouse_client_environment()
     s3_client = get_aws_s3_resource()
 
-    QueueTimeProcessor(db_client, s3_client).process()
+    QueueTimeProcessor(
+        db_client, s3_client, github_access_token=ENVS["GITHUB_ACCESS_TOKEN"]
+    ).process()
 
     return
 
@@ -635,26 +645,26 @@ def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--clickhouse-endpoint",
-        default=os.getenv("CLICKHOUSE_ENDPOINT", ""),
+        default=ENVS["CLICKHOUSE_ENDPOINT"],
         type=str,
         help="the ClickHouse endpoint; the full URL is https://{clickhouse_endpoint}:{port}",
     )
     parser.add_argument(
         "--clickhouse-username",
         type=str,
-        default=os.getenv("CLICKHOUSE_USERNAME", ""),
+        default=ENVS["CLICKHOUSE_USERNAME"],
         help="the ClickHouse username",
     )
     parser.add_argument(
         "--clickhouse-password",
         type=str,
-        default=os.getenv("CLICKHOUSE_PASSWORD", ""),
+        default=ENVS["CLICKHOUSE_PASSWORD"],
         help="the ClickHouse password for the username",
     )
     parser.add_argument(
         "--github-access-token",
         type=str,
-        default=os.getenv("GITHUB_ACCESS_TOKEN", ""),
+        default=ENVS["GITHUB_ACCESS_TOKEN"],
         help="the GitHub access token used to access the GitHub API",
     )
     parser.add_argument(
@@ -679,19 +689,23 @@ def main() -> None:
     arguments = parse_args()
 
     # update environment variables for input parameters
-    os.environ["CLICKHOUSE_ENDPOINT"] = arguments.clickhouse_endpoint
-    os.environ["CLICKHOUSE_USERNAME"] = arguments.clickhouse_username
-    os.environ["CLICKHOUSE_PASSWORD"] = arguments.clickhouse_password
-    os.environ["GITHUB_ACCESS_TOKEN"] = arguments.github_access_token
-    db_client = get_clickhouse_client_environment()
+    db_client = get_clickhouse_client(
+        host=arguments.clickhouse_endpoint,
+        user=arguments.clickhouse_username,
+        password=arguments.clickhouse_password,
+    )
     s3_client = get_aws_s3_resource()
 
     # always run in dry-run mode in local environment, unless it's disabled. 
is_dry_run = not arguments.not_dry_run
 
     QueueTimeProcessor(
-        db_client, s3_client, is_dry_run=is_dry_run, local_output=arguments.local_output
+        db_client,
+        s3_client,
+        arguments.github_access_token,
+        is_dry_run=is_dry_run,
+        local_output=arguments.local_output,
     ).process()

diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
index 98711d35c0..25121ea7fc 100644
--- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
+++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
@@ -138,11 +138,13 @@ def mock_db_client(
     )
 
-def set_default_env_variables():
-    os.environ["CLICKHOUSE_ENDPOINT"] = "https://clickhouse.test1"
-    os.environ["CLICKHOUSE_USERNAME"] = "user1"
-    os.environ["CLICKHOUSE_PASSWORD"] = "pwd1"
-    os.environ["GITHUB_ACCESS_TOKEN"] = "token1"
+def get_default_environment_variables():
+    return {
+        "CLICKHOUSE_ENDPOINT": "test",
+        "CLICKHOUSE_USERNAME": "test",
+        "CLICKHOUSE_PASSWORD": "test",
+        "GITHUB_ACCESS_TOKEN": "test",
+    }
 
 class Test(unittest.TestCase):
@@ -151,11 +153,16 @@ def setUp(self):
         patcher2 = patch("oss_ci_job_queue_time.lambda_function.get_clickhouse_client")
         patcher3 = patch("oss_ci_job_queue_time.lambda_function.get_runner_config")
         patcher4 = patch("oss_ci_job_queue_time.lambda_function.get_config_retrievers")
+        envs_patcher = patch(
+            "oss_ci_job_queue_time.lambda_function.ENVS",
+            new=get_default_environment_variables(),
+        )
 
         self.mock_s3_resource = patcher1.start()
         self.mock_get_client = patcher2.start()
         self.mock_get_runner_config = patcher3.start()
         self.mock_get_config_retrievers = patcher4.start()
+        self.mock_envs = envs_patcher.start()
 
         self.mock_get_runner_config.return_value = {"runner_types": {}}
         self.mock_get_config_retrievers.return_value = ({}, {}, {})
@@ -164,11 +171,11 @@
         self.addCleanup(patcher2.stop)
         self.addCleanup(patcher3.stop)
         self.addCleanup(patcher4.stop)
+        self.addCleanup(envs_patcher.stop)
 
     def test_lambda_handler_when_row_result_is_empty(self):
         print("test_lambda_handler_when_row_result_is_empty ")
         # prepare
-        set_default_env_variables()
         mock_s3_resource_put(self.mock_s3_resource)
         mock_db_client(self.mock_get_client, [], [])
 
@@ -183,7 +190,6 @@
 
     def test_lambda_handler_when_lambda_happy_flow_then_success(self):
         # prepare
-        set_default_env_variables()
         mock_s3_resource_put(self.mock_s3_resource)
         mock_db_client(self.mock_get_client)
 
@@ -214,15 +220,12 @@ def test_lambda_handler_when_missing_required_env_vars_then_throws_error(self):
             ("CLICKHOUSE_PASSWORD"),
             ("GITHUB_ACCESS_TOKEN"),
         ]
-
         for x in test_cases:
-            with self.subTest(x=x):
+            with self.subTest(f"Test Environment {x}", x=x):
                 # prepare
                 self.mock_get_client.reset_mock(return_value=True)
                 self.mock_s3_resource.reset_mock(return_value=True)
-
-                set_default_env_variables()
-                os.environ[x] = ""
+                self.mock_envs[x] = ""
 
                 # execute
                 with self.assertRaises(ValueError) as context:
@@ -235,11 +238,15 @@
                     self.mock_s3_resource
                 ).return_value.put.assert_not_called()
 
+
+                # reset
+                # manually reset the envs; TODO: find a better way to do this, maybe use parameterized
+                self.mock_envs[x] = get_default_environment_variables()[x]
 
     def test_local_run_with_dry_run_when_lambda_happy_flow_then_success_without_s3_write(
         self,
     ):
         # prepare
-        set_default_env_variables()
         mock_s3_resource_put(self.mock_s3_resource)
         mock_db_client(self.mock_get_client)
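A note on the ENVS refactor in this patch: the dict is populated once, at module import time, which matches Lambda's cold-start model but means later changes to os.environ are not seen by the handler; that is why the tests above patch the ENVS dict itself instead of setting os.environ. A minimal illustration of the difference (variable names here are hypothetical):

    import os

    # captured once, when the module is imported (Lambda cold start)
    ENVS_SNAPSHOT = {"CLICKHOUSE_ENDPOINT": os.getenv("CLICKHOUSE_ENDPOINT", "")}

    def read_live() -> str:
        # read on every call; sees later changes to os.environ
        return os.getenv("CLICKHOUSE_ENDPOINT", "")

    os.environ["CLICKHOUSE_ENDPOINT"] = "changed-after-import"
    print(ENVS_SNAPSHOT["CLICKHOUSE_ENDPOINT"])  # still the import-time value
    print(read_live())                           # "changed-after-import"
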
From dcc68fd365e232565c35e71670893c84f221675a Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 15:59:09 -0700
Subject: [PATCH 37/38] comment

---
 .../oss_ci_job_queue_time/lambda_function.py  | 59 ++++++++++---------
 .../test_lambda_oss_ci_job_queue_time.py      |  7 ++-
 2 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index 9e435e5ebb..e98898af52 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -5,6 +5,7 @@
 import logging
 import os
 import gzip
+import re
 import threading
 
 import yaml
@@ -59,7 +60,7 @@
 @lru_cache()
 def get_clickhouse_client(host: str, user: str, password: str) -> Any:
     # for local testing only, disable SSL verification
-    # return clickhouse_connect.get_client(host=host, user=user, password=password, secure=True, verify=False)
+    #return clickhouse_connect.get_client(host=host, user=user, password=password, secure=True, verify=False)
 
     return clickhouse_connect.get_client(
         host=host, user=user, password=password, secure=True
@@ -353,7 +354,7 @@ def get_runner_config(
     return {"runner_types": {}}
 
-def get_config_retrievers(github_access_token: str) -> Tuple[Any, Any, Any]:
+def get_config_retrievers(github_access_token: str) -> Dict[str, LazyFileHistory]:
     auth = Auth.Token(github_access_token)
     test_infra_repo = Github(auth=auth).get_repo("pytorch/test-infra")
     pytorch_repo = Github(auth=auth).get_repo("pytorch/pytorch")
@@ -368,11 +369,11 @@ def get_config_retrievers(github_access_token: str) -> Tuple[Any, Any, Any]:
         pytorch_repo, ".github/lf-scale-config.yml"
     )
 
-    return (
-        meta_runner_config_retriever,
-        lf_runner_config_retriever,
-        old_lf_lf_runner_config_retriever,
-    )
+    return {
+        "meta": meta_runner_config_retriever,
+        "lf": lf_runner_config_retriever,
+        "old_lf": old_lf_lf_runner_config_retriever,
+    }
 
 class QueueTimeProcessor:
@@ -403,24 +404,21 @@ def __init__(
 
     def process(self) -> None:
         # get runner config retrievers
-        (
-            meta_runner_config_retriever,
-            lf_runner_config_retriever,
-            old_lf_lf_runner_config_retriever,
-        ) = get_config_retrievers(self.github_access_token)
+        retrievers = get_config_retrievers(self.github_access_token)
 
         # use current time as snapshot time
         timestamp = str(int(datetime.now().timestamp()))
 
         snapshot = self.get_queueing_jobs_snapshot(
-            meta_runner_config_retriever,
-            lf_runner_config_retriever,
-            old_lf_lf_runner_config_retriever,
+            retrievers["meta"],
+            retrievers["lf"],
+            retrievers["old_lf"],
             timestamp,
             "pytorch/pytorch",
         )
 
-        self.output_snapshot(snapshot, timestamp)
+
+        if self.is_dry_run:
+            self.output_snapshot(snapshot, timestamp)
         # TODO(elainewy): add logic to generate histograms based on the snapshot results
 
     def output_snapshot(
@@ -431,9 +429,6 @@ def output_snapshot(
         """
         print the snapshot to local file or terminal for local testing only
         """
-        if not self.is_dry_run:
-            return
-
         info(
             f"[Dry Run Mode]: generated {len(snapshot)} records from get_jobs_in_queue_snapshot"
         )
@@ -442,24 +437,24 @@ def output_snapshot(
             info(f"[Dry Run Mode]: local output to {file_name}.json")
             with open(file_name, "w") as f:
                 f.write(json.dumps(snapshot))
+            return
 
         info(json.dumps(snapshot))
-        return
 
     def _fetch_snapshot_from_db(
         self, timestamp: str = "", repo: str = "pytorch/pytorch"
     ) -> List[Dict[str, Any]]:
         # in given snapshot time, fetches jobs that were in queue but not being picked up by workers
-        queued_query, queued_parameters = 
self.get_query_statement_for_queueing_jobs(
-            timestamp, repo
+        queued_query = self.get_query_statement_for_queueing_jobs(timestamp, repo)
+        jobs_in_queue = self._query_in_queue_jobs(
+            queued_query["query"], queued_query["parameters"]
         )
-        jobs_in_queue = self._query_in_queue_jobs(queued_query, queued_parameters)
 
         # in given snapshot time, fetches jobs that were in queue but were picked up by workers after the given snapshot time
         # this happens when the snapshot time is not the latest timestamp
-        picked_query, picked_params = self.get_query_statement_for_picked_up_job(
-            timestamp, repo
+        picked_query = self.get_query_statement_for_picked_up_job(timestamp, repo)
+        jobs_pick = self._query_in_queue_jobs(
+            picked_query["query"], picked_query["parameters"]
         )
-        jobs_pick = self._query_in_queue_jobs(picked_query, picked_params)
 
         datetime_str = datetime.fromtimestamp(int(timestamp)).strftime(
             "%Y-%m-%d %H:%M:%S"
@@ -583,11 +578,14 @@ def get_query_statement_for_picked_up_job(
             "timestamp": time,
             "repo": repo,
         }
-        return query, parameters
+        return {
+            "query": query,
+            "parameters": parameters,
+        }
 
     def get_query_statement_for_queueing_jobs(
         self, time: str, repo: str = "pytorch/pytorch"
-    ):
+    ) -> Dict[str, Any]:
         """
        this query is used to get jobs that were in queue in given snapshot time, and not being picked up by workers
         """
@@ -621,7 +619,10 @@ def get_query_statement_for_queueing_jobs(
             "timestamp": time,
             "repo": repo,
         }
-        return query, parameters
+        return {
+            "query": query,
+            "parameters": parameters,
+        }
 
 def lambda_handler(event: Any, context: Any) -> None:
 
diff --git a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
index 25121ea7fc..f55583f6fd 100644
--- a/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
+++ b/aws/lambda/tests/test_lambda_oss_ci_job_queue_time.py
@@ -165,7 +165,11 @@ def setUp(self):
 
         self.mock_get_runner_config.return_value = {"runner_types": {}}
-        self.mock_get_config_retrievers.return_value = ({}, {}, {})
+        self.mock_get_config_retrievers.return_value = {
+            "meta": MagicMock(),
+            "lf": MagicMock(),
+            "old_lf": MagicMock(),
+        }
 
         self.addCleanup(patcher1.stop)  # Ensure patchers stop after each test
         self.addCleanup(patcher2.stop)
@@ -238,7 +242,6 @@ def test_lambda_handler_when_missing_required_env_vars_then_throws_error(self):
                     self.mock_s3_resource
                 ).return_value.put.assert_not_called()
 
-
-                # reset
                 # manually reset the envs; TODO: find a better way to do this, maybe use parameterized
                 self.mock_envs[x] = get_default_environment_variables()[x]
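One possible answer to the TODO in the test above: unittest.mock.patch.dict accepts a dotted-string target and restores the mapping automatically when the with-block exits, so the manual per-subtest reset would no longer be needed. A sketch under that assumption, reusing the module path the tests already patch:

    from unittest.mock import patch

    for name in ["CLICKHOUSE_ENDPOINT", "CLICKHOUSE_USERNAME",
                 "CLICKHOUSE_PASSWORD", "GITHUB_ACCESS_TOKEN"]:
        with self.subTest(f"Test Environment {name}"):
            # patch.dict restores ENVS when the with-block exits,
            # so no manual reset between subtests is required
            with patch.dict(
                "oss_ci_job_queue_time.lambda_function.ENVS", {name: ""}
            ):
                with self.assertRaises(ValueError):
                    lambda_handler(None, None)
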
From f32924cf13e44137cf63c238485bfe6178fde84c Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Wed, 19 Mar 2025 16:11:41 -0700
Subject: [PATCH 38/38] comment

---
 .../oss_ci_job_queue_time/lambda_function.py | 56 +++++++++++++++++--
 1 file changed, 51 insertions(+), 5 deletions(-)

diff --git a/aws/lambda/oss_ci_job_queue_time/lambda_function.py b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
index e98898af52..a94d1ba537 100644
--- a/aws/lambda/oss_ci_job_queue_time/lambda_function.py
+++ b/aws/lambda/oss_ci_job_queue_time/lambda_function.py
@@ -60,7 +60,7 @@
 @lru_cache()
 def get_clickhouse_client(host: str, user: str, password: str) -> Any:
     # for local testing only, disable SSL verification
-    #return clickhouse_connect.get_client(host=host, user=user, password=password, secure=True, verify=False)
+    # return clickhouse_connect.get_client(host=host, user=user, password=password, secure=True, verify=False)
 
     return clickhouse_connect.get_client(
         host=host, user=user, password=password, secure=True
@@ -85,6 +85,32 @@ def get_clickhouse_client_environment() -> Any:
     )
 
+def write_to_file(data: Any, filename="", path=""):
+    """
+    Writes data to a specified file. If no path is provided, writes to the current directory.
+
+    :param data: The content to write to the file.
+    :param filename: The name of the file (default: 'output_snapshot.json').
+    :param path: The directory where the file should be saved (default: current directory).
+    """
+
+    if not filename:
+        filename = "output_snapshot.json"
+    if not path:
+        path = "."
+
+    # Ensure the path exists
+    os.makedirs(path, exist_ok=True)
+
+    # Construct full file path
+    file_path = os.path.join(path, filename)
+
+    # Write data to file
+    with open(file_path, "w", encoding="utf-8") as file:
+        file.write(data)
+    print(f"File written to: {os.path.abspath(file_path)}")
+
 def upload_to_s3_txt(
     s3_client: Any,
     bucket_name: str,
@@ -392,12 +418,17 @@ def __init__(
         github_access_token: str = "",
         is_dry_run: bool = False,
         local_output: bool = False,
+        output_snapshot_file_name: str = "job_queue_times_snapshot",
+        output_snapshot_file_path: str = "",
     ) -> None:
         self.clickhouse_client = clickhouse_client
         self.s3_client = s3_client
         self.is_dry_run = is_dry_run
         self.local_output = local_output and is_dry_run
 
+        self.output_snapshot_file_name = output_snapshot_file_name
+        self.output_snapshot_file_path = output_snapshot_file_path
+
         if not github_access_token:
             raise ValueError("Missing environment variable GITHUB_ACCESS_TOKEN")
         self.github_access_token = github_access_token
@@ -433,10 +464,11 @@ def output_snapshot(
             f"[Dry Run Mode]: generated {len(snapshot)} records from get_jobs_in_queue_snapshot"
         )
         if self.local_output:
-            file_name = f"job_queue_times_snapshot_{timestamp}.json"
-            info(f"[Dry Run Mode]: local output to {file_name}.json")
-            with open(file_name, "w") as f:
-                f.write(json.dumps(snapshot))
+            write_to_file(
+                json.dumps(snapshot),
+                self.output_snapshot_file_name,
+                self.output_snapshot_file_path,
+            )
             return
 
         info(json.dumps(snapshot))
@@ -678,6 +710,18 @@ def parse_args() -> argparse.Namespace:
         action="store_true",
         help="when set, write results to S3 from the local environment. By default, we run in dry-run mode for the local environment",
     )
+    parser.add_argument(
+        "--output-file-name",
+        type=str,
+        default="job_queue_times_snapshot.json",
+        help="the name of the output file for local runs; only used when --local-output is enabled",
+    )
+    parser.add_argument(
+        "--output-file-path",
+        type=str,
+        default="",
+        help="the path of the output file for local runs; only used when --local-output is enabled",
+    )
     args, _ = parser.parse_known_args()
     return args
 
@@ -707,6 +751,8 @@ def main() -> None:
         arguments.github_access_token,
         is_dry_run=is_dry_run,
         local_output=arguments.local_output,
+        output_snapshot_file_name=arguments.output_file_name,
+        output_snapshot_file_path=arguments.output_file_path,
     ).process()
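With the full series applied, a local dry run can be exercised end to end. A usage sketch mirroring what main() does after patch 38; the endpoint, password, token, and output path below are placeholders, not values from the patches:

    # hypothetical local invocation; credentials are placeholders
    db_client = get_clickhouse_client(
        host="your-clickhouse-endpoint",
        user="default",
        password="...",
    )
    s3_client = get_aws_s3_resource()

    QueueTimeProcessor(
        db_client,
        s3_client,
        "your-github-access-token",
        is_dry_run=True,                  # default for local runs; no S3 writes
        local_output=True,                # route the snapshot to a local file
        output_snapshot_file_name="job_queue_times_snapshot.json",
        output_snapshot_file_path="./out",
    ).process()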