
Commit a63df8c

eks example (#10)
1 parent 819a732 commit a63df8c

12 files changed, +434 -0 lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
.terraform
.terraform.lock.hcl
terraform.tfstate
terraform.tfstate.backup

examples/eks/README.md

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
# An example of deploying Metaflow with an EKS cluster

This example creates Metaflow infrastructure from scratch, with a Kubernetes cluster using Amazon EKS. It uses the [`datastore`](../../modules/datastore/) and [`metadata-service`](../../modules/metadata-service/) submodules to provision an S3 bucket, an RDS database, and the Metaflow Metadata service running on AWS Fargate.

To run Metaflow jobs, it provisions an EKS cluster using [this popular open source Terraform module](https://registry.terraform.io/modules/terraform-aws-modules/eks/aws/latest). In that cluster, it also installs [Cluster Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler) and [Argo Workflows](https://argoproj.github.io/argo-workflows/) using Helm.

Specifically, it'll create the following resources in your AWS account:

* General networking infra:
    * AWS VPC
    * NAT gateway for private subnets in the VPC
* For storing data artifacts:
    * S3 bucket
* For Metaflow metadata:
    * RDS database instance (on-demand, Multi-AZ, db.t2.small)
    * ECS service for the Metaflow Metadata service
    * Network load balancer
    * API Gateway
* For executing Metaflow tasks:
    * Autoscaling EKS cluster with at least one instance running

Note that all of this infrastructure costs a non-trivial amount even at rest, up to $400/month, and more if it is actively used.

## Instructions

0. Run `terraform init`.
1. Run `terraform apply` to create the infrastructure. This command typically takes ~20 minutes to execute.
2. Make note of the EKS cluster name (a short string that starts with `mf-`; one way to look it up is sketched after this list). Use the AWS CLI to generate the cluster configuration:
    ```bash
    aws eks update-kubeconfig --name <CLUSTER NAME>
    ```
3. Copy `config.json` to `~/.metaflowconfig/`.
4. You should now be ready to run Metaflow flows using `@kubernetes` and to deploy them to Argo Workflows (see the sketch after this list).
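
If you do not have the cluster name handy, here is a minimal sketch of looking it up with the AWS CLI, assuming the CLI is configured for the same account and region that Terraform deployed into:

```bash
# List EKS clusters in the current region and keep only those whose names
# start with "mf-", the prefix used by this example.
aws eks list-clusters --query "clusters[?starts_with(@, 'mf-')]" --output text
```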
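As a quick smoke test of steps 3 and 4, here is a hedged sketch from the command line; `hello_flow.py` is a hypothetical file name standing in for any of your Metaflow flows:

```bash
# Install the Metaflow configuration generated by this example.
mkdir -p ~/.metaflowconfig
cp config.json ~/.metaflowconfig/config.json

# Run a flow on the EKS cluster via the @kubernetes decorator.
python hello_flow.py run --with kubernetes

# Deploy the same flow to Argo Workflows and trigger a run.
python hello_flow.py argo-workflows create
python hello_flow.py argo-workflows trigger
```
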
The Argo Workflows UI is not accessible from outside the cluster, but you can use port forwarding to reach it. Run
```bash
kubectl port-forward -n argo service/argo-argo-workflows-server 2746:2746
```
...and you should be able to access it at `localhost:2746`.

## Destroying the infrastructure

Run `terraform destroy`.

## What's missing

⚠️ This is meant as a reference example, with many things omitted for simplicity, such as a proper RBAC setup, production-grade autoscaling, and a UI. For example, all workloads running in the cluster use the same AWS IAM role. We do not recommend using this as a production deployment of Metaflow on Kubernetes.

To learn more about production-grade deployments, you can talk to us on [the Outerbounds slack](http://slack.outerbounds.co). We are happy to help you there!

examples/eks/eks.tf

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "17.23.0"

  cluster_name    = local.cluster_name
  cluster_version = "1.21"
  subnets         = module.vpc.private_subnets
  enable_irsa     = true
  tags            = local.tags

  vpc_id = module.vpc.vpc_id

  node_groups_defaults = {
    ami_type  = "AL2_x86_64"
    disk_size = 50
  }

  node_groups = {
    main = {
      desired_capacity = 1
      max_capacity     = 5
      min_capacity     = 1

      instance_types = ["r5.large"]
      update_config = {
        max_unavailable_percentage = 50
      }
    }
  }

  workers_additional_policies = [
    aws_iam_policy.default_node.arn,
    aws_iam_policy.cluster_autoscaler.arn,
  ]
}

resource "aws_iam_policy" "default_node" {
  name_prefix = "${local.cluster_name}-default"
  description = "Default policy for cluster ${module.eks.cluster_id}"
  policy      = data.aws_iam_policy_document.default_node.json
}

data "aws_iam_policy_document" "default_node" {
  statement {
    sid    = "S3"
    effect = "Allow"

    actions = [
      "s3:*",
      "kms:*",
    ]

    resources = ["*"]
  }
}

resource "aws_iam_policy" "cluster_autoscaler" {
  name_prefix = "cluster-autoscaler"
  description = "EKS cluster-autoscaler policy for cluster ${module.eks.cluster_id}"
  policy      = data.aws_iam_policy_document.cluster_autoscaler.json
}

data "aws_iam_policy_document" "cluster_autoscaler" {
  statement {
    sid    = "clusterAutoscalerAll"
    effect = "Allow"

    actions = [
      "autoscaling:DescribeAutoScalingGroups",
      "autoscaling:DescribeAutoScalingInstances",
      "autoscaling:DescribeLaunchConfigurations",
      "autoscaling:DescribeTags",
      "ec2:DescribeLaunchTemplateVersions",
    ]

    resources = ["*"]
  }

  statement {
    sid    = "clusterAutoscalerOwn"
    effect = "Allow"

    actions = [
      "autoscaling:SetDesiredCapacity",
      "autoscaling:TerminateInstanceInAutoScalingGroup",
      "autoscaling:UpdateAutoScalingGroup",
    ]

    resources = ["*"]

    condition {
      test     = "StringEquals"
      variable = "autoscaling:ResourceTag/kubernetes.io/cluster/${module.eks.cluster_id}"
      values   = ["owned"]
    }

    condition {
      test     = "StringEquals"
      variable = "autoscaling:ResourceTag/k8s.io/cluster-autoscaler/enabled"
      values   = ["true"]
    }
  }
}

data "aws_eks_cluster" "cluster" {
  name = module.eks.cluster_id
}

data "aws_eks_cluster_auth" "cluster" {
  name = module.eks.cluster_id
}

data "aws_caller_identity" "current" {}

provider "kubernetes" {
  host                   = data.aws_eks_cluster.cluster.endpoint
  cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
  token                  = data.aws_eks_cluster_auth.cluster.token
}

examples/eks/helm.tf

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
provider "helm" {
  kubernetes {
    host                   = data.aws_eks_cluster.cluster.endpoint
    cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
    token                  = data.aws_eks_cluster_auth.cluster.token
  }
}

resource "helm_release" "cluster_autoscaler" {
  name = "autoscaler"

  depends_on = [module.eks]

  repository = "https://kubernetes.github.io/autoscaler"
  chart      = "cluster-autoscaler"
  namespace  = "kube-system"

  set {
    name  = "autoDiscovery.clusterName"
    value = local.cluster_name
  }

  set {
    name  = "awsRegion"
    value = data.aws_region.current.name
  }
}

resource "kubernetes_namespace" "argo" {
  metadata {
    name = "argo"
  }
}

resource "kubernetes_default_service_account" "default" {
  metadata {
    namespace = kubernetes_namespace.argo.metadata[0].name
  }
}

data "aws_region" "current" {}

locals {
  argo_values = {
    "server" = {
      "extraArgs" = ["--auth-mode=server"]
    }
    "workflow" = {
      "serviceAccount" = {
        "create" = true
      }
    }
    "controller" = {
      "containerRuntimeExecutor" = "emissary"
    }
    "useDefaultArtifactRepo" = true
    "useStaticCredentials"   = false
    "artifactRepository" = {
      "s3" = {
        "bucket"      = module.metaflow-datastore.s3_bucket_name
        "keyFormat"   = "argo-artifacts/{{workflow.creationTimestamp.Y}}/{{workflow.creationTimestamp.m}}/{{workflow.creationTimestamp.d}}/{{workflow.name}}/{{pod.name}}"
        "region"      = data.aws_region.current.name
        "endpoint"    = "s3.amazonaws.com"
        "useSDKCreds" = true
        "insecure"    = false
      }
    }
  }
}

resource "helm_release" "argo" {
  name = "argo"

  depends_on = [module.eks]

  repository   = "https://argoproj.github.io/argo-helm"
  chart        = "argo-workflows"
  namespace    = kubernetes_namespace.argo.metadata[0].name
  force_update = true

  values = [
    yamlencode(local.argo_values)
  ]
}

examples/eks/iam-ecs-execution.tf

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
data "aws_iam_policy_document" "ecs_execution_role_assume_role" {
  statement {
    actions = [
      "sts:AssumeRole"
    ]

    effect = "Allow"

    principals {
      identifiers = [
        "ec2.amazonaws.com",
        "ecs.amazonaws.com",
        "ecs-tasks.amazonaws.com",
        "batch.amazonaws.com"
      ]
      type = "Service"
    }
  }
}

resource "aws_iam_role" "ecs_execution_role" {
  name = "${local.resource_prefix}ecs-execution-role${local.resource_suffix}"
  # Read more about ECS' `task_role` and `execution_role` here: https://stackoverflow.com/a/49947471
  description        = "This role is passed to our AWS ECS' task definition as the `execution_role`. This allows things like the correct image to be pulled and logs to be stored."
  assume_role_policy = data.aws_iam_policy_document.ecs_execution_role_assume_role.json

  tags = local.tags
}

data "aws_iam_policy_document" "ecs_task_execution_policy" {
  statement {
    effect = "Allow"

    actions = [
      "ecr:GetAuthorizationToken",
      "ecr:BatchCheckLayerAvailability",
      "ecr:GetDownloadUrlForLayer",
      "ecr:BatchGetImage",
      "logs:CreateLogStream",
      "logs:PutLogEvents"
    ]

    # The `"Resource": "*"` is not a concern and matches the policy that Amazon suggests using:
    # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task_execution_IAM_role.html
    resources = [
      "*"
    ]
  }
}

resource "aws_iam_role_policy" "grant_ecs_access" {
  name   = "ecs_access"
  role   = aws_iam_role.ecs_execution_role.name
  policy = data.aws_iam_policy_document.ecs_task_execution_policy.json
}

examples/eks/metaflow.tf

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
resource "random_string" "suffix" {
  length  = 8
  special = false
  upper   = false
}

locals {
  resource_prefix = "metaflow"
  resource_suffix = random_string.suffix.result
  tags = {
    "managedBy"   = "terraform"
    "application" = "metaflow-eks-example"
  }
  cluster_name = "mf-${local.resource_suffix}"
}

data "aws_availability_zones" "available" {
}

module "metaflow-datastore" {
  source  = "outerbounds/metaflow/aws//modules/datastore"
  version = "0.3.1"

  resource_prefix = local.resource_prefix
  resource_suffix = local.resource_suffix

  metadata_service_security_group_id = module.metaflow-metadata-service.metadata_service_security_group_id
  metaflow_vpc_id                    = module.vpc.vpc_id
  subnet1_id                         = module.vpc.private_subnets[0]
  subnet2_id                         = module.vpc.private_subnets[1]

  standard_tags = local.tags
}

module "metaflow-common" {
  source  = "outerbounds/metaflow/aws//modules/common"
  version = "0.3.1"
}

module "metaflow-metadata-service" {
  # source = "outerbounds/metaflow/aws//modules/metadata-service"
  # version = "0.3.1"
  source = "../../modules/metadata-service"

  resource_prefix = local.resource_prefix
  resource_suffix = local.resource_suffix

  access_list_cidr_blocks          = []
  api_basic_auth                   = true
  database_password                = module.metaflow-datastore.database_password
  database_username                = module.metaflow-datastore.database_username
  datastore_s3_bucket_kms_key_arn  = module.metaflow-datastore.datastore_s3_bucket_kms_key_arn
  fargate_execution_role_arn       = aws_iam_role.ecs_execution_role.arn
  metaflow_vpc_id                  = module.vpc.vpc_id
  metadata_service_container_image = module.metaflow-common.default_metadata_service_container_image
  rds_master_instance_endpoint     = module.metaflow-datastore.rds_master_instance_endpoint
  s3_bucket_arn                    = module.metaflow-datastore.s3_bucket_arn
  subnet1_id                       = module.vpc.private_subnets[0]
  subnet2_id                       = module.vpc.private_subnets[1]
  vpc_cidr_block                   = module.vpc.vpc_cidr_block

  standard_tags = local.tags
}
