published on Thursday, Apr 30, 2026 by Pulumi
Manages an AWS SageMaker AI Training Job.
Example Usage
Basic Usage
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Basic training job: File input mode, a single ml.m5.large instance with a
// 30 GB volume, and a 3600-second runtime cap.
// `pulumi.interpolate` builds the S3 URI so it resolves correctly when the
// bucket name is an Output rather than a plain string.
const example = new aws.sagemaker.TrainingJob("example", {
    trainingJobName: "example",
    roleArn: exampleAwsIamRole.arn,
    algorithmSpecification: {
        trainingInputMode: "File",
        trainingImage: exampleAwsSagemakerPrebuiltEcrImage.registryPath,
    },
    outputDataConfig: {
        s3OutputPath: pulumi.interpolate`s3://${exampleAwsS3Bucket.bucket}/output/`,
    },
    resourceConfig: {
        instanceType: "ml.m5.large",
        instanceCount: 1,
        volumeSizeInGb: 30,
    },
    stoppingCondition: {
        maxRuntimeInSeconds: 3600,
    },
});
import pulumi
import pulumi_aws as aws
# Basic training job: File input mode, one ml.m5.large instance, 30 GB volume,
# 3600-second runtime cap.
# Output.concat is used instead of an f-string so the S3 URI is still built
# correctly if the bucket name is a Pulumi Output (an f-string would embed the
# Output object's repr instead of its value).
example = aws.sagemaker.TrainingJob("example",
    training_job_name="example",
    role_arn=example_aws_iam_role["arn"],
    algorithm_specification={
        "training_input_mode": "File",
        "training_image": example_aws_sagemaker_prebuilt_ecr_image["registryPath"],
    },
    output_data_config={
        "s3_output_path": pulumi.Output.concat("s3://", example_aws_s3_bucket["bucket"], "/output/"),
    },
    resource_config={
        "instance_type": "ml.m5.large",
        "instance_count": 1,
        "volume_size_in_gb": 30,
    },
    stopping_condition={
        "max_runtime_in_seconds": 3600,
    })
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/sagemaker"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares one SageMaker training job named
// "example" with the basic required configuration blocks.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		jobArgs := &sagemaker.TrainingJobArgs{
			TrainingJobName: pulumi.String("example"),
			RoleArn:         pulumi.Any(exampleAwsIamRole.Arn),
			AlgorithmSpecification: &sagemaker.TrainingJobAlgorithmSpecificationArgs{
				TrainingInputMode: pulumi.String("File"),
				TrainingImage:     pulumi.Any(exampleAwsSagemakerPrebuiltEcrImage.RegistryPath),
			},
			OutputDataConfig: &sagemaker.TrainingJobOutputDataConfigArgs{
				S3OutputPath: pulumi.Sprintf("s3://%v/output/", exampleAwsS3Bucket.Bucket),
			},
			ResourceConfig: &sagemaker.TrainingJobResourceConfigArgs{
				InstanceType:   pulumi.String("ml.m5.large"),
				InstanceCount:  pulumi.Int(1),
				VolumeSizeInGb: pulumi.Int(30),
			},
			StoppingCondition: &sagemaker.TrainingJobStoppingConditionArgs{
				MaxRuntimeInSeconds: pulumi.Int(3600),
			},
		}
		// The resource handle is not needed; only the error is propagated.
		_, err := sagemaker.NewTrainingJob(ctx, "example", jobArgs)
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
    // Basic training job: File input mode, one ml.m5.large instance, 30 GB
    // volume, 3600-second runtime cap.
    // Output.Format builds the S3 URI so it resolves correctly when the bucket
    // name is an Output<string> rather than a plain string.
    var example = new Aws.Sagemaker.TrainingJob("example", new()
    {
        TrainingJobName = "example",
        RoleArn = exampleAwsIamRole.Arn,
        AlgorithmSpecification = new Aws.Sagemaker.Inputs.TrainingJobAlgorithmSpecificationArgs
        {
            TrainingInputMode = "File",
            TrainingImage = exampleAwsSagemakerPrebuiltEcrImage.RegistryPath,
        },
        OutputDataConfig = new Aws.Sagemaker.Inputs.TrainingJobOutputDataConfigArgs
        {
            S3OutputPath = Output.Format($"s3://{exampleAwsS3Bucket.Bucket}/output/"),
        },
        ResourceConfig = new Aws.Sagemaker.Inputs.TrainingJobResourceConfigArgs
        {
            InstanceType = "ml.m5.large",
            InstanceCount = 1,
            VolumeSizeInGb = 30,
        },
        StoppingCondition = new Aws.Sagemaker.Inputs.TrainingJobStoppingConditionArgs
        {
            MaxRuntimeInSeconds = 3600,
        },
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.sagemaker.TrainingJob;
import com.pulumi.aws.sagemaker.TrainingJobArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobAlgorithmSpecificationArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobOutputDataConfigArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobResourceConfigArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobStoppingConditionArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Pulumi program that declares a basic SageMaker training job named "example".
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the training job: File input mode, one ml.m5.large instance,
     * a 30 GB volume and a 3600-second runtime cap.
     *
     * @param ctx the Pulumi deployment context supplied by {@code Pulumi.run}
     */
    public static void stack(Context ctx) {
        var example = new TrainingJob("example", TrainingJobArgs.builder()
            .trainingJobName("example")
            .roleArn(exampleAwsIamRole.arn())
            .algorithmSpecification(TrainingJobAlgorithmSpecificationArgs.builder()
                .trainingInputMode("File")
                .trainingImage(exampleAwsSagemakerPrebuiltEcrImage.registryPath())
                .build())
            .outputDataConfig(TrainingJobOutputDataConfigArgs.builder()
                // Output.format resolves Output-typed arguments; String.format
                // would embed the Output object's toString() instead.
                .s3OutputPath(Output.format("s3://%s/output/", exampleAwsS3Bucket.bucket()))
                .build())
            .resourceConfig(TrainingJobResourceConfigArgs.builder()
                .instanceType("ml.m5.large")
                .instanceCount(1)
                .volumeSizeInGb(30)
                .build())
            .stoppingCondition(TrainingJobStoppingConditionArgs.builder()
                .maxRuntimeInSeconds(3600)
                .build())
            .build());
    }
}
# Basic SageMaker training job. Nesting is significant in YAML: every property
# below must be indented under its parent mapping.
resources:
  example:
    type: aws:sagemaker:TrainingJob
    properties:
      trainingJobName: example
      roleArn: ${exampleAwsIamRole.arn}
      algorithmSpecification:
        trainingInputMode: File
        trainingImage: ${exampleAwsSagemakerPrebuiltEcrImage.registryPath}
      outputDataConfig:
        s3OutputPath: s3://${exampleAwsS3Bucket.bucket}/output/
      resourceConfig:
        instanceType: ml.m5.large
        instanceCount: 1
        volumeSizeInGb: 30
      stoppingCondition:
        maxRuntimeInSeconds: 3600
With VPC Configuration
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Training job attached to a VPC through `vpcConfig` (one security group, one subnet).
// `pulumi.interpolate` builds the S3 URI so it resolves correctly when the
// bucket name is an Output rather than a plain string.
const example = new aws.sagemaker.TrainingJob("example", {
    trainingJobName: "example",
    roleArn: exampleAwsIamRole.arn,
    algorithmSpecification: {
        trainingInputMode: "File",
        trainingImage: exampleAwsSagemakerPrebuiltEcrImage.registryPath,
    },
    outputDataConfig: {
        s3OutputPath: pulumi.interpolate`s3://${exampleAwsS3Bucket.bucket}/output/`,
    },
    resourceConfig: {
        instanceType: "ml.m5.large",
        instanceCount: 1,
        volumeSizeInGb: 30,
    },
    stoppingCondition: {
        maxRuntimeInSeconds: 3600,
    },
    vpcConfig: {
        securityGroupIds: [exampleAwsSecurityGroup.id],
        subnets: [exampleAwsSubnet.id],
    },
});
import pulumi
import pulumi_aws as aws
# Training job attached to a VPC through vpc_config (one security group, one subnet).
# Output.concat is used instead of an f-string so the S3 URI is still built
# correctly if the bucket name is a Pulumi Output.
example = aws.sagemaker.TrainingJob("example",
    training_job_name="example",
    role_arn=example_aws_iam_role["arn"],
    algorithm_specification={
        "training_input_mode": "File",
        "training_image": example_aws_sagemaker_prebuilt_ecr_image["registryPath"],
    },
    output_data_config={
        "s3_output_path": pulumi.Output.concat("s3://", example_aws_s3_bucket["bucket"], "/output/"),
    },
    resource_config={
        "instance_type": "ml.m5.large",
        "instance_count": 1,
        "volume_size_in_gb": 30,
    },
    stopping_condition={
        "max_runtime_in_seconds": 3600,
    },
    vpc_config={
        "security_group_ids": [example_aws_security_group["id"]],
        "subnets": [example_aws_subnet["id"]],
    })
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/sagemaker"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares one SageMaker training job named
// "example" and attaches it to a VPC via VpcConfig.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		jobArgs := &sagemaker.TrainingJobArgs{
			TrainingJobName: pulumi.String("example"),
			RoleArn:         pulumi.Any(exampleAwsIamRole.Arn),
			AlgorithmSpecification: &sagemaker.TrainingJobAlgorithmSpecificationArgs{
				TrainingInputMode: pulumi.String("File"),
				TrainingImage:     pulumi.Any(exampleAwsSagemakerPrebuiltEcrImage.RegistryPath),
			},
			OutputDataConfig: &sagemaker.TrainingJobOutputDataConfigArgs{
				S3OutputPath: pulumi.Sprintf("s3://%v/output/", exampleAwsS3Bucket.Bucket),
			},
			ResourceConfig: &sagemaker.TrainingJobResourceConfigArgs{
				InstanceType:   pulumi.String("ml.m5.large"),
				InstanceCount:  pulumi.Int(1),
				VolumeSizeInGb: pulumi.Int(30),
			},
			StoppingCondition: &sagemaker.TrainingJobStoppingConditionArgs{
				MaxRuntimeInSeconds: pulumi.Int(3600),
			},
			VpcConfig: &sagemaker.TrainingJobVpcConfigArgs{
				SecurityGroupIds: pulumi.StringArray{
					exampleAwsSecurityGroup.Id,
				},
				Subnets: pulumi.StringArray{
					exampleAwsSubnet.Id,
				},
			},
		}
		// The resource handle is not needed; only the error is propagated.
		_, err := sagemaker.NewTrainingJob(ctx, "example", jobArgs)
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
    // Training job attached to a VPC through VpcConfig (one security group, one subnet).
    // Output.Format builds the S3 URI so it resolves correctly when the bucket
    // name is an Output<string> rather than a plain string.
    var example = new Aws.Sagemaker.TrainingJob("example", new()
    {
        TrainingJobName = "example",
        RoleArn = exampleAwsIamRole.Arn,
        AlgorithmSpecification = new Aws.Sagemaker.Inputs.TrainingJobAlgorithmSpecificationArgs
        {
            TrainingInputMode = "File",
            TrainingImage = exampleAwsSagemakerPrebuiltEcrImage.RegistryPath,
        },
        OutputDataConfig = new Aws.Sagemaker.Inputs.TrainingJobOutputDataConfigArgs
        {
            S3OutputPath = Output.Format($"s3://{exampleAwsS3Bucket.Bucket}/output/"),
        },
        ResourceConfig = new Aws.Sagemaker.Inputs.TrainingJobResourceConfigArgs
        {
            InstanceType = "ml.m5.large",
            InstanceCount = 1,
            VolumeSizeInGb = 30,
        },
        StoppingCondition = new Aws.Sagemaker.Inputs.TrainingJobStoppingConditionArgs
        {
            MaxRuntimeInSeconds = 3600,
        },
        VpcConfig = new Aws.Sagemaker.Inputs.TrainingJobVpcConfigArgs
        {
            SecurityGroupIds = new[]
            {
                exampleAwsSecurityGroup.Id,
            },
            Subnets = new[]
            {
                exampleAwsSubnet.Id,
            },
        },
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.sagemaker.TrainingJob;
import com.pulumi.aws.sagemaker.TrainingJobArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobAlgorithmSpecificationArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobOutputDataConfigArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobResourceConfigArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobStoppingConditionArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobVpcConfigArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Pulumi program that declares a SageMaker training job named "example"
 * attached to a VPC.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the training job with a {@code vpcConfig} block listing one
     * security group and one subnet.
     *
     * @param ctx the Pulumi deployment context supplied by {@code Pulumi.run}
     */
    public static void stack(Context ctx) {
        var example = new TrainingJob("example", TrainingJobArgs.builder()
            .trainingJobName("example")
            .roleArn(exampleAwsIamRole.arn())
            .algorithmSpecification(TrainingJobAlgorithmSpecificationArgs.builder()
                .trainingInputMode("File")
                .trainingImage(exampleAwsSagemakerPrebuiltEcrImage.registryPath())
                .build())
            .outputDataConfig(TrainingJobOutputDataConfigArgs.builder()
                // Output.format resolves Output-typed arguments; String.format
                // would embed the Output object's toString() instead.
                .s3OutputPath(Output.format("s3://%s/output/", exampleAwsS3Bucket.bucket()))
                .build())
            .resourceConfig(TrainingJobResourceConfigArgs.builder()
                .instanceType("ml.m5.large")
                .instanceCount(1)
                .volumeSizeInGb(30)
                .build())
            .stoppingCondition(TrainingJobStoppingConditionArgs.builder()
                .maxRuntimeInSeconds(3600)
                .build())
            .vpcConfig(TrainingJobVpcConfigArgs.builder()
                .securityGroupIds(exampleAwsSecurityGroup.id())
                .subnets(exampleAwsSubnet.id())
                .build())
            .build());
    }
}
# SageMaker training job attached to a VPC. Nesting is significant in YAML:
# every property below must be indented under its parent mapping.
resources:
  example:
    type: aws:sagemaker:TrainingJob
    properties:
      trainingJobName: example
      roleArn: ${exampleAwsIamRole.arn}
      algorithmSpecification:
        trainingInputMode: File
        trainingImage: ${exampleAwsSagemakerPrebuiltEcrImage.registryPath}
      outputDataConfig:
        s3OutputPath: s3://${exampleAwsS3Bucket.bucket}/output/
      resourceConfig:
        instanceType: ml.m5.large
        instanceCount: 1
        volumeSizeInGb: 30
      stoppingCondition:
        maxRuntimeInSeconds: 3600
      vpcConfig:
        securityGroupIds:
          - ${exampleAwsSecurityGroup.id}
        subnets:
          - ${exampleAwsSubnet.id}
With Input Data and Hyperparameters
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Training job with one "train" input channel read from an S3 prefix and two
// hyperparameters (passed as strings).
// `pulumi.interpolate` builds both S3 URIs so they resolve correctly when the
// bucket name is an Output rather than a plain string.
const example = new aws.sagemaker.TrainingJob("example", {
    trainingJobName: "example",
    roleArn: exampleAwsIamRole.arn,
    algorithmSpecification: {
        trainingInputMode: "File",
        trainingImage: exampleAwsSagemakerPrebuiltEcrImage.registryPath,
        enableSagemakerMetricsTimeSeries: true,
    },
    hyperParameters: {
        mini_batch_size: "200",
        epochs: "10",
    },
    inputDataConfigs: [{
        channelName: "train",
        dataSource: {
            s3DataSource: {
                s3DataType: "S3Prefix",
                s3Uri: pulumi.interpolate`s3://${exampleAwsS3Bucket.bucket}/train/`,
            },
        },
    }],
    outputDataConfig: {
        s3OutputPath: pulumi.interpolate`s3://${exampleAwsS3Bucket.bucket}/output/`,
    },
    resourceConfig: {
        instanceType: "ml.m5.large",
        instanceCount: 1,
        volumeSizeInGb: 30,
    },
    stoppingCondition: {
        maxRuntimeInSeconds: 3600,
    },
});
import pulumi
import pulumi_aws as aws
# Training job with one "train" input channel read from an S3 prefix and two
# hyperparameters (passed as strings).
# Output.concat is used instead of f-strings so both S3 URIs are still built
# correctly if the bucket name is a Pulumi Output.
example = aws.sagemaker.TrainingJob("example",
    training_job_name="example",
    role_arn=example_aws_iam_role["arn"],
    algorithm_specification={
        "training_input_mode": "File",
        "training_image": example_aws_sagemaker_prebuilt_ecr_image["registryPath"],
        "enable_sagemaker_metrics_time_series": True,
    },
    hyper_parameters={
        "mini_batch_size": "200",
        "epochs": "10",
    },
    input_data_configs=[{
        "channel_name": "train",
        "data_source": {
            "s3_data_source": {
                "s3_data_type": "S3Prefix",
                "s3_uri": pulumi.Output.concat("s3://", example_aws_s3_bucket["bucket"], "/train/"),
            },
        },
    }],
    output_data_config={
        "s3_output_path": pulumi.Output.concat("s3://", example_aws_s3_bucket["bucket"], "/output/"),
    },
    resource_config={
        "instance_type": "ml.m5.large",
        "instance_count": 1,
        "volume_size_in_gb": 30,
    },
    stopping_condition={
        "max_runtime_in_seconds": 3600,
    })
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/sagemaker"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares one SageMaker training job named
// "example" with a single "train" input channel and two hyperparameters.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Single input channel sourced from an S3 prefix.
		trainChannel := &sagemaker.TrainingJobInputDataConfigArgs{
			ChannelName: pulumi.String("train"),
			DataSource: &sagemaker.TrainingJobInputDataConfigDataSourceArgs{
				S3DataSource: &sagemaker.TrainingJobInputDataConfigDataSourceS3DataSourceArgs{
					S3DataType: pulumi.String("S3Prefix"),
					S3Uri:      pulumi.Sprintf("s3://%v/train/", exampleAwsS3Bucket.Bucket),
				},
			},
		}
		jobArgs := &sagemaker.TrainingJobArgs{
			TrainingJobName: pulumi.String("example"),
			RoleArn:         pulumi.Any(exampleAwsIamRole.Arn),
			AlgorithmSpecification: &sagemaker.TrainingJobAlgorithmSpecificationArgs{
				TrainingInputMode:                pulumi.String("File"),
				TrainingImage:                    pulumi.Any(exampleAwsSagemakerPrebuiltEcrImage.RegistryPath),
				EnableSagemakerMetricsTimeSeries: pulumi.Bool(true),
			},
			HyperParameters: pulumi.StringMap{
				"mini_batch_size": pulumi.String("200"),
				"epochs":          pulumi.String("10"),
			},
			InputDataConfigs: sagemaker.TrainingJobInputDataConfigArray{
				trainChannel,
			},
			OutputDataConfig: &sagemaker.TrainingJobOutputDataConfigArgs{
				S3OutputPath: pulumi.Sprintf("s3://%v/output/", exampleAwsS3Bucket.Bucket),
			},
			ResourceConfig: &sagemaker.TrainingJobResourceConfigArgs{
				InstanceType:   pulumi.String("ml.m5.large"),
				InstanceCount:  pulumi.Int(1),
				VolumeSizeInGb: pulumi.Int(30),
			},
			StoppingCondition: &sagemaker.TrainingJobStoppingConditionArgs{
				MaxRuntimeInSeconds: pulumi.Int(3600),
			},
		}
		// The resource handle is not needed; only the error is propagated.
		_, err := sagemaker.NewTrainingJob(ctx, "example", jobArgs)
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
    // Training job with one "train" input channel read from an S3 prefix and
    // two hyperparameters (passed as strings).
    // Output.Format builds both S3 URIs so they resolve correctly when the
    // bucket name is an Output<string> rather than a plain string.
    var example = new Aws.Sagemaker.TrainingJob("example", new()
    {
        TrainingJobName = "example",
        RoleArn = exampleAwsIamRole.Arn,
        AlgorithmSpecification = new Aws.Sagemaker.Inputs.TrainingJobAlgorithmSpecificationArgs
        {
            TrainingInputMode = "File",
            TrainingImage = exampleAwsSagemakerPrebuiltEcrImage.RegistryPath,
            EnableSagemakerMetricsTimeSeries = true,
        },
        HyperParameters =
        {
            { "mini_batch_size", "200" },
            { "epochs", "10" },
        },
        InputDataConfigs = new[]
        {
            new Aws.Sagemaker.Inputs.TrainingJobInputDataConfigArgs
            {
                ChannelName = "train",
                DataSource = new Aws.Sagemaker.Inputs.TrainingJobInputDataConfigDataSourceArgs
                {
                    S3DataSource = new Aws.Sagemaker.Inputs.TrainingJobInputDataConfigDataSourceS3DataSourceArgs
                    {
                        S3DataType = "S3Prefix",
                        S3Uri = Output.Format($"s3://{exampleAwsS3Bucket.Bucket}/train/"),
                    },
                },
            },
        },
        OutputDataConfig = new Aws.Sagemaker.Inputs.TrainingJobOutputDataConfigArgs
        {
            S3OutputPath = Output.Format($"s3://{exampleAwsS3Bucket.Bucket}/output/"),
        },
        ResourceConfig = new Aws.Sagemaker.Inputs.TrainingJobResourceConfigArgs
        {
            InstanceType = "ml.m5.large",
            InstanceCount = 1,
            VolumeSizeInGb = 30,
        },
        StoppingCondition = new Aws.Sagemaker.Inputs.TrainingJobStoppingConditionArgs
        {
            MaxRuntimeInSeconds = 3600,
        },
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.sagemaker.TrainingJob;
import com.pulumi.aws.sagemaker.TrainingJobArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobAlgorithmSpecificationArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobInputDataConfigArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobInputDataConfigDataSourceArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobInputDataConfigDataSourceS3DataSourceArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobOutputDataConfigArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobResourceConfigArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobStoppingConditionArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Pulumi program that declares a SageMaker training job named "example" with
 * an S3 input channel and hyperparameters.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the training job with one "train" input channel read from an
     * S3 prefix and two string-valued hyperparameters.
     *
     * @param ctx the Pulumi deployment context supplied by {@code Pulumi.run}
     */
    public static void stack(Context ctx) {
        var example = new TrainingJob("example", TrainingJobArgs.builder()
            .trainingJobName("example")
            .roleArn(exampleAwsIamRole.arn())
            .algorithmSpecification(TrainingJobAlgorithmSpecificationArgs.builder()
                .trainingInputMode("File")
                .trainingImage(exampleAwsSagemakerPrebuiltEcrImage.registryPath())
                .enableSagemakerMetricsTimeSeries(true)
                .build())
            .hyperParameters(Map.ofEntries(
                Map.entry("mini_batch_size", "200"),
                Map.entry("epochs", "10")
            ))
            .inputDataConfigs(TrainingJobInputDataConfigArgs.builder()
                .channelName("train")
                .dataSource(TrainingJobInputDataConfigDataSourceArgs.builder()
                    .s3DataSource(TrainingJobInputDataConfigDataSourceS3DataSourceArgs.builder()
                        .s3DataType("S3Prefix")
                        // Output.format resolves Output-typed arguments; String.format
                        // would embed the Output object's toString() instead.
                        .s3Uri(Output.format("s3://%s/train/", exampleAwsS3Bucket.bucket()))
                        .build())
                    .build())
                .build())
            .outputDataConfig(TrainingJobOutputDataConfigArgs.builder()
                .s3OutputPath(Output.format("s3://%s/output/", exampleAwsS3Bucket.bucket()))
                .build())
            .resourceConfig(TrainingJobResourceConfigArgs.builder()
                .instanceType("ml.m5.large")
                .instanceCount(1)
                .volumeSizeInGb(30)
                .build())
            .stoppingCondition(TrainingJobStoppingConditionArgs.builder()
                .maxRuntimeInSeconds(3600)
                .build())
            .build());
    }
}
# SageMaker training job with an S3 input channel and hyperparameters.
# Nesting is significant in YAML: every property below must be indented under
# its parent mapping. Hyperparameter values are quoted so they stay strings.
resources:
  example:
    type: aws:sagemaker:TrainingJob
    properties:
      trainingJobName: example
      roleArn: ${exampleAwsIamRole.arn}
      algorithmSpecification:
        trainingInputMode: File
        trainingImage: ${exampleAwsSagemakerPrebuiltEcrImage.registryPath}
        enableSagemakerMetricsTimeSeries: true
      hyperParameters:
        mini_batch_size: '200'
        epochs: '10'
      inputDataConfigs:
        - channelName: train
          dataSource:
            s3DataSource:
              s3DataType: S3Prefix
              s3Uri: s3://${exampleAwsS3Bucket.bucket}/train/
      outputDataConfig:
        s3OutputPath: s3://${exampleAwsS3Bucket.bucket}/output/
      resourceConfig:
        instanceType: ml.m5.large
        instanceCount: 1
        volumeSizeInGb: 30
      stoppingCondition:
        maxRuntimeInSeconds: 3600
With Encrypted Output, Checkpoints, and TensorBoard
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Training job with KMS-encrypted, GZIP-compressed output, a KMS-encrypted
// volume, checkpointing to S3, and TensorBoard output to S3.
// `pulumi.interpolate` builds the S3 URIs so they resolve correctly when the
// bucket name is an Output rather than a plain string.
const example = new aws.sagemaker.TrainingJob("example", {
    trainingJobName: "example",
    roleArn: exampleAwsIamRole.arn,
    algorithmSpecification: {
        trainingInputMode: "File",
        trainingImage: exampleAwsSagemakerPrebuiltEcrImage.registryPath,
    },
    checkpointConfig: {
        localPath: "/opt/ml/checkpoints",
        s3Uri: pulumi.interpolate`s3://${exampleAwsS3Bucket.bucket}/checkpoints/`,
    },
    outputDataConfig: {
        compressionType: "GZIP",
        kmsKeyId: exampleAwsKmsKey.arn,
        s3OutputPath: pulumi.interpolate`s3://${exampleAwsS3Bucket.bucket}/output/`,
    },
    resourceConfig: {
        instanceType: "ml.m5.large",
        instanceCount: 1,
        volumeSizeInGb: 30,
        volumeKmsKeyId: exampleAwsKmsKey.arn,
    },
    stoppingCondition: {
        maxRuntimeInSeconds: 3600,
    },
    tensorBoardOutputConfig: {
        localPath: "/opt/ml/output/tensorboard",
        s3OutputPath: pulumi.interpolate`s3://${exampleAwsS3Bucket.bucket}/tensorboard/`,
    },
});
import pulumi
import pulumi_aws as aws
# Training job with KMS-encrypted, GZIP-compressed output, a KMS-encrypted
# volume, checkpointing to S3, and TensorBoard output to S3.
# Output.concat is used instead of f-strings so the S3 URIs are still built
# correctly if the bucket name is a Pulumi Output.
example = aws.sagemaker.TrainingJob("example",
    training_job_name="example",
    role_arn=example_aws_iam_role["arn"],
    algorithm_specification={
        "training_input_mode": "File",
        "training_image": example_aws_sagemaker_prebuilt_ecr_image["registryPath"],
    },
    checkpoint_config={
        "local_path": "/opt/ml/checkpoints",
        "s3_uri": pulumi.Output.concat("s3://", example_aws_s3_bucket["bucket"], "/checkpoints/"),
    },
    output_data_config={
        "compression_type": "GZIP",
        "kms_key_id": example_aws_kms_key["arn"],
        "s3_output_path": pulumi.Output.concat("s3://", example_aws_s3_bucket["bucket"], "/output/"),
    },
    resource_config={
        "instance_type": "ml.m5.large",
        "instance_count": 1,
        "volume_size_in_gb": 30,
        "volume_kms_key_id": example_aws_kms_key["arn"],
    },
    stopping_condition={
        "max_runtime_in_seconds": 3600,
    },
    tensor_board_output_config={
        "local_path": "/opt/ml/output/tensorboard",
        "s3_output_path": pulumi.Output.concat("s3://", example_aws_s3_bucket["bucket"], "/tensorboard/"),
    })
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/sagemaker"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares one SageMaker training job named
// "example" with KMS-encrypted output and volume, S3 checkpoints, and
// TensorBoard output.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		jobArgs := &sagemaker.TrainingJobArgs{
			TrainingJobName: pulumi.String("example"),
			RoleArn:         pulumi.Any(exampleAwsIamRole.Arn),
			AlgorithmSpecification: &sagemaker.TrainingJobAlgorithmSpecificationArgs{
				TrainingInputMode: pulumi.String("File"),
				TrainingImage:     pulumi.Any(exampleAwsSagemakerPrebuiltEcrImage.RegistryPath),
			},
			CheckpointConfig: &sagemaker.TrainingJobCheckpointConfigArgs{
				LocalPath: pulumi.String("/opt/ml/checkpoints"),
				S3Uri:     pulumi.Sprintf("s3://%v/checkpoints/", exampleAwsS3Bucket.Bucket),
			},
			OutputDataConfig: &sagemaker.TrainingJobOutputDataConfigArgs{
				CompressionType: pulumi.String("GZIP"),
				KmsKeyId:        pulumi.Any(exampleAwsKmsKey.Arn),
				S3OutputPath:    pulumi.Sprintf("s3://%v/output/", exampleAwsS3Bucket.Bucket),
			},
			ResourceConfig: &sagemaker.TrainingJobResourceConfigArgs{
				InstanceType:   pulumi.String("ml.m5.large"),
				InstanceCount:  pulumi.Int(1),
				VolumeSizeInGb: pulumi.Int(30),
				VolumeKmsKeyId: pulumi.Any(exampleAwsKmsKey.Arn),
			},
			StoppingCondition: &sagemaker.TrainingJobStoppingConditionArgs{
				MaxRuntimeInSeconds: pulumi.Int(3600),
			},
			TensorBoardOutputConfig: &sagemaker.TrainingJobTensorBoardOutputConfigArgs{
				LocalPath:    pulumi.String("/opt/ml/output/tensorboard"),
				S3OutputPath: pulumi.Sprintf("s3://%v/tensorboard/", exampleAwsS3Bucket.Bucket),
			},
		}
		// The resource handle is not needed; only the error is propagated.
		_, err := sagemaker.NewTrainingJob(ctx, "example", jobArgs)
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
    // Training job with KMS-encrypted, GZIP-compressed output, a KMS-encrypted
    // volume, checkpointing to S3, and TensorBoard output to S3.
    // Output.Format builds the S3 URIs so they resolve correctly when the
    // bucket name is an Output<string> rather than a plain string.
    var example = new Aws.Sagemaker.TrainingJob("example", new()
    {
        TrainingJobName = "example",
        RoleArn = exampleAwsIamRole.Arn,
        AlgorithmSpecification = new Aws.Sagemaker.Inputs.TrainingJobAlgorithmSpecificationArgs
        {
            TrainingInputMode = "File",
            TrainingImage = exampleAwsSagemakerPrebuiltEcrImage.RegistryPath,
        },
        CheckpointConfig = new Aws.Sagemaker.Inputs.TrainingJobCheckpointConfigArgs
        {
            LocalPath = "/opt/ml/checkpoints",
            S3Uri = Output.Format($"s3://{exampleAwsS3Bucket.Bucket}/checkpoints/"),
        },
        OutputDataConfig = new Aws.Sagemaker.Inputs.TrainingJobOutputDataConfigArgs
        {
            CompressionType = "GZIP",
            KmsKeyId = exampleAwsKmsKey.Arn,
            S3OutputPath = Output.Format($"s3://{exampleAwsS3Bucket.Bucket}/output/"),
        },
        ResourceConfig = new Aws.Sagemaker.Inputs.TrainingJobResourceConfigArgs
        {
            InstanceType = "ml.m5.large",
            InstanceCount = 1,
            VolumeSizeInGb = 30,
            VolumeKmsKeyId = exampleAwsKmsKey.Arn,
        },
        StoppingCondition = new Aws.Sagemaker.Inputs.TrainingJobStoppingConditionArgs
        {
            MaxRuntimeInSeconds = 3600,
        },
        TensorBoardOutputConfig = new Aws.Sagemaker.Inputs.TrainingJobTensorBoardOutputConfigArgs
        {
            LocalPath = "/opt/ml/output/tensorboard",
            S3OutputPath = Output.Format($"s3://{exampleAwsS3Bucket.Bucket}/tensorboard/"),
        },
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.sagemaker.TrainingJob;
import com.pulumi.aws.sagemaker.TrainingJobArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobAlgorithmSpecificationArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobCheckpointConfigArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobOutputDataConfigArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobResourceConfigArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobStoppingConditionArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobTensorBoardOutputConfigArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Pulumi program that declares a SageMaker training job named "example" with
 * encrypted output, checkpoints and TensorBoard output.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the training job with KMS-encrypted, GZIP-compressed output,
     * a KMS-encrypted volume, S3 checkpointing and TensorBoard output.
     *
     * @param ctx the Pulumi deployment context supplied by {@code Pulumi.run}
     */
    public static void stack(Context ctx) {
        // Output.format is used for every S3 URI below: it resolves Output-typed
        // arguments, whereas String.format would embed the Output's toString().
        var example = new TrainingJob("example", TrainingJobArgs.builder()
            .trainingJobName("example")
            .roleArn(exampleAwsIamRole.arn())
            .algorithmSpecification(TrainingJobAlgorithmSpecificationArgs.builder()
                .trainingInputMode("File")
                .trainingImage(exampleAwsSagemakerPrebuiltEcrImage.registryPath())
                .build())
            .checkpointConfig(TrainingJobCheckpointConfigArgs.builder()
                .localPath("/opt/ml/checkpoints")
                .s3Uri(Output.format("s3://%s/checkpoints/", exampleAwsS3Bucket.bucket()))
                .build())
            .outputDataConfig(TrainingJobOutputDataConfigArgs.builder()
                .compressionType("GZIP")
                .kmsKeyId(exampleAwsKmsKey.arn())
                .s3OutputPath(Output.format("s3://%s/output/", exampleAwsS3Bucket.bucket()))
                .build())
            .resourceConfig(TrainingJobResourceConfigArgs.builder()
                .instanceType("ml.m5.large")
                .instanceCount(1)
                .volumeSizeInGb(30)
                .volumeKmsKeyId(exampleAwsKmsKey.arn())
                .build())
            .stoppingCondition(TrainingJobStoppingConditionArgs.builder()
                .maxRuntimeInSeconds(3600)
                .build())
            .tensorBoardOutputConfig(TrainingJobTensorBoardOutputConfigArgs.builder()
                .localPath("/opt/ml/output/tensorboard")
                .s3OutputPath(Output.format("s3://%s/tensorboard/", exampleAwsS3Bucket.bucket()))
                .build())
            .build());
    }
}
# SageMaker training job with encrypted output, checkpoints and TensorBoard
# output. Nesting is significant in YAML: every property below must be
# indented under its parent mapping.
resources:
  example:
    type: aws:sagemaker:TrainingJob
    properties:
      trainingJobName: example
      roleArn: ${exampleAwsIamRole.arn}
      algorithmSpecification:
        trainingInputMode: File
        trainingImage: ${exampleAwsSagemakerPrebuiltEcrImage.registryPath}
      checkpointConfig:
        localPath: /opt/ml/checkpoints
        s3Uri: s3://${exampleAwsS3Bucket.bucket}/checkpoints/
      outputDataConfig:
        compressionType: GZIP
        kmsKeyId: ${exampleAwsKmsKey.arn}
        s3OutputPath: s3://${exampleAwsS3Bucket.bucket}/output/
      resourceConfig:
        instanceType: ml.m5.large
        instanceCount: 1
        volumeSizeInGb: 30
        volumeKmsKeyId: ${exampleAwsKmsKey.arn}
      stoppingCondition:
        maxRuntimeInSeconds: 3600
      tensorBoardOutputConfig:
        localPath: /opt/ml/output/tensorboard
        s3OutputPath: s3://${exampleAwsS3Bucket.bucket}/tensorboard/
With Managed Spot Training and Custom Metrics
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Training job with managed spot training, network isolation, inter-container
// traffic encryption, a custom container entrypoint/arguments, and two metric
// definitions extracted by regex.
// `pulumi.interpolate` builds the S3 URI so it resolves correctly when the
// bucket name is an Output rather than a plain string.
const example = new aws.sagemaker.TrainingJob("example", {
    trainingJobName: "example",
    roleArn: exampleAwsIamRole.arn,
    enableManagedSpotTraining: true,
    enableNetworkIsolation: true,
    enableInterContainerTrafficEncryption: true,
    algorithmSpecification: {
        trainingInputMode: "File",
        trainingImage: trainingImage,
        containerEntrypoints: [
            "python",
            "/opt/ml/code/train.py",
        ],
        containerArguments: [
            "--epochs",
            "10",
            "--batch-size",
            "128",
        ],
        metricDefinitions: [
            {
                name: "train:loss",
                regex: "loss: ([0-9\\.]+)",
            },
            {
                name: "validation:accuracy",
                regex: "accuracy: ([0-9\\.]+)",
            },
        ],
    },
    environment: {
        MODEL_DIR: "/opt/ml/model",
        SM_LOG_LEVEL: "20",
    },
    hyperParameters: {
        epochs: "10",
        batch_size: "128",
    },
    outputDataConfig: {
        s3OutputPath: pulumi.interpolate`s3://${exampleAwsS3Bucket.bucket}/output/`,
    },
    resourceConfig: {
        instanceType: "ml.m5.xlarge",
        instanceCount: 1,
        volumeSizeInGb: 50,
        keepAlivePeriodInSeconds: 600,
    },
    retryStrategy: {
        maximumRetryAttempts: 3,
    },
    stoppingCondition: {
        maxRuntimeInSeconds: 3600,
        maxWaitTimeInSeconds: 7200,
    },
    tags: {
        Environment: "test",
        Workload: "training",
    },
});
import pulumi
import pulumi_aws as aws
# Training job with managed spot training, network isolation, inter-container
# traffic encryption, a custom container entrypoint/arguments, and two metric
# definitions extracted by regex.
# Output.concat is used instead of an f-string so the S3 URI is still built
# correctly if the bucket name is a Pulumi Output.
example = aws.sagemaker.TrainingJob("example",
    training_job_name="example",
    role_arn=example_aws_iam_role["arn"],
    enable_managed_spot_training=True,
    enable_network_isolation=True,
    enable_inter_container_traffic_encryption=True,
    algorithm_specification={
        "training_input_mode": "File",
        "training_image": training_image,
        "container_entrypoints": [
            "python",
            "/opt/ml/code/train.py",
        ],
        "container_arguments": [
            "--epochs",
            "10",
            "--batch-size",
            "128",
        ],
        "metric_definitions": [
            {
                "name": "train:loss",
                "regex": "loss: ([0-9\\.]+)",
            },
            {
                "name": "validation:accuracy",
                "regex": "accuracy: ([0-9\\.]+)",
            },
        ],
    },
    environment={
        "MODEL_DIR": "/opt/ml/model",
        "SM_LOG_LEVEL": "20",
    },
    hyper_parameters={
        "epochs": "10",
        "batch_size": "128",
    },
    output_data_config={
        "s3_output_path": pulumi.Output.concat("s3://", example_aws_s3_bucket["bucket"], "/output/"),
    },
    resource_config={
        "instance_type": "ml.m5.xlarge",
        "instance_count": 1,
        "volume_size_in_gb": 50,
        "keep_alive_period_in_seconds": 600,
    },
    retry_strategy={
        "maximum_retry_attempts": 3,
    },
    stopping_condition={
        "max_runtime_in_seconds": 3600,
        "max_wait_time_in_seconds": 7200,
    },
    tags={
        "Environment": "test",
        "Workload": "training",
    })
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/sagemaker"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares one SageMaker training job named
// "example" with managed spot training, a custom container entrypoint, and
// regex-based metric definitions.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Metrics extracted from the training output by regex.
		metrics := sagemaker.TrainingJobAlgorithmSpecificationMetricDefinitionArray{
			&sagemaker.TrainingJobAlgorithmSpecificationMetricDefinitionArgs{
				Name:  pulumi.String("train:loss"),
				Regex: pulumi.String("loss: ([0-9\\.]+)"),
			},
			&sagemaker.TrainingJobAlgorithmSpecificationMetricDefinitionArgs{
				Name:  pulumi.String("validation:accuracy"),
				Regex: pulumi.String("accuracy: ([0-9\\.]+)"),
			},
		}
		jobArgs := &sagemaker.TrainingJobArgs{
			TrainingJobName:                       pulumi.String("example"),
			RoleArn:                               pulumi.Any(exampleAwsIamRole.Arn),
			EnableManagedSpotTraining:             pulumi.Bool(true),
			EnableNetworkIsolation:                pulumi.Bool(true),
			EnableInterContainerTrafficEncryption: pulumi.Bool(true),
			AlgorithmSpecification: &sagemaker.TrainingJobAlgorithmSpecificationArgs{
				TrainingInputMode: pulumi.String("File"),
				TrainingImage:     pulumi.Any(trainingImage),
				ContainerEntrypoints: pulumi.StringArray{
					pulumi.String("python"),
					pulumi.String("/opt/ml/code/train.py"),
				},
				ContainerArguments: pulumi.StringArray{
					pulumi.String("--epochs"),
					pulumi.String("10"),
					pulumi.String("--batch-size"),
					pulumi.String("128"),
				},
				MetricDefinitions: metrics,
			},
			Environment: pulumi.StringMap{
				"MODEL_DIR":    pulumi.String("/opt/ml/model"),
				"SM_LOG_LEVEL": pulumi.String("20"),
			},
			HyperParameters: pulumi.StringMap{
				"epochs":     pulumi.String("10"),
				"batch_size": pulumi.String("128"),
			},
			OutputDataConfig: &sagemaker.TrainingJobOutputDataConfigArgs{
				S3OutputPath: pulumi.Sprintf("s3://%v/output/", exampleAwsS3Bucket.Bucket),
			},
			ResourceConfig: &sagemaker.TrainingJobResourceConfigArgs{
				InstanceType:             pulumi.String("ml.m5.xlarge"),
				InstanceCount:            pulumi.Int(1),
				VolumeSizeInGb:           pulumi.Int(50),
				KeepAlivePeriodInSeconds: pulumi.Int(600),
			},
			RetryStrategy: &sagemaker.TrainingJobRetryStrategyArgs{
				MaximumRetryAttempts: pulumi.Int(3),
			},
			StoppingCondition: &sagemaker.TrainingJobStoppingConditionArgs{
				MaxRuntimeInSeconds:  pulumi.Int(3600),
				MaxWaitTimeInSeconds: pulumi.Int(7200),
			},
			Tags: pulumi.StringMap{
				"Environment": pulumi.String("test"),
				"Workload":    pulumi.String("training"),
			},
		}
		// The resource handle is not needed; only the error is propagated.
		_, err := sagemaker.NewTrainingJob(ctx, "example", jobArgs)
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
    // Training job with managed spot training, network isolation,
    // inter-container traffic encryption, a custom container entrypoint and
    // arguments, and two metric definitions extracted by regex.
    // Output.Format builds the S3 URI so it resolves correctly when the bucket
    // name is an Output<string> rather than a plain string.
    var example = new Aws.Sagemaker.TrainingJob("example", new()
    {
        TrainingJobName = "example",
        RoleArn = exampleAwsIamRole.Arn,
        EnableManagedSpotTraining = true,
        EnableNetworkIsolation = true,
        EnableInterContainerTrafficEncryption = true,
        AlgorithmSpecification = new Aws.Sagemaker.Inputs.TrainingJobAlgorithmSpecificationArgs
        {
            TrainingInputMode = "File",
            TrainingImage = trainingImage,
            ContainerEntrypoints = new[]
            {
                "python",
                "/opt/ml/code/train.py",
            },
            ContainerArguments = new[]
            {
                "--epochs",
                "10",
                "--batch-size",
                "128",
            },
            MetricDefinitions = new[]
            {
                new Aws.Sagemaker.Inputs.TrainingJobAlgorithmSpecificationMetricDefinitionArgs
                {
                    Name = "train:loss",
                    Regex = "loss: ([0-9\\.]+)",
                },
                new Aws.Sagemaker.Inputs.TrainingJobAlgorithmSpecificationMetricDefinitionArgs
                {
                    Name = "validation:accuracy",
                    Regex = "accuracy: ([0-9\\.]+)",
                },
            },
        },
        Environment =
        {
            { "MODEL_DIR", "/opt/ml/model" },
            { "SM_LOG_LEVEL", "20" },
        },
        HyperParameters =
        {
            { "epochs", "10" },
            { "batch_size", "128" },
        },
        OutputDataConfig = new Aws.Sagemaker.Inputs.TrainingJobOutputDataConfigArgs
        {
            S3OutputPath = Output.Format($"s3://{exampleAwsS3Bucket.Bucket}/output/"),
        },
        ResourceConfig = new Aws.Sagemaker.Inputs.TrainingJobResourceConfigArgs
        {
            InstanceType = "ml.m5.xlarge",
            InstanceCount = 1,
            VolumeSizeInGb = 50,
            KeepAlivePeriodInSeconds = 600,
        },
        RetryStrategy = new Aws.Sagemaker.Inputs.TrainingJobRetryStrategyArgs
        {
            MaximumRetryAttempts = 3,
        },
        StoppingCondition = new Aws.Sagemaker.Inputs.TrainingJobStoppingConditionArgs
        {
            MaxRuntimeInSeconds = 3600,
            MaxWaitTimeInSeconds = 7200,
        },
        Tags =
        {
            { "Environment", "test" },
            { "Workload", "training" },
        },
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.sagemaker.TrainingJob;
import com.pulumi.aws.sagemaker.TrainingJobArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobAlgorithmSpecificationArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobAlgorithmSpecificationMetricDefinitionArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobOutputDataConfigArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobResourceConfigArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobRetryStrategyArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobStoppingConditionArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Example Pulumi program that declares one SageMaker training job with managed
 * spot training, network isolation and inter-container traffic encryption
 * enabled, plus a custom container entrypoint, arguments and metric definitions.
 *
 * <p>NOTE(review): {@code exampleAwsIamRole}, {@code trainingImage} and
 * {@code exampleAwsS3Bucket} are not declared in this snippet; they are
 * presumably defined elsewhere in the surrounding program.
 */
public class App {
    /**
     * Program entry point; hands {@link #stack(Context)} to the Pulumi runtime.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the {@code example} training job.
     *
     * @param ctx the Pulumi deployment context (not read by this example)
     */
    public static void stack(Context ctx) {
        var example = new TrainingJob("example", TrainingJobArgs.builder()
            .trainingJobName("example")
            .roleArn(exampleAwsIamRole.arn())
            .enableManagedSpotTraining(true)
            .enableNetworkIsolation(true)
            .enableInterContainerTrafficEncryption(true)
            // Container image plus the command line it is started with.
            .algorithmSpecification(TrainingJobAlgorithmSpecificationArgs.builder()
                .trainingInputMode("File")
                .trainingImage(trainingImage)
                .containerEntrypoints(
                    "python",
                    "/opt/ml/code/train.py")
                .containerArguments(
                    "--epochs",
                    "10",
                    "--batch-size",
                    "128")
                // Each regex captures the number following the "loss: " / "accuracy: " label.
                .metricDefinitions(
                    TrainingJobAlgorithmSpecificationMetricDefinitionArgs.builder()
                        .name("train:loss")
                        .regex("loss: ([0-9\\.]+)")
                        .build(),
                    TrainingJobAlgorithmSpecificationMetricDefinitionArgs.builder()
                        .name("validation:accuracy")
                        .regex("accuracy: ([0-9\\.]+)")
                        .build())
                .build())
            .environment(Map.ofEntries(
                Map.entry("MODEL_DIR", "/opt/ml/model"),
                Map.entry("SM_LOG_LEVEL", "20")
            ))
            .hyperParameters(Map.ofEntries(
                Map.entry("epochs", "10"),
                Map.entry("batch_size", "128")
            ))
            // Output location is the "output/" prefix of the referenced bucket.
            .outputDataConfig(TrainingJobOutputDataConfigArgs.builder()
                .s3OutputPath(String.format("s3://%s/output/", exampleAwsS3Bucket.bucket()))
                .build())
            .resourceConfig(TrainingJobResourceConfigArgs.builder()
                .instanceType("ml.m5.xlarge")
                .instanceCount(1)
                .volumeSizeInGb(50)
                .keepAlivePeriodInSeconds(600)
                .build())
            .retryStrategy(TrainingJobRetryStrategyArgs.builder()
                .maximumRetryAttempts(3)
                .build())
            // Wait time (7200) is set larger than the runtime limit (3600).
            .stoppingCondition(TrainingJobStoppingConditionArgs.builder()
                .maxRuntimeInSeconds(3600)
                .maxWaitTimeInSeconds(7200)
                .build())
            .tags(Map.ofEntries(
                Map.entry("Environment", "test"),
                Map.entry("Workload", "training")
            ))
            .build());
    }
}
# Pulumi YAML program: one SageMaker training job with managed spot training,
# network isolation and inter-container traffic encryption enabled.
# NOTE(review): exampleAwsIamRole, trainingImage and exampleAwsS3Bucket are not
# declared in this snippet; presumably they are defined elsewhere in the program.
resources:
  example:
    type: aws:sagemaker:TrainingJob
    properties:
      trainingJobName: example
      roleArn: ${exampleAwsIamRole.arn}
      enableManagedSpotTraining: true
      enableNetworkIsolation: true
      enableInterContainerTrafficEncryption: true
      # Container image plus the command line it is started with.
      algorithmSpecification:
        trainingInputMode: File
        trainingImage: ${trainingImage}
        containerEntrypoints:
          - python
          - /opt/ml/code/train.py
        containerArguments:
          - --epochs
          - '10'
          - --batch-size
          - '128'
        # Each regex captures the number following the "loss: " / "accuracy: " label.
        metricDefinitions:
          - name: train:loss
            regex: 'loss: ([0-9\.]+)'
          - name: validation:accuracy
            regex: 'accuracy: ([0-9\.]+)'
      # Values are quoted so they stay strings rather than YAML integers.
      environment:
        MODEL_DIR: /opt/ml/model
        SM_LOG_LEVEL: '20'
      hyperParameters:
        epochs: '10'
        batch_size: '128'
      outputDataConfig:
        s3OutputPath: s3://${exampleAwsS3Bucket.bucket}/output/
      resourceConfig:
        instanceType: ml.m5.xlarge
        instanceCount: 1
        volumeSizeInGb: 50
        keepAlivePeriodInSeconds: 600
      retryStrategy:
        maximumRetryAttempts: 3
      # Wait time (7200) is set larger than the runtime limit (3600).
      stoppingCondition:
        maxRuntimeInSeconds: 3600
        maxWaitTimeInSeconds: 7200
      tags:
        Environment: test
        Workload: training
With Multiple Input Channels, Infrastructure Checks, and Session Tag Chaining
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Two CSV input channels, each fully replicated from its own prefix of the
// same bucket. They are named here so the resource declaration stays short.
const trainChannel = {
    channelName: "train",
    contentType: "text/csv",
    inputMode: "File",
    dataSource: {
        s3DataSource: {
            s3DataDistributionType: "FullyReplicated",
            s3DataType: "S3Prefix",
            s3Uri: `s3://${exampleAwsS3Bucket.bucket}/train/`,
        },
    },
};
const validationChannel = {
    channelName: "validation",
    contentType: "text/csv",
    inputMode: "File",
    dataSource: {
        s3DataSource: {
            s3DataDistributionType: "FullyReplicated",
            s3DataType: "S3Prefix",
            s3Uri: `s3://${exampleAwsS3Bucket.bucket}/validation/`,
        },
    },
};

// Training job with both channels, infrastructure checks and session tag
// chaining switched on.
const example = new aws.sagemaker.TrainingJob("example", {
    trainingJobName: "example",
    roleArn: exampleAwsIamRole.arn,
    algorithmSpecification: {
        trainingInputMode: "File",
        trainingImage: exampleAwsSagemakerPrebuiltEcrImage.registryPath,
    },
    inputDataConfigs: [trainChannel, validationChannel],
    infraCheckConfig: { enableInfraCheck: true },
    outputDataConfig: {
        s3OutputPath: `s3://${exampleAwsS3Bucket.bucket}/output/`,
    },
    resourceConfig: {
        instanceType: "ml.m5.large",
        instanceCount: 1,
        volumeSizeInGb: 30,
    },
    sessionChainingConfig: { enableSessionTagChaining: true },
    stoppingCondition: { maxRuntimeInSeconds: 3600 },
});
import pulumi
import pulumi_aws as aws
# Two CSV input channels, each fully replicated from its own prefix of the
# same bucket. They are built first so the resource call below stays compact.
train_channel = {
    "channel_name": "train",
    "content_type": "text/csv",
    "input_mode": "File",
    "data_source": {
        "s3_data_source": {
            "s3_data_distribution_type": "FullyReplicated",
            "s3_data_type": "S3Prefix",
            "s3_uri": f"s3://{example_aws_s3_bucket['bucket']}/train/",
        },
    },
}
validation_channel = {
    "channel_name": "validation",
    "content_type": "text/csv",
    "input_mode": "File",
    "data_source": {
        "s3_data_source": {
            "s3_data_distribution_type": "FullyReplicated",
            "s3_data_type": "S3Prefix",
            "s3_uri": f"s3://{example_aws_s3_bucket['bucket']}/validation/",
        },
    },
}

# Training job with both channels, infrastructure checks and session tag
# chaining switched on.
example = aws.sagemaker.TrainingJob(
    "example",
    training_job_name="example",
    role_arn=example_aws_iam_role["arn"],
    algorithm_specification={
        "training_input_mode": "File",
        "training_image": example_aws_sagemaker_prebuilt_ecr_image["registryPath"],
    },
    input_data_configs=[train_channel, validation_channel],
    infra_check_config={"enable_infra_check": True},
    output_data_config={
        "s3_output_path": f"s3://{example_aws_s3_bucket['bucket']}/output/",
    },
    resource_config={
        "instance_type": "ml.m5.large",
        "instance_count": 1,
        "volume_size_in_gb": 30,
    },
    session_chaining_config={"enable_session_tag_chaining": True},
    stopping_condition={"max_runtime_in_seconds": 3600},
)
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/sagemaker"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main registers the example SageMaker training job (two CSV input channels,
// infrastructure checks and session tag chaining enabled) with the Pulumi engine.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Both channels are fully replicated from their own prefix of the same bucket.
		trainChannel := &sagemaker.TrainingJobInputDataConfigArgs{
			ChannelName: pulumi.String("train"),
			ContentType: pulumi.String("text/csv"),
			InputMode:   pulumi.String("File"),
			DataSource: &sagemaker.TrainingJobInputDataConfigDataSourceArgs{
				S3DataSource: &sagemaker.TrainingJobInputDataConfigDataSourceS3DataSourceArgs{
					S3DataDistributionType: pulumi.String("FullyReplicated"),
					S3DataType:             pulumi.String("S3Prefix"),
					S3Uri:                  pulumi.Sprintf("s3://%v/train/", exampleAwsS3Bucket.Bucket),
				},
			},
		}
		validationChannel := &sagemaker.TrainingJobInputDataConfigArgs{
			ChannelName: pulumi.String("validation"),
			ContentType: pulumi.String("text/csv"),
			InputMode:   pulumi.String("File"),
			DataSource: &sagemaker.TrainingJobInputDataConfigDataSourceArgs{
				S3DataSource: &sagemaker.TrainingJobInputDataConfigDataSourceS3DataSourceArgs{
					S3DataDistributionType: pulumi.String("FullyReplicated"),
					S3DataType:             pulumi.String("S3Prefix"),
					S3Uri:                  pulumi.Sprintf("s3://%v/validation/", exampleAwsS3Bucket.Bucket),
				},
			},
		}

		jobArgs := &sagemaker.TrainingJobArgs{
			TrainingJobName: pulumi.String("example"),
			RoleArn:         pulumi.Any(exampleAwsIamRole.Arn),
			AlgorithmSpecification: &sagemaker.TrainingJobAlgorithmSpecificationArgs{
				TrainingInputMode: pulumi.String("File"),
				TrainingImage:     pulumi.Any(exampleAwsSagemakerPrebuiltEcrImage.RegistryPath),
			},
			InputDataConfigs: sagemaker.TrainingJobInputDataConfigArray{
				trainChannel,
				validationChannel,
			},
			InfraCheckConfig: &sagemaker.TrainingJobInfraCheckConfigArgs{
				EnableInfraCheck: pulumi.Bool(true),
			},
			OutputDataConfig: &sagemaker.TrainingJobOutputDataConfigArgs{
				S3OutputPath: pulumi.Sprintf("s3://%v/output/", exampleAwsS3Bucket.Bucket),
			},
			ResourceConfig: &sagemaker.TrainingJobResourceConfigArgs{
				InstanceType:   pulumi.String("ml.m5.large"),
				InstanceCount:  pulumi.Int(1),
				VolumeSizeInGb: pulumi.Int(30),
			},
			SessionChainingConfig: &sagemaker.TrainingJobSessionChainingConfigArgs{
				EnableSessionTagChaining: pulumi.Bool(true),
			},
			StoppingCondition: &sagemaker.TrainingJobStoppingConditionArgs{
				MaxRuntimeInSeconds: pulumi.Int(3600),
			},
		}

		// The resource handle is not needed afterwards; only the error is propagated.
		_, err := sagemaker.NewTrainingJob(ctx, "example", jobArgs)
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
// Example program: declares a single SageMaker training job named "example" with
// two input channels, infrastructure checks and session tag chaining enabled.
// NOTE(review): exampleAwsIamRole, exampleAwsSagemakerPrebuiltEcrImage and
// exampleAwsS3Bucket are not declared in this snippet — presumably defined
// elsewhere in the program; confirm.
return await Deployment.RunAsync(() =>
{
var example = new Aws.Sagemaker.TrainingJob("example", new()
{
TrainingJobName = "example",
RoleArn = exampleAwsIamRole.Arn,
AlgorithmSpecification = new Aws.Sagemaker.Inputs.TrainingJobAlgorithmSpecificationArgs
{
TrainingInputMode = "File",
TrainingImage = exampleAwsSagemakerPrebuiltEcrImage.RegistryPath,
},
// Two CSV channels, each read from its own prefix of the same bucket.
InputDataConfigs = new[]
{
new Aws.Sagemaker.Inputs.TrainingJobInputDataConfigArgs
{
ChannelName = "train",
ContentType = "text/csv",
InputMode = "File",
DataSource = new Aws.Sagemaker.Inputs.TrainingJobInputDataConfigDataSourceArgs
{
S3DataSource = new Aws.Sagemaker.Inputs.TrainingJobInputDataConfigDataSourceS3DataSourceArgs
{
S3DataDistributionType = "FullyReplicated",
S3DataType = "S3Prefix",
S3Uri = $"s3://{exampleAwsS3Bucket.Bucket}/train/",
},
},
},
new Aws.Sagemaker.Inputs.TrainingJobInputDataConfigArgs
{
ChannelName = "validation",
ContentType = "text/csv",
InputMode = "File",
DataSource = new Aws.Sagemaker.Inputs.TrainingJobInputDataConfigDataSourceArgs
{
S3DataSource = new Aws.Sagemaker.Inputs.TrainingJobInputDataConfigDataSourceS3DataSourceArgs
{
S3DataDistributionType = "FullyReplicated",
S3DataType = "S3Prefix",
S3Uri = $"s3://{exampleAwsS3Bucket.Bucket}/validation/",
},
},
},
},
InfraCheckConfig = new Aws.Sagemaker.Inputs.TrainingJobInfraCheckConfigArgs
{
EnableInfraCheck = true,
},
// Output location is the "output/" prefix of the referenced bucket.
OutputDataConfig = new Aws.Sagemaker.Inputs.TrainingJobOutputDataConfigArgs
{
S3OutputPath = $"s3://{exampleAwsS3Bucket.Bucket}/output/",
},
ResourceConfig = new Aws.Sagemaker.Inputs.TrainingJobResourceConfigArgs
{
InstanceType = "ml.m5.large",
InstanceCount = 1,
VolumeSizeInGb = 30,
},
SessionChainingConfig = new Aws.Sagemaker.Inputs.TrainingJobSessionChainingConfigArgs
{
EnableSessionTagChaining = true,
},
StoppingCondition = new Aws.Sagemaker.Inputs.TrainingJobStoppingConditionArgs
{
MaxRuntimeInSeconds = 3600,
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.sagemaker.TrainingJob;
import com.pulumi.aws.sagemaker.TrainingJobArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobAlgorithmSpecificationArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobInputDataConfigArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobInputDataConfigDataSourceArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobInputDataConfigDataSourceS3DataSourceArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobInfraCheckConfigArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobOutputDataConfigArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobResourceConfigArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobSessionChainingConfigArgs;
import com.pulumi.aws.sagemaker.inputs.TrainingJobStoppingConditionArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Example Pulumi program that declares one SageMaker training job with two CSV
 * input channels, infrastructure checks and session tag chaining enabled.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the {@code example} training job.
     *
     * @param ctx the Pulumi deployment context (not read by this example)
     */
    public static void stack(Context ctx) {
        // Each channel reads from its own prefix of the same bucket.
        final var trainChannel = csvChannel(
            "train",
            String.format("s3://%s/train/", exampleAwsS3Bucket.bucket()));
        final var validationChannel = csvChannel(
            "validation",
            String.format("s3://%s/validation/", exampleAwsS3Bucket.bucket()));

        final var algorithm = TrainingJobAlgorithmSpecificationArgs.builder()
            .trainingInputMode("File")
            .trainingImage(exampleAwsSagemakerPrebuiltEcrImage.registryPath())
            .build();

        final var output = TrainingJobOutputDataConfigArgs.builder()
            .s3OutputPath(String.format("s3://%s/output/", exampleAwsS3Bucket.bucket()))
            .build();

        final var resources = TrainingJobResourceConfigArgs.builder()
            .instanceType("ml.m5.large")
            .instanceCount(1)
            .volumeSizeInGb(30)
            .build();

        var example = new TrainingJob("example", TrainingJobArgs.builder()
            .trainingJobName("example")
            .roleArn(exampleAwsIamRole.arn())
            .algorithmSpecification(algorithm)
            .inputDataConfigs(trainChannel, validationChannel)
            .infraCheckConfig(TrainingJobInfraCheckConfigArgs.builder()
                .enableInfraCheck(true)
                .build())
            .outputDataConfig(output)
            .resourceConfig(resources)
            .sessionChainingConfig(TrainingJobSessionChainingConfigArgs.builder()
                .enableSessionTagChaining(true)
                .build())
            .stoppingCondition(TrainingJobStoppingConditionArgs.builder()
                .maxRuntimeInSeconds(3600)
                .build())
            .build());
    }

    /**
     * Builds a "text/csv" File-mode channel that is fully replicated from an S3 prefix.
     *
     * @param channelName name of the channel
     * @param s3Uri       S3 prefix URI the channel reads from
     * @return the channel configuration
     */
    private static TrainingJobInputDataConfigArgs csvChannel(String channelName, String s3Uri) {
        final var s3Source = TrainingJobInputDataConfigDataSourceS3DataSourceArgs.builder()
            .s3DataDistributionType("FullyReplicated")
            .s3DataType("S3Prefix")
            .s3Uri(s3Uri)
            .build();

        return TrainingJobInputDataConfigArgs.builder()
            .channelName(channelName)
            .contentType("text/csv")
            .inputMode("File")
            .dataSource(TrainingJobInputDataConfigDataSourceArgs.builder()
                .s3DataSource(s3Source)
                .build())
            .build();
    }
}
# Pulumi YAML program: one SageMaker training job with two CSV input channels,
# infrastructure checks and session tag chaining enabled.
# NOTE(review): exampleAwsIamRole, exampleAwsSagemakerPrebuiltEcrImage and
# exampleAwsS3Bucket are not declared in this snippet; presumably they are
# defined elsewhere in the program.
resources:
  example:
    type: aws:sagemaker:TrainingJob
    properties:
      trainingJobName: example
      roleArn: ${exampleAwsIamRole.arn}
      algorithmSpecification:
        trainingInputMode: File
        trainingImage: ${exampleAwsSagemakerPrebuiltEcrImage.registryPath}
      # Each channel reads from its own prefix of the same bucket.
      inputDataConfigs:
        - channelName: train
          contentType: text/csv
          inputMode: File
          dataSource:
            s3DataSource:
              s3DataDistributionType: FullyReplicated
              s3DataType: S3Prefix
              s3Uri: s3://${exampleAwsS3Bucket.bucket}/train/
        - channelName: validation
          contentType: text/csv
          inputMode: File
          dataSource:
            s3DataSource:
              s3DataDistributionType: FullyReplicated
              s3DataType: S3Prefix
              s3Uri: s3://${exampleAwsS3Bucket.bucket}/validation/
      infraCheckConfig:
        enableInfraCheck: true
      outputDataConfig:
        s3OutputPath: s3://${exampleAwsS3Bucket.bucket}/output/
      resourceConfig:
        instanceType: ml.m5.large
        instanceCount: 1
        volumeSizeInGb: 30
      sessionChainingConfig:
        enableSessionTagChaining: true
      stoppingCondition:
        maxRuntimeInSeconds: 3600
Create TrainingJob Resource
Resources are created with functions called constructors. To learn more about declaring and configuring resources, see Resources.
Constructor syntax
new TrainingJob(name: string, args: TrainingJobArgs, opts?: CustomResourceOptions);
@overload
def TrainingJob(resource_name: str,
args: TrainingJobArgs,
opts: Optional[ResourceOptions] = None)
@overload
def TrainingJob(resource_name: str,
opts: Optional[ResourceOptions] = None,
role_arn: Optional[str] = None,
training_job_name: Optional[str] = None,
output_data_config: Optional[TrainingJobOutputDataConfigArgs] = None,
retry_strategy: Optional[TrainingJobRetryStrategyArgs] = None,
delete_model_packages_on_destroy: Optional[bool] = None,
delete_vpc_enis_on_destroy: Optional[bool] = None,
enable_inter_container_traffic_encryption: Optional[bool] = None,
enable_managed_spot_training: Optional[bool] = None,
enable_network_isolation: Optional[bool] = None,
environment: Optional[Mapping[str, str]] = None,
experiment_config: Optional[TrainingJobExperimentConfigArgs] = None,
hyper_parameters: Optional[Mapping[str, str]] = None,
infra_check_config: Optional[TrainingJobInfraCheckConfigArgs] = None,
input_data_configs: Optional[Sequence[TrainingJobInputDataConfigArgs]] = None,
mlflow_config: Optional[TrainingJobMlflowConfigArgs] = None,
model_package_config: Optional[TrainingJobModelPackageConfigArgs] = None,
debug_rule_configurations: Optional[Sequence[TrainingJobDebugRuleConfigurationArgs]] = None,
algorithm_specification: Optional[TrainingJobAlgorithmSpecificationArgs] = None,
serverless_job_config: Optional[TrainingJobServerlessJobConfigArgs] = None,
region: Optional[str] = None,
remote_debug_config: Optional[TrainingJobRemoteDebugConfigArgs] = None,
resource_config: Optional[TrainingJobResourceConfigArgs] = None,
profiler_config: Optional[TrainingJobProfilerConfigArgs] = None,
debug_hook_config: Optional[TrainingJobDebugHookConfigArgs] = None,
profiler_rule_configurations: Optional[Sequence[TrainingJobProfilerRuleConfigurationArgs]] = None,
session_chaining_config: Optional[TrainingJobSessionChainingConfigArgs] = None,
stopping_condition: Optional[TrainingJobStoppingConditionArgs] = None,
tags: Optional[Mapping[str, str]] = None,
tensor_board_output_config: Optional[TrainingJobTensorBoardOutputConfigArgs] = None,
timeouts: Optional[TrainingJobTimeoutsArgs] = None,
checkpoint_config: Optional[TrainingJobCheckpointConfigArgs] = None,
vpc_config: Optional[TrainingJobVpcConfigArgs] = None)
func NewTrainingJob(ctx *Context, name string, args TrainingJobArgs, opts ...ResourceOption) (*TrainingJob, error)
public TrainingJob(string name, TrainingJobArgs args, CustomResourceOptions? opts = null)
public TrainingJob(String name, TrainingJobArgs args)
public TrainingJob(String name, TrainingJobArgs args, CustomResourceOptions options)
type: aws:sagemaker:TrainingJob
properties: # The arguments to resource properties.
options: # Bag of options to control resource's behavior.
Parameters
- name string
- The unique name of the resource.
- args TrainingJobArgs
- The arguments to resource properties.
- opts CustomResourceOptions
- Bag of options to control resource's behavior.
- resource_name str
- The unique name of the resource.
- args TrainingJobArgs
- The arguments to resource properties.
- opts ResourceOptions
- Bag of options to control resource's behavior.
- ctx Context
- Context object for the current deployment.
- name string
- The unique name of the resource.
- args TrainingJobArgs
- The arguments to resource properties.
- opts ResourceOption
- Bag of options to control resource's behavior.
- name string
- The unique name of the resource.
- args TrainingJobArgs
- The arguments to resource properties.
- opts CustomResourceOptions
- Bag of options to control resource's behavior.
- name String
- The unique name of the resource.
- args TrainingJobArgs
- The arguments to resource properties.
- options CustomResourceOptions
- Bag of options to control resource's behavior.
Constructor example
The following reference example uses placeholder values for all input properties.
// Reference skeleton: sets every TrainingJob input property to a placeholder
// ("string", 0 or false). It shows the shape of the arguments only; the values
// are not meant to be deployed as written.
var trainingJobResource = new Aws.Sagemaker.TrainingJob("trainingJobResource", new()
{
RoleArn = "string",
TrainingJobName = "string",
OutputDataConfig = new Aws.Sagemaker.Inputs.TrainingJobOutputDataConfigArgs
{
S3OutputPath = "string",
CompressionType = "string",
KmsKeyId = "string",
},
RetryStrategy = new Aws.Sagemaker.Inputs.TrainingJobRetryStrategyArgs
{
MaximumRetryAttempts = 0,
},
DeleteModelPackagesOnDestroy = false,
DeleteVpcEnisOnDestroy = false,
EnableInterContainerTrafficEncryption = false,
EnableManagedSpotTraining = false,
EnableNetworkIsolation = false,
Environment =
{
{ "string", "string" },
},
ExperimentConfig = new Aws.Sagemaker.Inputs.TrainingJobExperimentConfigArgs
{
ExperimentName = "string",
RunName = "string",
TrialComponentDisplayName = "string",
TrialName = "string",
},
HyperParameters =
{
{ "string", "string" },
},
InfraCheckConfig = new Aws.Sagemaker.Inputs.TrainingJobInfraCheckConfigArgs
{
EnableInfraCheck = false,
},
// Input channels: each entry carries a data source with both a file-system
// and an S3 variant populated here for illustration.
InputDataConfigs = new[]
{
new Aws.Sagemaker.Inputs.TrainingJobInputDataConfigArgs
{
ChannelName = "string",
CompressionType = "string",
ContentType = "string",
DataSource = new Aws.Sagemaker.Inputs.TrainingJobInputDataConfigDataSourceArgs
{
FileSystemDataSource = new Aws.Sagemaker.Inputs.TrainingJobInputDataConfigDataSourceFileSystemDataSourceArgs
{
DirectoryPath = "string",
FileSystemAccessMode = "string",
FileSystemId = "string",
FileSystemType = "string",
},
S3DataSource = new Aws.Sagemaker.Inputs.TrainingJobInputDataConfigDataSourceS3DataSourceArgs
{
S3DataType = "string",
S3Uri = "string",
AttributeNames = new[]
{
"string",
},
HubAccessConfig = new Aws.Sagemaker.Inputs.TrainingJobInputDataConfigDataSourceS3DataSourceHubAccessConfigArgs
{
HubContentArn = "string",
},
InstanceGroupNames = new[]
{
"string",
},
ModelAccessConfig = new Aws.Sagemaker.Inputs.TrainingJobInputDataConfigDataSourceS3DataSourceModelAccessConfigArgs
{
AcceptEula = false,
},
S3DataDistributionType = "string",
},
},
InputMode = "string",
RecordWrapperType = "string",
ShuffleConfig = new Aws.Sagemaker.Inputs.TrainingJobInputDataConfigShuffleConfigArgs
{
Seed = 0,
},
},
},
MlflowConfig = new Aws.Sagemaker.Inputs.TrainingJobMlflowConfigArgs
{
MlflowResourceArn = "string",
MlflowExperimentName = "string",
MlflowRunName = "string",
},
ModelPackageConfig = new Aws.Sagemaker.Inputs.TrainingJobModelPackageConfigArgs
{
ModelPackageGroupArn = "string",
SourceModelPackageArn = "string",
},
DebugRuleConfigurations = new[]
{
new Aws.Sagemaker.Inputs.TrainingJobDebugRuleConfigurationArgs
{
RuleConfigurationName = "string",
RuleEvaluatorImage = "string",
InstanceType = "string",
LocalPath = "string",
RuleParameters =
{
{ "string", "string" },
},
S3OutputPath = "string",
VolumeSizeInGb = 0,
},
},
// Algorithm / container settings.
AlgorithmSpecification = new Aws.Sagemaker.Inputs.TrainingJobAlgorithmSpecificationArgs
{
AlgorithmName = "string",
ContainerArguments = new[]
{
"string",
},
ContainerEntrypoints = new[]
{
"string",
},
EnableSagemakerMetricsTimeSeries = false,
MetricDefinitions = new[]
{
new Aws.Sagemaker.Inputs.TrainingJobAlgorithmSpecificationMetricDefinitionArgs
{
Name = "string",
Regex = "string",
},
},
TrainingImage = "string",
TrainingImageConfig = new Aws.Sagemaker.Inputs.TrainingJobAlgorithmSpecificationTrainingImageConfigArgs
{
TrainingRepositoryAccessMode = "string",
TrainingRepositoryAuthConfig = new Aws.Sagemaker.Inputs.TrainingJobAlgorithmSpecificationTrainingImageConfigTrainingRepositoryAuthConfigArgs
{
TrainingRepositoryCredentialsProviderArn = "string",
},
},
TrainingInputMode = "string",
},
ServerlessJobConfig = new Aws.Sagemaker.Inputs.TrainingJobServerlessJobConfigArgs
{
BaseModelArn = "string",
JobType = "string",
AcceptEula = false,
CustomizationTechnique = "string",
EvaluationType = "string",
EvaluatorArn = "string",
Peft = "string",
},
Region = "string",
RemoteDebugConfig = new Aws.Sagemaker.Inputs.TrainingJobRemoteDebugConfigArgs
{
EnableRemoteDebug = false,
},
// Compute resources: instance groups, placement and storage volume settings.
ResourceConfig = new Aws.Sagemaker.Inputs.TrainingJobResourceConfigArgs
{
InstanceCount = 0,
InstanceGroups = new[]
{
new Aws.Sagemaker.Inputs.TrainingJobResourceConfigInstanceGroupArgs
{
InstanceCount = 0,
InstanceGroupName = "string",
InstanceType = "string",
},
},
InstancePlacementConfig = new Aws.Sagemaker.Inputs.TrainingJobResourceConfigInstancePlacementConfigArgs
{
EnableMultipleJobs = false,
PlacementSpecifications = new[]
{
new Aws.Sagemaker.Inputs.TrainingJobResourceConfigInstancePlacementConfigPlacementSpecificationArgs
{
InstanceCount = 0,
UltraServerId = "string",
},
},
},
InstanceType = "string",
KeepAlivePeriodInSeconds = 0,
TrainingPlanArn = "string",
VolumeKmsKeyId = "string",
VolumeSizeInGb = 0,
},
ProfilerConfig = new Aws.Sagemaker.Inputs.TrainingJobProfilerConfigArgs
{
DisableProfiler = false,
ProfilingIntervalInMilliseconds = 0,
ProfilingParameters =
{
{ "string", "string" },
},
S3OutputPath = "string",
},
DebugHookConfig = new Aws.Sagemaker.Inputs.TrainingJobDebugHookConfigArgs
{
S3OutputPath = "string",
CollectionConfigurations = new[]
{
new Aws.Sagemaker.Inputs.TrainingJobDebugHookConfigCollectionConfigurationArgs
{
CollectionName = "string",
CollectionParameters =
{
{ "string", "string" },
},
},
},
HookParameters =
{
{ "string", "string" },
},
LocalPath = "string",
},
ProfilerRuleConfigurations = new[]
{
new Aws.Sagemaker.Inputs.TrainingJobProfilerRuleConfigurationArgs
{
RuleConfigurationName = "string",
RuleEvaluatorImage = "string",
InstanceType = "string",
LocalPath = "string",
RuleParameters =
{
{ "string", "string" },
},
S3OutputPath = "string",
VolumeSizeInGb = 0,
},
},
SessionChainingConfig = new Aws.Sagemaker.Inputs.TrainingJobSessionChainingConfigArgs
{
EnableSessionTagChaining = false,
},
StoppingCondition = new Aws.Sagemaker.Inputs.TrainingJobStoppingConditionArgs
{
MaxPendingTimeInSeconds = 0,
MaxRuntimeInSeconds = 0,
MaxWaitTimeInSeconds = 0,
},
Tags =
{
{ "string", "string" },
},
TensorBoardOutputConfig = new Aws.Sagemaker.Inputs.TrainingJobTensorBoardOutputConfigArgs
{
S3OutputPath = "string",
LocalPath = "string",
},
Timeouts = new Aws.Sagemaker.Inputs.TrainingJobTimeoutsArgs
{
Create = "string",
Delete = "string",
Update = "string",
},
CheckpointConfig = new Aws.Sagemaker.Inputs.TrainingJobCheckpointConfigArgs
{
S3Uri = "string",
LocalPath = "string",
},
VpcConfig = new Aws.Sagemaker.Inputs.TrainingJobVpcConfigArgs
{
SecurityGroupIds = new[]
{
"string",
},
Subnets = new[]
{
"string",
},
},
});
// Reference skeleton: sets every TrainingJobArgs field to a placeholder
// ("string", 0 or false). It shows the shape of the arguments only; the values
// are not meant to be deployed as written.
// NOTE(review): this is a fragment — ctx comes from an enclosing pulumi.Run
// callback that is not shown, and err is left for the caller to check.
example, err := sagemaker.NewTrainingJob(ctx, "trainingJobResource", &sagemaker.TrainingJobArgs{
RoleArn: pulumi.String("string"),
TrainingJobName: pulumi.String("string"),
OutputDataConfig: &sagemaker.TrainingJobOutputDataConfigArgs{
S3OutputPath: pulumi.String("string"),
CompressionType: pulumi.String("string"),
KmsKeyId: pulumi.String("string"),
},
RetryStrategy: &sagemaker.TrainingJobRetryStrategyArgs{
MaximumRetryAttempts: pulumi.Int(0),
},
DeleteModelPackagesOnDestroy: pulumi.Bool(false),
DeleteVpcEnisOnDestroy: pulumi.Bool(false),
EnableInterContainerTrafficEncryption: pulumi.Bool(false),
EnableManagedSpotTraining: pulumi.Bool(false),
EnableNetworkIsolation: pulumi.Bool(false),
Environment: pulumi.StringMap{
"string": pulumi.String("string"),
},
ExperimentConfig: &sagemaker.TrainingJobExperimentConfigArgs{
ExperimentName: pulumi.String("string"),
RunName: pulumi.String("string"),
TrialComponentDisplayName: pulumi.String("string"),
TrialName: pulumi.String("string"),
},
HyperParameters: pulumi.StringMap{
"string": pulumi.String("string"),
},
InfraCheckConfig: &sagemaker.TrainingJobInfraCheckConfigArgs{
EnableInfraCheck: pulumi.Bool(false),
},
// Input channels: each entry carries a data source with both a file-system
// and an S3 variant populated here for illustration.
InputDataConfigs: sagemaker.TrainingJobInputDataConfigArray{
&sagemaker.TrainingJobInputDataConfigArgs{
ChannelName: pulumi.String("string"),
CompressionType: pulumi.String("string"),
ContentType: pulumi.String("string"),
DataSource: &sagemaker.TrainingJobInputDataConfigDataSourceArgs{
FileSystemDataSource: &sagemaker.TrainingJobInputDataConfigDataSourceFileSystemDataSourceArgs{
DirectoryPath: pulumi.String("string"),
FileSystemAccessMode: pulumi.String("string"),
FileSystemId: pulumi.String("string"),
FileSystemType: pulumi.String("string"),
},
S3DataSource: &sagemaker.TrainingJobInputDataConfigDataSourceS3DataSourceArgs{
S3DataType: pulumi.String("string"),
S3Uri: pulumi.String("string"),
AttributeNames: pulumi.StringArray{
pulumi.String("string"),
},
HubAccessConfig: &sagemaker.TrainingJobInputDataConfigDataSourceS3DataSourceHubAccessConfigArgs{
HubContentArn: pulumi.String("string"),
},
InstanceGroupNames: pulumi.StringArray{
pulumi.String("string"),
},
ModelAccessConfig: &sagemaker.TrainingJobInputDataConfigDataSourceS3DataSourceModelAccessConfigArgs{
AcceptEula: pulumi.Bool(false),
},
S3DataDistributionType: pulumi.String("string"),
},
},
InputMode: pulumi.String("string"),
RecordWrapperType: pulumi.String("string"),
ShuffleConfig: &sagemaker.TrainingJobInputDataConfigShuffleConfigArgs{
Seed: pulumi.Int(0),
},
},
},
MlflowConfig: &sagemaker.TrainingJobMlflowConfigArgs{
MlflowResourceArn: pulumi.String("string"),
MlflowExperimentName: pulumi.String("string"),
MlflowRunName: pulumi.String("string"),
},
ModelPackageConfig: &sagemaker.TrainingJobModelPackageConfigArgs{
ModelPackageGroupArn: pulumi.String("string"),
SourceModelPackageArn: pulumi.String("string"),
},
DebugRuleConfigurations: sagemaker.TrainingJobDebugRuleConfigurationArray{
&sagemaker.TrainingJobDebugRuleConfigurationArgs{
RuleConfigurationName: pulumi.String("string"),
RuleEvaluatorImage: pulumi.String("string"),
InstanceType: pulumi.String("string"),
LocalPath: pulumi.String("string"),
RuleParameters: pulumi.StringMap{
"string": pulumi.String("string"),
},
S3OutputPath: pulumi.String("string"),
VolumeSizeInGb: pulumi.Int(0),
},
},
// Algorithm / container settings.
AlgorithmSpecification: &sagemaker.TrainingJobAlgorithmSpecificationArgs{
AlgorithmName: pulumi.String("string"),
ContainerArguments: pulumi.StringArray{
pulumi.String("string"),
},
ContainerEntrypoints: pulumi.StringArray{
pulumi.String("string"),
},
EnableSagemakerMetricsTimeSeries: pulumi.Bool(false),
MetricDefinitions: sagemaker.TrainingJobAlgorithmSpecificationMetricDefinitionArray{
&sagemaker.TrainingJobAlgorithmSpecificationMetricDefinitionArgs{
Name: pulumi.String("string"),
Regex: pulumi.String("string"),
},
},
TrainingImage: pulumi.String("string"),
TrainingImageConfig: &sagemaker.TrainingJobAlgorithmSpecificationTrainingImageConfigArgs{
TrainingRepositoryAccessMode: pulumi.String("string"),
TrainingRepositoryAuthConfig: &sagemaker.TrainingJobAlgorithmSpecificationTrainingImageConfigTrainingRepositoryAuthConfigArgs{
TrainingRepositoryCredentialsProviderArn: pulumi.String("string"),
},
},
TrainingInputMode: pulumi.String("string"),
},
ServerlessJobConfig: &sagemaker.TrainingJobServerlessJobConfigArgs{
BaseModelArn: pulumi.String("string"),
JobType: pulumi.String("string"),
AcceptEula: pulumi.Bool(false),
CustomizationTechnique: pulumi.String("string"),
EvaluationType: pulumi.String("string"),
EvaluatorArn: pulumi.String("string"),
Peft: pulumi.String("string"),
},
Region: pulumi.String("string"),
RemoteDebugConfig: &sagemaker.TrainingJobRemoteDebugConfigArgs{
EnableRemoteDebug: pulumi.Bool(false),
},
// Compute resources: instance groups, placement and storage volume settings.
ResourceConfig: &sagemaker.TrainingJobResourceConfigArgs{
InstanceCount: pulumi.Int(0),
InstanceGroups: sagemaker.TrainingJobResourceConfigInstanceGroupArray{
&sagemaker.TrainingJobResourceConfigInstanceGroupArgs{
InstanceCount: pulumi.Int(0),
InstanceGroupName: pulumi.String("string"),
InstanceType: pulumi.String("string"),
},
},
InstancePlacementConfig: &sagemaker.TrainingJobResourceConfigInstancePlacementConfigArgs{
EnableMultipleJobs: pulumi.Bool(false),
PlacementSpecifications: sagemaker.TrainingJobResourceConfigInstancePlacementConfigPlacementSpecificationArray{
&sagemaker.TrainingJobResourceConfigInstancePlacementConfigPlacementSpecificationArgs{
InstanceCount: pulumi.Int(0),
UltraServerId: pulumi.String("string"),
},
},
},
InstanceType: pulumi.String("string"),
KeepAlivePeriodInSeconds: pulumi.Int(0),
TrainingPlanArn: pulumi.String("string"),
VolumeKmsKeyId: pulumi.String("string"),
VolumeSizeInGb: pulumi.Int(0),
},
ProfilerConfig: &sagemaker.TrainingJobProfilerConfigArgs{
DisableProfiler: pulumi.Bool(false),
ProfilingIntervalInMilliseconds: pulumi.Int(0),
ProfilingParameters: pulumi.StringMap{
"string": pulumi.String("string"),
},
S3OutputPath: pulumi.String("string"),
},
DebugHookConfig: &sagemaker.TrainingJobDebugHookConfigArgs{
S3OutputPath: pulumi.String("string"),
CollectionConfigurations: sagemaker.TrainingJobDebugHookConfigCollectionConfigurationArray{
&sagemaker.TrainingJobDebugHookConfigCollectionConfigurationArgs{
CollectionName: pulumi.String("string"),
CollectionParameters: pulumi.StringMap{
"string": pulumi.String("string"),
},
},
},
HookParameters: pulumi.StringMap{
"string": pulumi.String("string"),
},
LocalPath: pulumi.String("string"),
},
ProfilerRuleConfigurations: sagemaker.TrainingJobProfilerRuleConfigurationArray{
&sagemaker.TrainingJobProfilerRuleConfigurationArgs{
RuleConfigurationName: pulumi.String("string"),
RuleEvaluatorImage: pulumi.String("string"),
InstanceType: pulumi.String("string"),
LocalPath: pulumi.String("string"),
RuleParameters: pulumi.StringMap{
"string": pulumi.String("string"),
},
S3OutputPath: pulumi.String("string"),
VolumeSizeInGb: pulumi.Int(0),
},
},
SessionChainingConfig: &sagemaker.TrainingJobSessionChainingConfigArgs{
EnableSessionTagChaining: pulumi.Bool(false),
},
StoppingCondition: &sagemaker.TrainingJobStoppingConditionArgs{
MaxPendingTimeInSeconds: pulumi.Int(0),
MaxRuntimeInSeconds: pulumi.Int(0),
MaxWaitTimeInSeconds: pulumi.Int(0),
},
Tags: pulumi.StringMap{
"string": pulumi.String("string"),
},
TensorBoardOutputConfig: &sagemaker.TrainingJobTensorBoardOutputConfigArgs{
S3OutputPath: pulumi.String("string"),
LocalPath: pulumi.String("string"),
},
Timeouts: &sagemaker.TrainingJobTimeoutsArgs{
Create: pulumi.String("string"),
Delete: pulumi.String("string"),
Update: pulumi.String("string"),
},
CheckpointConfig: &sagemaker.TrainingJobCheckpointConfigArgs{
S3Uri: pulumi.String("string"),
LocalPath: pulumi.String("string"),
},
VpcConfig: &sagemaker.TrainingJobVpcConfigArgs{
SecurityGroupIds: pulumi.StringArray{
pulumi.String("string"),
},
Subnets: pulumi.StringArray{
pulumi.String("string"),
},
},
})
var trainingJobResource = new TrainingJob("trainingJobResource", TrainingJobArgs.builder()
.roleArn("string")
.trainingJobName("string")
.outputDataConfig(TrainingJobOutputDataConfigArgs.builder()
.s3OutputPath("string")
.compressionType("string")
.kmsKeyId("string")
.build())
.retryStrategy(TrainingJobRetryStrategyArgs.builder()
.maximumRetryAttempts(0)
.build())
.deleteModelPackagesOnDestroy(false)
.deleteVpcEnisOnDestroy(false)
.enableInterContainerTrafficEncryption(false)
.enableManagedSpotTraining(false)
.enableNetworkIsolation(false)
.environment(Map.of("string", "string"))
.experimentConfig(TrainingJobExperimentConfigArgs.builder()
.experimentName("string")
.runName("string")
.trialComponentDisplayName("string")
.trialName("string")
.build())
.hyperParameters(Map.of("string", "string"))
.infraCheckConfig(TrainingJobInfraCheckConfigArgs.builder()
.enableInfraCheck(false)
.build())
.inputDataConfigs(TrainingJobInputDataConfigArgs.builder()
.channelName("string")
.compressionType("string")
.contentType("string")
.dataSource(TrainingJobInputDataConfigDataSourceArgs.builder()
.fileSystemDataSource(TrainingJobInputDataConfigDataSourceFileSystemDataSourceArgs.builder()
.directoryPath("string")
.fileSystemAccessMode("string")
.fileSystemId("string")
.fileSystemType("string")
.build())
.s3DataSource(TrainingJobInputDataConfigDataSourceS3DataSourceArgs.builder()
.s3DataType("string")
.s3Uri("string")
.attributeNames("string")
.hubAccessConfig(TrainingJobInputDataConfigDataSourceS3DataSourceHubAccessConfigArgs.builder()
.hubContentArn("string")
.build())
.instanceGroupNames("string")
.modelAccessConfig(TrainingJobInputDataConfigDataSourceS3DataSourceModelAccessConfigArgs.builder()
.acceptEula(false)
.build())
.s3DataDistributionType("string")
.build())
.build())
.inputMode("string")
.recordWrapperType("string")
.shuffleConfig(TrainingJobInputDataConfigShuffleConfigArgs.builder()
.seed(0)
.build())
.build())
.mlflowConfig(TrainingJobMlflowConfigArgs.builder()
.mlflowResourceArn("string")
.mlflowExperimentName("string")
.mlflowRunName("string")
.build())
.modelPackageConfig(TrainingJobModelPackageConfigArgs.builder()
.modelPackageGroupArn("string")
.sourceModelPackageArn("string")
.build())
.debugRuleConfigurations(TrainingJobDebugRuleConfigurationArgs.builder()
.ruleConfigurationName("string")
.ruleEvaluatorImage("string")
.instanceType("string")
.localPath("string")
.ruleParameters(Map.of("string", "string"))
.s3OutputPath("string")
.volumeSizeInGb(0)
.build())
.algorithmSpecification(TrainingJobAlgorithmSpecificationArgs.builder()
.algorithmName("string")
.containerArguments("string")
.containerEntrypoints("string")
.enableSagemakerMetricsTimeSeries(false)
.metricDefinitions(TrainingJobAlgorithmSpecificationMetricDefinitionArgs.builder()
.name("string")
.regex("string")
.build())
.trainingImage("string")
.trainingImageConfig(TrainingJobAlgorithmSpecificationTrainingImageConfigArgs.builder()
.trainingRepositoryAccessMode("string")
.trainingRepositoryAuthConfig(TrainingJobAlgorithmSpecificationTrainingImageConfigTrainingRepositoryAuthConfigArgs.builder()
.trainingRepositoryCredentialsProviderArn("string")
.build())
.build())
.trainingInputMode("string")
.build())
.serverlessJobConfig(TrainingJobServerlessJobConfigArgs.builder()
.baseModelArn("string")
.jobType("string")
.acceptEula(false)
.customizationTechnique("string")
.evaluationType("string")
.evaluatorArn("string")
.peft("string")
.build())
.region("string")
.remoteDebugConfig(TrainingJobRemoteDebugConfigArgs.builder()
.enableRemoteDebug(false)
.build())
.resourceConfig(TrainingJobResourceConfigArgs.builder()
.instanceCount(0)
.instanceGroups(TrainingJobResourceConfigInstanceGroupArgs.builder()
.instanceCount(0)
.instanceGroupName("string")
.instanceType("string")
.build())
.instancePlacementConfig(TrainingJobResourceConfigInstancePlacementConfigArgs.builder()
.enableMultipleJobs(false)
.placementSpecifications(TrainingJobResourceConfigInstancePlacementConfigPlacementSpecificationArgs.builder()
.instanceCount(0)
.ultraServerId("string")
.build())
.build())
.instanceType("string")
.keepAlivePeriodInSeconds(0)
.trainingPlanArn("string")
.volumeKmsKeyId("string")
.volumeSizeInGb(0)
.build())
.profilerConfig(TrainingJobProfilerConfigArgs.builder()
.disableProfiler(false)
.profilingIntervalInMilliseconds(0)
.profilingParameters(Map.of("string", "string"))
.s3OutputPath("string")
.build())
.debugHookConfig(TrainingJobDebugHookConfigArgs.builder()
.s3OutputPath("string")
.collectionConfigurations(TrainingJobDebugHookConfigCollectionConfigurationArgs.builder()
.collectionName("string")
.collectionParameters(Map.of("string", "string"))
.build())
.hookParameters(Map.of("string", "string"))
.localPath("string")
.build())
.profilerRuleConfigurations(TrainingJobProfilerRuleConfigurationArgs.builder()
.ruleConfigurationName("string")
.ruleEvaluatorImage("string")
.instanceType("string")
.localPath("string")
.ruleParameters(Map.of("string", "string"))
.s3OutputPath("string")
.volumeSizeInGb(0)
.build())
.sessionChainingConfig(TrainingJobSessionChainingConfigArgs.builder()
.enableSessionTagChaining(false)
.build())
.stoppingCondition(TrainingJobStoppingConditionArgs.builder()
.maxPendingTimeInSeconds(0)
.maxRuntimeInSeconds(0)
.maxWaitTimeInSeconds(0)
.build())
.tags(Map.of("string", "string"))
.tensorBoardOutputConfig(TrainingJobTensorBoardOutputConfigArgs.builder()
.s3OutputPath("string")
.localPath("string")
.build())
.timeouts(TrainingJobTimeoutsArgs.builder()
.create("string")
.delete("string")
.update("string")
.build())
.checkpointConfig(TrainingJobCheckpointConfigArgs.builder()
.s3Uri("string")
.localPath("string")
.build())
.vpcConfig(TrainingJobVpcConfigArgs.builder()
.securityGroupIds("string")
.subnets("string")
.build())
.build());
training_job_resource = aws.sagemaker.TrainingJob("trainingJobResource",
role_arn="string",
training_job_name="string",
output_data_config={
"s3_output_path": "string",
"compression_type": "string",
"kms_key_id": "string",
},
retry_strategy={
"maximum_retry_attempts": 0,
},
delete_model_packages_on_destroy=False,
delete_vpc_enis_on_destroy=False,
enable_inter_container_traffic_encryption=False,
enable_managed_spot_training=False,
enable_network_isolation=False,
environment={
"string": "string",
},
experiment_config={
"experiment_name": "string",
"run_name": "string",
"trial_component_display_name": "string",
"trial_name": "string",
},
hyper_parameters={
"string": "string",
},
infra_check_config={
"enable_infra_check": False,
},
input_data_configs=[{
"channel_name": "string",
"compression_type": "string",
"content_type": "string",
"data_source": {
"file_system_data_source": {
"directory_path": "string",
"file_system_access_mode": "string",
"file_system_id": "string",
"file_system_type": "string",
},
"s3_data_source": {
"s3_data_type": "string",
"s3_uri": "string",
"attribute_names": ["string"],
"hub_access_config": {
"hub_content_arn": "string",
},
"instance_group_names": ["string"],
"model_access_config": {
"accept_eula": False,
},
"s3_data_distribution_type": "string",
},
},
"input_mode": "string",
"record_wrapper_type": "string",
"shuffle_config": {
"seed": 0,
},
}],
mlflow_config={
"mlflow_resource_arn": "string",
"mlflow_experiment_name": "string",
"mlflow_run_name": "string",
},
model_package_config={
"model_package_group_arn": "string",
"source_model_package_arn": "string",
},
debug_rule_configurations=[{
"rule_configuration_name": "string",
"rule_evaluator_image": "string",
"instance_type": "string",
"local_path": "string",
"rule_parameters": {
"string": "string",
},
"s3_output_path": "string",
"volume_size_in_gb": 0,
}],
algorithm_specification={
"algorithm_name": "string",
"container_arguments": ["string"],
"container_entrypoints": ["string"],
"enable_sagemaker_metrics_time_series": False,
"metric_definitions": [{
"name": "string",
"regex": "string",
}],
"training_image": "string",
"training_image_config": {
"training_repository_access_mode": "string",
"training_repository_auth_config": {
"training_repository_credentials_provider_arn": "string",
},
},
"training_input_mode": "string",
},
serverless_job_config={
"base_model_arn": "string",
"job_type": "string",
"accept_eula": False,
"customization_technique": "string",
"evaluation_type": "string",
"evaluator_arn": "string",
"peft": "string",
},
region="string",
remote_debug_config={
"enable_remote_debug": False,
},
resource_config={
"instance_count": 0,
"instance_groups": [{
"instance_count": 0,
"instance_group_name": "string",
"instance_type": "string",
}],
"instance_placement_config": {
"enable_multiple_jobs": False,
"placement_specifications": [{
"instance_count": 0,
"ultra_server_id": "string",
}],
},
"instance_type": "string",
"keep_alive_period_in_seconds": 0,
"training_plan_arn": "string",
"volume_kms_key_id": "string",
"volume_size_in_gb": 0,
},
profiler_config={
"disable_profiler": False,
"profiling_interval_in_milliseconds": 0,
"profiling_parameters": {
"string": "string",
},
"s3_output_path": "string",
},
debug_hook_config={
"s3_output_path": "string",
"collection_configurations": [{
"collection_name": "string",
"collection_parameters": {
"string": "string",
},
}],
"hook_parameters": {
"string": "string",
},
"local_path": "string",
},
profiler_rule_configurations=[{
"rule_configuration_name": "string",
"rule_evaluator_image": "string",
"instance_type": "string",
"local_path": "string",
"rule_parameters": {
"string": "string",
},
"s3_output_path": "string",
"volume_size_in_gb": 0,
}],
session_chaining_config={
"enable_session_tag_chaining": False,
},
stopping_condition={
"max_pending_time_in_seconds": 0,
"max_runtime_in_seconds": 0,
"max_wait_time_in_seconds": 0,
},
tags={
"string": "string",
},
tensor_board_output_config={
"s3_output_path": "string",
"local_path": "string",
},
timeouts={
"create": "string",
"delete": "string",
"update": "string",
},
checkpoint_config={
"s3_uri": "string",
"local_path": "string",
},
vpc_config={
"security_group_ids": ["string"],
"subnets": ["string"],
})
const trainingJobResource = new aws.sagemaker.TrainingJob("trainingJobResource", {
roleArn: "string",
trainingJobName: "string",
outputDataConfig: {
s3OutputPath: "string",
compressionType: "string",
kmsKeyId: "string",
},
retryStrategy: {
maximumRetryAttempts: 0,
},
deleteModelPackagesOnDestroy: false,
deleteVpcEnisOnDestroy: false,
enableInterContainerTrafficEncryption: false,
enableManagedSpotTraining: false,
enableNetworkIsolation: false,
environment: {
string: "string",
},
experimentConfig: {
experimentName: "string",
runName: "string",
trialComponentDisplayName: "string",
trialName: "string",
},
hyperParameters: {
string: "string",
},
infraCheckConfig: {
enableInfraCheck: false,
},
inputDataConfigs: [{
channelName: "string",
compressionType: "string",
contentType: "string",
dataSource: {
fileSystemDataSource: {
directoryPath: "string",
fileSystemAccessMode: "string",
fileSystemId: "string",
fileSystemType: "string",
},
s3DataSource: {
s3DataType: "string",
s3Uri: "string",
attributeNames: ["string"],
hubAccessConfig: {
hubContentArn: "string",
},
instanceGroupNames: ["string"],
modelAccessConfig: {
acceptEula: false,
},
s3DataDistributionType: "string",
},
},
inputMode: "string",
recordWrapperType: "string",
shuffleConfig: {
seed: 0,
},
}],
mlflowConfig: {
mlflowResourceArn: "string",
mlflowExperimentName: "string",
mlflowRunName: "string",
},
modelPackageConfig: {
modelPackageGroupArn: "string",
sourceModelPackageArn: "string",
},
debugRuleConfigurations: [{
ruleConfigurationName: "string",
ruleEvaluatorImage: "string",
instanceType: "string",
localPath: "string",
ruleParameters: {
string: "string",
},
s3OutputPath: "string",
volumeSizeInGb: 0,
}],
algorithmSpecification: {
algorithmName: "string",
containerArguments: ["string"],
containerEntrypoints: ["string"],
enableSagemakerMetricsTimeSeries: false,
metricDefinitions: [{
name: "string",
regex: "string",
}],
trainingImage: "string",
trainingImageConfig: {
trainingRepositoryAccessMode: "string",
trainingRepositoryAuthConfig: {
trainingRepositoryCredentialsProviderArn: "string",
},
},
trainingInputMode: "string",
},
serverlessJobConfig: {
baseModelArn: "string",
jobType: "string",
acceptEula: false,
customizationTechnique: "string",
evaluationType: "string",
evaluatorArn: "string",
peft: "string",
},
region: "string",
remoteDebugConfig: {
enableRemoteDebug: false,
},
resourceConfig: {
instanceCount: 0,
instanceGroups: [{
instanceCount: 0,
instanceGroupName: "string",
instanceType: "string",
}],
instancePlacementConfig: {
enableMultipleJobs: false,
placementSpecifications: [{
instanceCount: 0,
ultraServerId: "string",
}],
},
instanceType: "string",
keepAlivePeriodInSeconds: 0,
trainingPlanArn: "string",
volumeKmsKeyId: "string",
volumeSizeInGb: 0,
},
profilerConfig: {
disableProfiler: false,
profilingIntervalInMilliseconds: 0,
profilingParameters: {
string: "string",
},
s3OutputPath: "string",
},
debugHookConfig: {
s3OutputPath: "string",
collectionConfigurations: [{
collectionName: "string",
collectionParameters: {
string: "string",
},
}],
hookParameters: {
string: "string",
},
localPath: "string",
},
profilerRuleConfigurations: [{
ruleConfigurationName: "string",
ruleEvaluatorImage: "string",
instanceType: "string",
localPath: "string",
ruleParameters: {
string: "string",
},
s3OutputPath: "string",
volumeSizeInGb: 0,
}],
sessionChainingConfig: {
enableSessionTagChaining: false,
},
stoppingCondition: {
maxPendingTimeInSeconds: 0,
maxRuntimeInSeconds: 0,
maxWaitTimeInSeconds: 0,
},
tags: {
string: "string",
},
tensorBoardOutputConfig: {
s3OutputPath: "string",
localPath: "string",
},
timeouts: {
create: "string",
"delete": "string",
update: "string",
},
checkpointConfig: {
s3Uri: "string",
localPath: "string",
},
vpcConfig: {
securityGroupIds: ["string"],
subnets: ["string"],
},
});
type: aws:sagemaker:TrainingJob
properties:
algorithmSpecification:
algorithmName: string
containerArguments:
- string
containerEntrypoints:
- string
enableSagemakerMetricsTimeSeries: false
metricDefinitions:
- name: string
regex: string
trainingImage: string
trainingImageConfig:
trainingRepositoryAccessMode: string
trainingRepositoryAuthConfig:
trainingRepositoryCredentialsProviderArn: string
trainingInputMode: string
checkpointConfig:
localPath: string
s3Uri: string
debugHookConfig:
collectionConfigurations:
- collectionName: string
collectionParameters:
string: string
hookParameters:
string: string
localPath: string
s3OutputPath: string
debugRuleConfigurations:
- instanceType: string
localPath: string
ruleConfigurationName: string
ruleEvaluatorImage: string
ruleParameters:
string: string
s3OutputPath: string
volumeSizeInGb: 0
deleteModelPackagesOnDestroy: false
deleteVpcEnisOnDestroy: false
enableInterContainerTrafficEncryption: false
enableManagedSpotTraining: false
enableNetworkIsolation: false
environment:
string: string
experimentConfig:
experimentName: string
runName: string
trialComponentDisplayName: string
trialName: string
hyperParameters:
string: string
infraCheckConfig:
enableInfraCheck: false
inputDataConfigs:
- channelName: string
compressionType: string
contentType: string
dataSource:
fileSystemDataSource:
directoryPath: string
fileSystemAccessMode: string
fileSystemId: string
fileSystemType: string
s3DataSource:
attributeNames:
- string
hubAccessConfig:
hubContentArn: string
instanceGroupNames:
- string
modelAccessConfig:
acceptEula: false
s3DataDistributionType: string
s3DataType: string
s3Uri: string
inputMode: string
recordWrapperType: string
shuffleConfig:
seed: 0
mlflowConfig:
mlflowExperimentName: string
mlflowResourceArn: string
mlflowRunName: string
modelPackageConfig:
modelPackageGroupArn: string
sourceModelPackageArn: string
outputDataConfig:
compressionType: string
kmsKeyId: string
s3OutputPath: string
profilerConfig:
disableProfiler: false
profilingIntervalInMilliseconds: 0
profilingParameters:
string: string
s3OutputPath: string
profilerRuleConfigurations:
- instanceType: string
localPath: string
ruleConfigurationName: string
ruleEvaluatorImage: string
ruleParameters:
string: string
s3OutputPath: string
volumeSizeInGb: 0
region: string
remoteDebugConfig:
enableRemoteDebug: false
resourceConfig:
instanceCount: 0
instanceGroups:
- instanceCount: 0
instanceGroupName: string
instanceType: string
instancePlacementConfig:
enableMultipleJobs: false
placementSpecifications:
- instanceCount: 0
ultraServerId: string
instanceType: string
keepAlivePeriodInSeconds: 0
trainingPlanArn: string
volumeKmsKeyId: string
volumeSizeInGb: 0
retryStrategy:
maximumRetryAttempts: 0
roleArn: string
serverlessJobConfig:
acceptEula: false
baseModelArn: string
customizationTechnique: string
evaluationType: string
evaluatorArn: string
jobType: string
peft: string
sessionChainingConfig:
enableSessionTagChaining: false
stoppingCondition:
maxPendingTimeInSeconds: 0
maxRuntimeInSeconds: 0
maxWaitTimeInSeconds: 0
tags:
string: string
tensorBoardOutputConfig:
localPath: string
s3OutputPath: string
timeouts:
create: string
delete: string
update: string
trainingJobName: string
vpcConfig:
securityGroupIds:
- string
subnets:
- string
TrainingJob Resource Properties
To learn more about resource properties and how to use them, see Inputs and Outputs in the Architecture and Concepts docs.
Inputs
In Python, inputs that are objects can be passed either as argument classes or as dictionary literals.
The TrainingJob resource accepts the following input properties:
- RoleArn string - ARN of the IAM role that SageMaker AI assumes to perform tasks on your behalf during training.
- TrainingJobName string - Name of the training job. Must be between 1 and 63 characters, start with a letter or number, and contain only letters, numbers, and hyphens.
- Algorithm
Specification TrainingJob Algorithm Specification - Algorithm-related parameters of the training job. See
algorithmSpecificationbelow. Conflicts withserverlessJobConfig. - Checkpoint
Config TrainingJob Checkpoint Config - Location of checkpoints during training. See
checkpointConfigbelow. Conflicts withserverlessJobConfig. - Debug
Hook TrainingConfig Job Debug Hook Config - Configuration for debugging rules. See
debugHookConfigbelow. Conflicts withserverlessJobConfig. - Debug
Rule List<TrainingConfigurations Job Debug Rule Configuration> - List of debug rule configurations. Maximum of 20. See
debugRuleConfigurationsbelow. - Delete
Model boolPackages On Destroy - Whether to delete model packages in the configured model package group when the training job is destroyed. Default is
false. - Delete
Vpc boolEnis On Destroy - Whether to delete detached VPC ENIs SageMaker may leave behind when the training job is destroyed. Default is
false. - Enable
Inter boolContainer Traffic Encryption - Whether to encrypt inter-container traffic. When enabled, communications between containers are encrypted.
- Enable
Managed boolSpot Training - Whether to use managed spot training. Optimizes the cost of training by using Amazon EC2 Spot Instances. Conflicts with
serverlessJobConfig. - Enable
Network boolIsolation - Whether to isolate the training container from the network. No inbound or outbound network calls can be made.
- Environment Dictionary<string, string>
- Map of environment variables to set in the training container. Maximum of 100 entries. Conflicts with
serverlessJobConfig. - Experiment
Config TrainingJob Experiment Config - Associates a SageMaker AI Experiment or Trial to the training job. See
experimentConfigbelow. Conflicts withserverlessJobConfig. - Hyper
Parameters Dictionary<string, string> - Map of hyperparameters for the training algorithm. Maximum of 100 entries.
- Infra
Check TrainingConfig Job Infra Check Config - Infrastructure health check configuration. See
infraCheckConfigbelow. - Input
Data List<TrainingConfigs Job Input Data Config> - List of input data channel configurations for the training job. Maximum of 20. See
inputDataConfigbelow. - Mlflow
Config TrainingJob Mlflow Config - MLflow integration configuration. See
mlflowConfigbelow. - Model
Package TrainingConfig Job Model Package Config - Model package configuration. Requires
`serverlessJobConfig`. See `modelPackageConfig` below.
- OutputDataConfig TrainingJobOutputDataConfig - Location of the output data from the training job. See `outputDataConfig` below.

The following arguments are optional:
- Profiler
Config TrainingJob Profiler Config - Configuration for the profiler. See
profilerConfigbelow. Conflicts withserverlessJobConfig. - Profiler
Rule List<TrainingConfigurations Job Profiler Rule Configuration> - List of profiler rule configurations. Maximum of 20. See
profilerRuleConfigurationsbelow. Conflicts withserverlessJobConfig. - Region string
- Region where this resource will be managed. Defaults to the Region set in the provider configuration.
- Remote
Debug TrainingConfig Job Remote Debug Config - Configuration for remote debugging. See
remoteDebugConfigbelow. - Resource
Config TrainingJob Resource Config - Resources for the training job, including compute instances and storage volumes. See
resourceConfigbelow. - Retry
Strategy TrainingJob Retry Strategy - Number of times to retry the job if it fails. See
retryStrategybelow. Conflicts withserverlessJobConfig. - Serverless
Job TrainingConfig Job Serverless Job Config - Configuration for serverless training jobs using foundation models. Conflicts with
algorithmSpecification,enableManagedSpotTraining,environment,retryStrategy,checkpointConfig,debugHookConfig,experimentConfig,profilerConfig,profilerRuleConfigurations, andtensorBoardOutputConfig. SeeserverlessJobConfigbelow. - Session
Chaining TrainingConfig Job Session Chaining Config - Configuration for session tag chaining. See
`sessionChainingConfig` below.
- StoppingCondition TrainingJobStoppingCondition
- Tags Dictionary<string, string> - Map of tags to assign to the resource. If configured with a provider `defaultTags` configuration block present, tags with matching keys will overwrite those defined at the provider-level.
- TensorBoardOutputConfig TrainingJobTensorBoardOutputConfig - Configuration for TensorBoard output. See `tensorBoardOutputConfig` below. Conflicts with `serverlessJobConfig`.
- Timeouts TrainingJobTimeouts
- VpcConfig TrainingJobVpcConfig - VPC configuration for the training job. See `vpcConfig` below.
- RoleArn string - ARN of the IAM role that SageMaker AI assumes to perform tasks on your behalf during training.
- TrainingJobName string - Name of the training job. Must be between 1 and 63 characters, start with a letter or number, and contain only letters, numbers, and hyphens.
- Algorithm
Specification TrainingJob Algorithm Specification Args - Algorithm-related parameters of the training job. See
algorithmSpecificationbelow. Conflicts withserverlessJobConfig. - Checkpoint
Config TrainingJob Checkpoint Config Args - Location of checkpoints during training. See
checkpointConfigbelow. Conflicts withserverlessJobConfig. - Debug
Hook TrainingConfig Job Debug Hook Config Args - Configuration for debugging rules. See
debugHookConfigbelow. Conflicts withserverlessJobConfig. - Debug
Rule []TrainingConfigurations Job Debug Rule Configuration Args - List of debug rule configurations. Maximum of 20. See
debugRuleConfigurationsbelow. - Delete
Model boolPackages On Destroy - Whether to delete model packages in the configured model package group when the training job is destroyed. Default is
false. - Delete
Vpc boolEnis On Destroy - Whether to delete detached VPC ENIs SageMaker may leave behind when the training job is destroyed. Default is
false. - Enable
Inter boolContainer Traffic Encryption - Whether to encrypt inter-container traffic. When enabled, communications between containers are encrypted.
- Enable
Managed boolSpot Training - Whether to use managed spot training. Optimizes the cost of training by using Amazon EC2 Spot Instances. Conflicts with
serverlessJobConfig. - Enable
Network boolIsolation - Whether to isolate the training container from the network. No inbound or outbound network calls can be made.
- Environment map[string]string
- Map of environment variables to set in the training container. Maximum of 100 entries. Conflicts with
serverlessJobConfig. - Experiment
Config TrainingJob Experiment Config Args - Associates a SageMaker AI Experiment or Trial to the training job. See
experimentConfigbelow. Conflicts withserverlessJobConfig. - Hyper
Parameters map[string]string - Map of hyperparameters for the training algorithm. Maximum of 100 entries.
- Infra
Check TrainingConfig Job Infra Check Config Args - Infrastructure health check configuration. See
infraCheckConfigbelow. - Input
Data []TrainingConfigs Job Input Data Config Args - List of input data channel configurations for the training job. Maximum of 20. See
inputDataConfigbelow. - Mlflow
Config TrainingJob Mlflow Config Args - MLflow integration configuration. See
mlflowConfigbelow. - Model
Package TrainingConfig Job Model Package Config Args - Model package configuration. Requires
`serverlessJobConfig`. See `modelPackageConfig` below.
- OutputDataConfig TrainingJobOutputDataConfigArgs - Location of the output data from the training job. See `outputDataConfig` below.

The following arguments are optional:
- Profiler
Config TrainingJob Profiler Config Args - Configuration for the profiler. See
profilerConfigbelow. Conflicts withserverlessJobConfig. - Profiler
Rule []TrainingConfigurations Job Profiler Rule Configuration Args - List of profiler rule configurations. Maximum of 20. See
profilerRuleConfigurationsbelow. Conflicts withserverlessJobConfig. - Region string
- Region where this resource will be managed. Defaults to the Region set in the provider configuration.
- Remote
Debug TrainingConfig Job Remote Debug Config Args - Configuration for remote debugging. See
remoteDebugConfigbelow. - Resource
Config TrainingJob Resource Config Args - Resources for the training job, including compute instances and storage volumes. See
resourceConfigbelow. - Retry
Strategy TrainingJob Retry Strategy Args - Number of times to retry the job if it fails. See
retryStrategybelow. Conflicts withserverlessJobConfig. - Serverless
Job TrainingConfig Job Serverless Job Config Args - Configuration for serverless training jobs using foundation models. Conflicts with
algorithmSpecification,enableManagedSpotTraining,environment,retryStrategy,checkpointConfig,debugHookConfig,experimentConfig,profilerConfig,profilerRuleConfigurations, andtensorBoardOutputConfig. SeeserverlessJobConfigbelow. - Session
Chaining TrainingConfig Job Session Chaining Config Args - Configuration for session tag chaining. See
`sessionChainingConfig` below.
- StoppingCondition TrainingJobStoppingConditionArgs
- Tags map[string]string - Map of tags to assign to the resource. If configured with a provider `defaultTags` configuration block present, tags with matching keys will overwrite those defined at the provider-level.
- TensorBoardOutputConfig TrainingJobTensorBoardOutputConfigArgs - Configuration for TensorBoard output. See `tensorBoardOutputConfig` below. Conflicts with `serverlessJobConfig`.
- Timeouts TrainingJobTimeoutsArgs
- VpcConfig TrainingJobVpcConfigArgs - VPC configuration for the training job. See `vpcConfig` below.
- roleArn String - ARN of the IAM role that SageMaker AI assumes to perform tasks on your behalf during training.
- trainingJobName String - Name of the training job. Must be between 1 and 63 characters, start with a letter or number, and contain only letters, numbers, and hyphens.
- algorithm
Specification TrainingJob Algorithm Specification - Algorithm-related parameters of the training job. See
algorithmSpecificationbelow. Conflicts withserverlessJobConfig. - checkpoint
Config TrainingJob Checkpoint Config - Location of checkpoints during training. See
checkpointConfigbelow. Conflicts withserverlessJobConfig. - debug
Hook TrainingConfig Job Debug Hook Config - Configuration for debugging rules. See
debugHookConfigbelow. Conflicts withserverlessJobConfig. - debug
Rule List<TrainingConfigurations Job Debug Rule Configuration> - List of debug rule configurations. Maximum of 20. See
debugRuleConfigurationsbelow. - delete
Model BooleanPackages On Destroy - Whether to delete model packages in the configured model package group when the training job is destroyed. Default is
false. - delete
Vpc BooleanEnis On Destroy - Whether to delete detached VPC ENIs SageMaker may leave behind when the training job is destroyed. Default is
false. - enable
Inter BooleanContainer Traffic Encryption - Whether to encrypt inter-container traffic. When enabled, communications between containers are encrypted.
- enable
Managed BooleanSpot Training - Whether to use managed spot training. Optimizes the cost of training by using Amazon EC2 Spot Instances. Conflicts with
serverlessJobConfig. - enable
Network BooleanIsolation - Whether to isolate the training container from the network. No inbound or outbound network calls can be made.
- environment Map<String,String>
- Map of environment variables to set in the training container. Maximum of 100 entries. Conflicts with
serverlessJobConfig. - experiment
Config TrainingJob Experiment Config - Associates a SageMaker AI Experiment or Trial to the training job. See
experimentConfigbelow. Conflicts withserverlessJobConfig. - hyper
Parameters Map<String,String> - Map of hyperparameters for the training algorithm. Maximum of 100 entries.
- infra
Check TrainingConfig Job Infra Check Config - Infrastructure health check configuration. See
infraCheckConfigbelow. - input
Data List<TrainingConfigs Job Input Data Config> - List of input data channel configurations for the training job. Maximum of 20. See
inputDataConfigbelow. - mlflow
Config TrainingJob Mlflow Config - MLflow integration configuration. See
mlflowConfigbelow. - model
Package TrainingConfig Job Model Package Config - Model package configuration. Requires
`serverlessJobConfig`. See `modelPackageConfig` below.
- outputDataConfig TrainingJobOutputDataConfig - Location of the output data from the training job. See `outputDataConfig` below.

The following arguments are optional:
- profiler
Config TrainingJob Profiler Config - Configuration for the profiler. See
profilerConfigbelow. Conflicts withserverlessJobConfig. - profiler
Rule List<TrainingConfigurations Job Profiler Rule Configuration> - List of profiler rule configurations. Maximum of 20. See
profilerRuleConfigurationsbelow. Conflicts withserverlessJobConfig. - region String
- Region where this resource will be managed. Defaults to the Region set in the provider configuration.
- remote
Debug TrainingConfig Job Remote Debug Config - Configuration for remote debugging. See
remoteDebugConfigbelow. - resource
Config TrainingJob Resource Config - Resources for the training job, including compute instances and storage volumes. See
resourceConfigbelow. - retry
Strategy TrainingJob Retry Strategy - Number of times to retry the job if it fails. See
retryStrategybelow. Conflicts withserverlessJobConfig. - serverless
Job TrainingConfig Job Serverless Job Config - Configuration for serverless training jobs using foundation models. Conflicts with
algorithmSpecification,enableManagedSpotTraining,environment,retryStrategy,checkpointConfig,debugHookConfig,experimentConfig,profilerConfig,profilerRuleConfigurations, andtensorBoardOutputConfig. SeeserverlessJobConfigbelow. - session
Chaining TrainingConfig Job Session Chaining Config - Configuration for session tag chaining. See
`sessionChainingConfig` below.
- stoppingCondition TrainingJobStoppingCondition
- tags Map<String,String> - Map of tags to assign to the resource. If configured with a provider `defaultTags` configuration block present, tags with matching keys will overwrite those defined at the provider-level.
- tensorBoardOutputConfig TrainingJobTensorBoardOutputConfig - Configuration for TensorBoard output. See `tensorBoardOutputConfig` below. Conflicts with `serverlessJobConfig`.
- timeouts TrainingJobTimeouts
- vpcConfig TrainingJobVpcConfig - VPC configuration for the training job. See `vpcConfig` below.
- role
Arn string - ARN of the IAM role that SageMaker AI assumes to perform tasks on your behalf during training.
- training
Job stringName - Name of the training job. Must be between 1 and 63 characters, start with a letter or number, and contain only letters, numbers, and hyphens.
- algorithm
Specification TrainingJob Algorithm Specification - Algorithm-related parameters of the training job. See
algorithmSpecificationbelow. Conflicts withserverlessJobConfig. - checkpoint
Config TrainingJob Checkpoint Config - Location of checkpoints during training. See
checkpointConfigbelow. Conflicts withserverlessJobConfig. - debug
Hook TrainingConfig Job Debug Hook Config - Configuration for debugging rules. See
debugHookConfigbelow. Conflicts withserverlessJobConfig. - debug
Rule TrainingConfigurations Job Debug Rule Configuration[] - List of debug rule configurations. Maximum of 20. See
debugRuleConfigurationsbelow. - delete
Model booleanPackages On Destroy - Whether to delete model packages in the configured model package group when the training job is destroyed. Default is
false. - delete
Vpc booleanEnis On Destroy - Whether to delete detached VPC ENIs SageMaker may leave behind when the training job is destroyed. Default is
false. - enable
Inter booleanContainer Traffic Encryption - Whether to encrypt inter-container traffic. When enabled, communications between containers are encrypted.
- enable
Managed booleanSpot Training - Whether to use managed spot training. Optimizes the cost of training by using Amazon EC2 Spot Instances. Conflicts with
serverlessJobConfig. - enable
Network booleanIsolation - Whether to isolate the training container from the network. No inbound or outbound network calls can be made.
- environment {[key: string]: string}
- Map of environment variables to set in the training container. Maximum of 100 entries. Conflicts with
serverlessJobConfig. - experiment
Config TrainingJob Experiment Config - Associates a SageMaker AI Experiment or Trial to the training job. See
experimentConfigbelow. Conflicts withserverlessJobConfig. - hyper
Parameters {[key: string]: string} - Map of hyperparameters for the training algorithm. Maximum of 100 entries.
- infra
Check TrainingConfig Job Infra Check Config - Infrastructure health check configuration. See
infraCheckConfigbelow. - input
Data TrainingConfigs Job Input Data Config[] - List of input data channel configurations for the training job. Maximum of 20. See
inputDataConfigbelow. - mlflow
Config TrainingJob Mlflow Config - MLflow integration configuration. See
mlflowConfigbelow. - model
Package TrainingConfig Job Model Package Config - Model package configuration. Requires
serverlessJobConfig. SeemodelPackageConfigbelow. - output
Data TrainingConfig Job Output Data Config - Location of the output data from the training job. See
outputDataConfig below. The following arguments are optional:
- profiler
Config TrainingJob Profiler Config - Configuration for the profiler. See
profilerConfigbelow. Conflicts withserverlessJobConfig. - profiler
Rule TrainingConfigurations Job Profiler Rule Configuration[] - List of profiler rule configurations. Maximum of 20. See
profilerRuleConfigurationsbelow. Conflicts withserverlessJobConfig. - region string
- Region where this resource will be managed. Defaults to the Region set in the provider configuration.
- remote
Debug TrainingConfig Job Remote Debug Config - Configuration for remote debugging. See
remoteDebugConfigbelow. - resource
Config TrainingJob Resource Config - Resources for the training job, including compute instances and storage volumes. See
resourceConfigbelow. - retry
Strategy TrainingJob Retry Strategy - Number of times to retry the job if it fails. See
retryStrategybelow. Conflicts withserverlessJobConfig. - serverless
Job TrainingConfig Job Serverless Job Config - Configuration for serverless training jobs using foundation models. Conflicts with
algorithmSpecification, enableManagedSpotTraining, environment, retryStrategy, checkpointConfig, debugHookConfig, experimentConfig, profilerConfig, profilerRuleConfigurations, and tensorBoardOutputConfig. See serverlessJobConfig below. - session
Chaining TrainingConfig Job Session Chaining Config - Configuration for session tag chaining. See
sessionChainingConfigbelow. - stopping
Condition TrainingJob Stopping Condition - tags {[key: string]: string}
- Map of tags to assign to the resource. If configured with a provider
defaultTags configuration block present, tags with matching keys will overwrite those defined at the provider-level. - tensor
Board TrainingOutput Config Job Tensor Board Output Config - Configuration for TensorBoard output. See
tensorBoardOutputConfigbelow. Conflicts withserverlessJobConfig. - timeouts
Training
Job Timeouts - vpc
Config TrainingJob Vpc Config - VPC configuration for the training job. See
vpcConfigbelow.
- role_
arn str - ARN of the IAM role that SageMaker AI assumes to perform tasks on your behalf during training.
- training_
job_ strname - Name of the training job. Must be between 1 and 63 characters, start with a letter or number, and contain only letters, numbers, and hyphens.
- algorithm_
specification TrainingJob Algorithm Specification Args - Algorithm-related parameters of the training job. See
algorithmSpecificationbelow. Conflicts withserverlessJobConfig. - checkpoint_
config TrainingJob Checkpoint Config Args - Location of checkpoints during training. See
checkpointConfigbelow. Conflicts withserverlessJobConfig. - debug_
hook_ Trainingconfig Job Debug Hook Config Args - Configuration for debugging rules. See
debugHookConfigbelow. Conflicts withserverlessJobConfig. - debug_
rule_ Sequence[Trainingconfigurations Job Debug Rule Configuration Args] - List of debug rule configurations. Maximum of 20. See
debugRuleConfigurationsbelow. - delete_
model_ boolpackages_ on_ destroy - Whether to delete model packages in the configured model package group when the training job is destroyed. Default is
false. - delete_
vpc_ boolenis_ on_ destroy - Whether to delete detached VPC ENIs SageMaker may leave behind when the training job is destroyed. Default is
false. - enable_
inter_ boolcontainer_ traffic_ encryption - Whether to encrypt inter-container traffic. When enabled, communications between containers are encrypted.
- enable_
managed_ boolspot_ training - Whether to use managed spot training. Optimizes the cost of training by using Amazon EC2 Spot Instances. Conflicts with
serverlessJobConfig. - enable_
network_ boolisolation - Whether to isolate the training container from the network. No inbound or outbound network calls can be made.
- environment Mapping[str, str]
- Map of environment variables to set in the training container. Maximum of 100 entries. Conflicts with
serverlessJobConfig. - experiment_
config TrainingJob Experiment Config Args - Associates a SageMaker AI Experiment or Trial to the training job. See
experimentConfigbelow. Conflicts withserverlessJobConfig. - hyper_
parameters Mapping[str, str] - Map of hyperparameters for the training algorithm. Maximum of 100 entries.
- infra_
check_ Trainingconfig Job Infra Check Config Args - Infrastructure health check configuration. See
infraCheckConfigbelow. - input_
data_ Sequence[Trainingconfigs Job Input Data Config Args] - List of input data channel configurations for the training job. Maximum of 20. See
inputDataConfigbelow. - mlflow_
config TrainingJob Mlflow Config Args - MLflow integration configuration. See
mlflowConfigbelow. - model_
package_ Trainingconfig Job Model Package Config Args - Model package configuration. Requires
serverlessJobConfig. SeemodelPackageConfigbelow. - output_
data_ Trainingconfig Job Output Data Config Args - Location of the output data from the training job. See
outputDataConfig below. The following arguments are optional:
- profiler_
config TrainingJob Profiler Config Args - Configuration for the profiler. See
profilerConfigbelow. Conflicts withserverlessJobConfig. - profiler_
rule_ Sequence[Trainingconfigurations Job Profiler Rule Configuration Args] - List of profiler rule configurations. Maximum of 20. See
profilerRuleConfigurationsbelow. Conflicts withserverlessJobConfig. - region str
- Region where this resource will be managed. Defaults to the Region set in the provider configuration.
- remote_
debug_ Trainingconfig Job Remote Debug Config Args - Configuration for remote debugging. See
remoteDebugConfigbelow. - resource_
config TrainingJob Resource Config Args - Resources for the training job, including compute instances and storage volumes. See
resourceConfigbelow. - retry_
strategy TrainingJob Retry Strategy Args - Number of times to retry the job if it fails. See
retryStrategybelow. Conflicts withserverlessJobConfig. - serverless_
job_ Trainingconfig Job Serverless Job Config Args - Configuration for serverless training jobs using foundation models. Conflicts with
algorithmSpecification, enableManagedSpotTraining, environment, retryStrategy, checkpointConfig, debugHookConfig, experimentConfig, profilerConfig, profilerRuleConfigurations, and tensorBoardOutputConfig. See serverlessJobConfig below. - session_
chaining_ Trainingconfig Job Session Chaining Config Args - Configuration for session tag chaining. See
sessionChainingConfigbelow. - stopping_
condition TrainingJob Stopping Condition Args - tags Mapping[str, str]
- Map of tags to assign to the resource. If configured with a provider
defaultTags configuration block present, tags with matching keys will overwrite those defined at the provider-level. - tensor_
board_ Trainingoutput_ config Job Tensor Board Output Config Args - Configuration for TensorBoard output. See
tensorBoardOutputConfigbelow. Conflicts withserverlessJobConfig. - timeouts
Training
Job Timeouts Args - vpc_
config TrainingJob Vpc Config Args - VPC configuration for the training job. See
vpcConfigbelow.
- role
Arn String - ARN of the IAM role that SageMaker AI assumes to perform tasks on your behalf during training.
- training
Job StringName - Name of the training job. Must be between 1 and 63 characters, start with a letter or number, and contain only letters, numbers, and hyphens.
- algorithm
Specification Property Map - Algorithm-related parameters of the training job. See
algorithmSpecificationbelow. Conflicts withserverlessJobConfig. - checkpoint
Config Property Map - Location of checkpoints during training. See
checkpointConfigbelow. Conflicts withserverlessJobConfig. - debug
Hook Property MapConfig - Configuration for debugging rules. See
debugHookConfigbelow. Conflicts withserverlessJobConfig. - debug
Rule List<Property Map>Configurations - List of debug rule configurations. Maximum of 20. See
debugRuleConfigurationsbelow. - delete
Model BooleanPackages On Destroy - Whether to delete model packages in the configured model package group when the training job is destroyed. Default is
false. - delete
Vpc BooleanEnis On Destroy - Whether to delete detached VPC ENIs SageMaker may leave behind when the training job is destroyed. Default is
false. - enable
Inter BooleanContainer Traffic Encryption - Whether to encrypt inter-container traffic. When enabled, communications between containers are encrypted.
- enable
Managed BooleanSpot Training - Whether to use managed spot training. Optimizes the cost of training by using Amazon EC2 Spot Instances. Conflicts with
serverlessJobConfig. - enable
Network BooleanIsolation - Whether to isolate the training container from the network. No inbound or outbound network calls can be made.
- environment Map<String>
- Map of environment variables to set in the training container. Maximum of 100 entries. Conflicts with
serverlessJobConfig. - experiment
Config Property Map - Associates a SageMaker AI Experiment or Trial to the training job. See
experimentConfigbelow. Conflicts withserverlessJobConfig. - hyper
Parameters Map<String> - Map of hyperparameters for the training algorithm. Maximum of 100 entries.
- infra
Check Property MapConfig - Infrastructure health check configuration. See
infraCheckConfigbelow. - input
Data List<Property Map>Configs - List of input data channel configurations for the training job. Maximum of 20. See
inputDataConfigbelow. - mlflow
Config Property Map - MLflow integration configuration. See
mlflowConfigbelow. - model
Package Property MapConfig - Model package configuration. Requires
serverlessJobConfig. SeemodelPackageConfigbelow. - output
Data Property MapConfig - Location of the output data from the training job. See
outputDataConfig below. The following arguments are optional:
- profiler
Config Property Map - Configuration for the profiler. See
profilerConfigbelow. Conflicts withserverlessJobConfig. - profiler
Rule List<Property Map>Configurations - List of profiler rule configurations. Maximum of 20. See
profilerRuleConfigurationsbelow. Conflicts withserverlessJobConfig. - region String
- Region where this resource will be managed. Defaults to the Region set in the provider configuration.
- remote
Debug Property MapConfig - Configuration for remote debugging. See
remoteDebugConfigbelow. - resource
Config Property Map - Resources for the training job, including compute instances and storage volumes. See
resourceConfigbelow. - retry
Strategy Property Map - Number of times to retry the job if it fails. See
retryStrategybelow. Conflicts withserverlessJobConfig. - serverless
Job Property MapConfig - Configuration for serverless training jobs using foundation models. Conflicts with
algorithmSpecification, enableManagedSpotTraining, environment, retryStrategy, checkpointConfig, debugHookConfig, experimentConfig, profilerConfig, profilerRuleConfigurations, and tensorBoardOutputConfig. See serverlessJobConfig below. - session
Chaining Property MapConfig - Configuration for session tag chaining. See
sessionChainingConfigbelow. - stopping
Condition Property Map - tags Map<String>
- Map of tags to assign to the resource. If configured with a provider
defaultTags configuration block present, tags with matching keys will overwrite those defined at the provider-level. - tensor
Board Property MapOutput Config - Configuration for TensorBoard output. See
tensorBoardOutputConfigbelow. Conflicts withserverlessJobConfig. - timeouts Property Map
- vpc
Config Property Map - VPC configuration for the training job. See
vpcConfigbelow.
Outputs
All input properties are implicitly available as output properties. Additionally, the TrainingJob resource produces the following output properties:
Look up Existing TrainingJob Resource
Get an existing TrainingJob resource’s state with the given name, ID, and optional extra properties used to qualify the lookup.
public static get(name: string, id: Input<ID>, state?: TrainingJobState, opts?: CustomResourceOptions): TrainingJob
@staticmethod
def get(resource_name: str,
id: str,
opts: Optional[ResourceOptions] = None,
algorithm_specification: Optional[TrainingJobAlgorithmSpecificationArgs] = None,
arn: Optional[str] = None,
checkpoint_config: Optional[TrainingJobCheckpointConfigArgs] = None,
debug_hook_config: Optional[TrainingJobDebugHookConfigArgs] = None,
debug_rule_configurations: Optional[Sequence[TrainingJobDebugRuleConfigurationArgs]] = None,
delete_model_packages_on_destroy: Optional[bool] = None,
delete_vpc_enis_on_destroy: Optional[bool] = None,
enable_inter_container_traffic_encryption: Optional[bool] = None,
enable_managed_spot_training: Optional[bool] = None,
enable_network_isolation: Optional[bool] = None,
environment: Optional[Mapping[str, str]] = None,
experiment_config: Optional[TrainingJobExperimentConfigArgs] = None,
hyper_parameters: Optional[Mapping[str, str]] = None,
infra_check_config: Optional[TrainingJobInfraCheckConfigArgs] = None,
input_data_configs: Optional[Sequence[TrainingJobInputDataConfigArgs]] = None,
mlflow_config: Optional[TrainingJobMlflowConfigArgs] = None,
model_package_config: Optional[TrainingJobModelPackageConfigArgs] = None,
output_data_config: Optional[TrainingJobOutputDataConfigArgs] = None,
profiler_config: Optional[TrainingJobProfilerConfigArgs] = None,
profiler_rule_configurations: Optional[Sequence[TrainingJobProfilerRuleConfigurationArgs]] = None,
region: Optional[str] = None,
remote_debug_config: Optional[TrainingJobRemoteDebugConfigArgs] = None,
resource_config: Optional[TrainingJobResourceConfigArgs] = None,
retry_strategy: Optional[TrainingJobRetryStrategyArgs] = None,
role_arn: Optional[str] = None,
serverless_job_config: Optional[TrainingJobServerlessJobConfigArgs] = None,
session_chaining_config: Optional[TrainingJobSessionChainingConfigArgs] = None,
stopping_condition: Optional[TrainingJobStoppingConditionArgs] = None,
tags: Optional[Mapping[str, str]] = None,
tags_all: Optional[Mapping[str, str]] = None,
tensor_board_output_config: Optional[TrainingJobTensorBoardOutputConfigArgs] = None,
timeouts: Optional[TrainingJobTimeoutsArgs] = None,
training_job_name: Optional[str] = None,
vpc_config: Optional[TrainingJobVpcConfigArgs] = None) -> TrainingJob
func GetTrainingJob(ctx *Context, name string, id IDInput, state *TrainingJobState, opts ...ResourceOption) (*TrainingJob, error)
public static TrainingJob Get(string name, Input<string> id, TrainingJobState? state, CustomResourceOptions? opts = null)
public static TrainingJob get(String name, Output<String> id, TrainingJobState state, CustomResourceOptions options)
resources:
  _:
    type: aws:sagemaker:TrainingJob
    get:
      id: ${id}
- name
- The unique name of the resulting resource.
- id
- The unique provider ID of the resource to lookup.
- state
- Any extra arguments used during the lookup.
- opts
- A bag of options that control this resource's behavior.
- resource_name
- The unique name of the resulting resource.
- id
- The unique provider ID of the resource to lookup.
- name
- The unique name of the resulting resource.
- id
- The unique provider ID of the resource to lookup.
- state
- Any extra arguments used during the lookup.
- opts
- A bag of options that control this resource's behavior.
- name
- The unique name of the resulting resource.
- id
- The unique provider ID of the resource to lookup.
- state
- Any extra arguments used during the lookup.
- opts
- A bag of options that control this resource's behavior.
- name
- The unique name of the resulting resource.
- id
- The unique provider ID of the resource to lookup.
- state
- Any extra arguments used during the lookup.
- opts
- A bag of options that control this resource's behavior.
- Algorithm
Specification TrainingJob Algorithm Specification - Algorithm-related parameters of the training job. See
algorithmSpecificationbelow. Conflicts withserverlessJobConfig. - Arn string
- ARN of the Training Job.
- Checkpoint
Config TrainingJob Checkpoint Config - Location of checkpoints during training. See
checkpointConfigbelow. Conflicts withserverlessJobConfig. - Debug
Hook TrainingConfig Job Debug Hook Config - Configuration for debugging rules. See
debugHookConfigbelow. Conflicts withserverlessJobConfig. - Debug
Rule List<TrainingConfigurations Job Debug Rule Configuration> - List of debug rule configurations. Maximum of 20. See
debugRuleConfigurationsbelow. - Delete
Model boolPackages On Destroy - Whether to delete model packages in the configured model package group when the training job is destroyed. Default is
false. - Delete
Vpc boolEnis On Destroy - Whether to delete detached VPC ENIs SageMaker may leave behind when the training job is destroyed. Default is
false. - Enable
Inter boolContainer Traffic Encryption - Whether to encrypt inter-container traffic. When enabled, communications between containers are encrypted.
- Enable
Managed boolSpot Training - Whether to use managed spot training. Optimizes the cost of training by using Amazon EC2 Spot Instances. Conflicts with
serverlessJobConfig. - Enable
Network boolIsolation - Whether to isolate the training container from the network. No inbound or outbound network calls can be made.
- Environment Dictionary<string, string>
- Map of environment variables to set in the training container. Maximum of 100 entries. Conflicts with
serverlessJobConfig. - Experiment
Config TrainingJob Experiment Config - Associates a SageMaker AI Experiment or Trial to the training job. See
experimentConfigbelow. Conflicts withserverlessJobConfig. - Hyper
Parameters Dictionary<string, string> - Map of hyperparameters for the training algorithm. Maximum of 100 entries.
- Infra
Check TrainingConfig Job Infra Check Config - Infrastructure health check configuration. See
infraCheckConfigbelow. - Input
Data List<TrainingConfigs Job Input Data Config> - List of input data channel configurations for the training job. Maximum of 20. See
inputDataConfigbelow. - Mlflow
Config TrainingJob Mlflow Config - MLflow integration configuration. See
mlflowConfigbelow. - Model
Package TrainingConfig Job Model Package Config - Model package configuration. Requires
serverlessJobConfig. SeemodelPackageConfigbelow. - Output
Data TrainingConfig Job Output Data Config - Location of the output data from the training job. See
outputDataConfig below. The following arguments are optional:
- Profiler
Config TrainingJob Profiler Config - Configuration for the profiler. See
profilerConfigbelow. Conflicts withserverlessJobConfig. - Profiler
Rule List<TrainingConfigurations Job Profiler Rule Configuration> - List of profiler rule configurations. Maximum of 20. See
profilerRuleConfigurationsbelow. Conflicts withserverlessJobConfig. - Region string
- Region where this resource will be managed. Defaults to the Region set in the provider configuration.
- Remote
Debug TrainingConfig Job Remote Debug Config - Configuration for remote debugging. See
remoteDebugConfigbelow. - Resource
Config TrainingJob Resource Config - Resources for the training job, including compute instances and storage volumes. See
resourceConfigbelow. - Retry
Strategy TrainingJob Retry Strategy - Number of times to retry the job if it fails. See
retryStrategybelow. Conflicts withserverlessJobConfig. - Role
Arn string - ARN of the IAM role that SageMaker AI assumes to perform tasks on your behalf during training.
- Serverless
Job TrainingConfig Job Serverless Job Config - Configuration for serverless training jobs using foundation models. Conflicts with
algorithmSpecification, enableManagedSpotTraining, environment, retryStrategy, checkpointConfig, debugHookConfig, experimentConfig, profilerConfig, profilerRuleConfigurations, and tensorBoardOutputConfig. See serverlessJobConfig below. - Session
Chaining TrainingConfig Job Session Chaining Config - Configuration for session tag chaining. See
sessionChainingConfigbelow. - Stopping
Condition TrainingJob Stopping Condition - Tags Dictionary<string, string>
- Map of tags to assign to the resource. If configured with a provider
defaultTags configuration block present, tags with matching keys will overwrite those defined at the provider-level. - TagsAll Dictionary<string, string>
- Map of tags assigned to the resource, including those inherited from the provider
defaultTags configuration block. - Tensor
Board TrainingOutput Config Job Tensor Board Output Config - Configuration for TensorBoard output. See
tensorBoardOutputConfigbelow. Conflicts withserverlessJobConfig. - Timeouts
Training
Job Timeouts - Training
Job stringName - Name of the training job. Must be between 1 and 63 characters, start with a letter or number, and contain only letters, numbers, and hyphens.
- Vpc
Config TrainingJob Vpc Config - VPC configuration for the training job. See
vpcConfigbelow.
- Algorithm
Specification TrainingJob Algorithm Specification Args - Algorithm-related parameters of the training job. See
algorithmSpecificationbelow. Conflicts withserverlessJobConfig. - Arn string
- ARN of the Training Job.
- Checkpoint
Config TrainingJob Checkpoint Config Args - Location of checkpoints during training. See
checkpointConfigbelow. Conflicts withserverlessJobConfig. - Debug
Hook TrainingConfig Job Debug Hook Config Args - Configuration for debugging rules. See
debugHookConfigbelow. Conflicts withserverlessJobConfig. - Debug
Rule []TrainingConfigurations Job Debug Rule Configuration Args - List of debug rule configurations. Maximum of 20. See
debugRuleConfigurationsbelow. - Delete
Model boolPackages On Destroy - Whether to delete model packages in the configured model package group when the training job is destroyed. Default is
false. - Delete
Vpc boolEnis On Destroy - Whether to delete detached VPC ENIs SageMaker may leave behind when the training job is destroyed. Default is
false. - Enable
Inter boolContainer Traffic Encryption - Whether to encrypt inter-container traffic. When enabled, communications between containers are encrypted.
- Enable
Managed boolSpot Training - Whether to use managed spot training. Optimizes the cost of training by using Amazon EC2 Spot Instances. Conflicts with
serverlessJobConfig. - Enable
Network boolIsolation - Whether to isolate the training container from the network. No inbound or outbound network calls can be made.
- Environment map[string]string
- Map of environment variables to set in the training container. Maximum of 100 entries. Conflicts with
serverlessJobConfig. - Experiment
Config TrainingJob Experiment Config Args - Associates a SageMaker AI Experiment or Trial to the training job. See
experimentConfigbelow. Conflicts withserverlessJobConfig. - Hyper
Parameters map[string]string - Map of hyperparameters for the training algorithm. Maximum of 100 entries.
- Infra
Check TrainingConfig Job Infra Check Config Args - Infrastructure health check configuration. See
infraCheckConfigbelow. - Input
Data []TrainingConfigs Job Input Data Config Args - List of input data channel configurations for the training job. Maximum of 20. See
inputDataConfigbelow. - Mlflow
Config TrainingJob Mlflow Config Args - MLflow integration configuration. See
mlflowConfigbelow. - Model
Package TrainingConfig Job Model Package Config Args - Model package configuration. Requires
serverlessJobConfig. SeemodelPackageConfigbelow. - Output
Data TrainingConfig Job Output Data Config Args - Location of the output data from the training job. See
outputDataConfig below. The following arguments are optional:
- Profiler
Config TrainingJob Profiler Config Args - Configuration for the profiler. See
profilerConfigbelow. Conflicts withserverlessJobConfig. - Profiler
Rule []TrainingConfigurations Job Profiler Rule Configuration Args - List of profiler rule configurations. Maximum of 20. See
profilerRuleConfigurationsbelow. Conflicts withserverlessJobConfig. - Region string
- Region where this resource will be managed. Defaults to the Region set in the provider configuration.
- Remote
Debug TrainingConfig Job Remote Debug Config Args - Configuration for remote debugging. See
remoteDebugConfigbelow. - Resource
Config TrainingJob Resource Config Args - Resources for the training job, including compute instances and storage volumes. See
resourceConfigbelow. - Retry
Strategy TrainingJob Retry Strategy Args - Number of times to retry the job if it fails. See
retryStrategybelow. Conflicts withserverlessJobConfig. - Role
Arn string - ARN of the IAM role that SageMaker AI assumes to perform tasks on your behalf during training.
- Serverless
Job TrainingConfig Job Serverless Job Config Args - Configuration for serverless training jobs using foundation models. Conflicts with
algorithmSpecification, enableManagedSpotTraining, environment, retryStrategy, checkpointConfig, debugHookConfig, experimentConfig, profilerConfig, profilerRuleConfigurations, and tensorBoardOutputConfig. See serverlessJobConfig below. - Session
Chaining TrainingConfig Job Session Chaining Config Args - Configuration for session tag chaining. See
sessionChainingConfigbelow. - Stopping
Condition TrainingJob Stopping Condition Args - Tags map[string]string
- Map of tags to assign to the resource. If configured with a provider
defaultTags configuration block present, tags with matching keys will overwrite those defined at the provider-level. - TagsAll map[string]string
- Map of tags assigned to the resource, including those inherited from the provider
defaultTags configuration block. - Tensor
Board TrainingOutput Config Job Tensor Board Output Config Args - Configuration for TensorBoard output. See
tensorBoardOutputConfigbelow. Conflicts withserverlessJobConfig. - Timeouts
Training
Job Timeouts Args - Training
Job stringName - Name of the training job. Must be between 1 and 63 characters, start with a letter or number, and contain only letters, numbers, and hyphens.
- Vpc
Config TrainingJob Vpc Config Args - VPC configuration for the training job. See
vpcConfigbelow.
- algorithm
Specification TrainingJob Algorithm Specification - Algorithm-related parameters of the training job. See
algorithmSpecificationbelow. Conflicts withserverlessJobConfig. - arn String
- ARN of the Training Job.
- checkpoint
Config TrainingJob Checkpoint Config - Location of checkpoints during training. See
checkpointConfigbelow. Conflicts withserverlessJobConfig. - debug
Hook TrainingConfig Job Debug Hook Config - Configuration for debugging rules. See
debugHookConfigbelow. Conflicts withserverlessJobConfig. - debug
Rule List<TrainingConfigurations Job Debug Rule Configuration> - List of debug rule configurations. Maximum of 20. See
debugRuleConfigurationsbelow. - delete
Model BooleanPackages On Destroy - Whether to delete model packages in the configured model package group when the training job is destroyed. Default is
false. - delete
Vpc BooleanEnis On Destroy - Whether to delete detached VPC ENIs SageMaker may leave behind when the training job is destroyed. Default is
false. - enable
Inter BooleanContainer Traffic Encryption - Whether to encrypt inter-container traffic. When enabled, communications between containers are encrypted.
- enable
Managed BooleanSpot Training - Whether to use managed spot training. Optimizes the cost of training by using Amazon EC2 Spot Instances. Conflicts with
serverlessJobConfig. - enable
Network BooleanIsolation - Whether to isolate the training container from the network. No inbound or outbound network calls can be made.
- environment Map<String,String>
- Map of environment variables to set in the training container. Maximum of 100 entries. Conflicts with
serverlessJobConfig. - experiment
Config TrainingJob Experiment Config - Associates a SageMaker AI Experiment or Trial to the training job. See
experimentConfigbelow. Conflicts withserverlessJobConfig. - hyper
Parameters Map<String,String> - Map of hyperparameters for the training algorithm. Maximum of 100 entries.
- infra
Check TrainingConfig Job Infra Check Config - Infrastructure health check configuration. See
infraCheckConfigbelow. - input
Data List<TrainingConfigs Job Input Data Config> - List of input data channel configurations for the training job. Maximum of 20. See
inputDataConfigbelow. - mlflow
Config TrainingJob Mlflow Config - MLflow integration configuration. See
mlflowConfigbelow. - model
Package TrainingConfig Job Model Package Config - Model package configuration. Requires
serverlessJobConfig. SeemodelPackageConfigbelow. - output
Data TrainingConfig Job Output Data Config - Location of the output data from the training job. See
outputDataConfig below. The following arguments are optional:
- profiler
Config TrainingJob Profiler Config - Configuration for the profiler. See
profilerConfigbelow. Conflicts withserverlessJobConfig. - profiler
Rule List<TrainingConfigurations Job Profiler Rule Configuration> - List of profiler rule configurations. Maximum of 20. See
profilerRuleConfigurationsbelow. Conflicts withserverlessJobConfig. - region String
- Region where this resource will be managed. Defaults to the Region set in the provider configuration.
- remote
Debug TrainingConfig Job Remote Debug Config - Configuration for remote debugging. See
remoteDebugConfigbelow. - resource
Config TrainingJob Resource Config - Resources for the training job, including compute instances and storage volumes. See
resourceConfigbelow. - retry
Strategy TrainingJob Retry Strategy - Number of times to retry the job if it fails. See
retryStrategy below. Conflicts with serverlessJobConfig. - role
Arn String - ARN of the IAM role that SageMaker AI assumes to perform tasks on your behalf during training.
- serverless
Job TrainingConfig Job Serverless Job Config - Configuration for serverless training jobs using foundation models. Conflicts with
algorithmSpecification, enableManagedSpotTraining, environment, retryStrategy, checkpointConfig, debugHookConfig, experimentConfig, profilerConfig, profilerRuleConfigurations, and tensorBoardOutputConfig. See serverlessJobConfig below. - session
Chaining TrainingConfig Job Session Chaining Config - Configuration for session tag chaining. See
sessionChainingConfig below. - stopping
Condition TrainingJob Stopping Condition - Map<String,String>
- Map of tags to assign to the resource. If configured with a provider
defaultTags configuration block present, tags with matching keys will overwrite those defined at the provider-level. - Map<String,String>
- Map of tags assigned to the resource, including those inherited from the provider
defaultTags configuration block. - tensor
Board TrainingOutput Config Job Tensor Board Output Config - Configuration for TensorBoard output. See
tensorBoardOutputConfig below. Conflicts with serverlessJobConfig. - timeouts
Training
Job Timeouts - training
Job StringName - Name of the training job. Must be between 1 and 63 characters, start with a letter or number, and contain only letters, numbers, and hyphens.
- vpc
Config TrainingJob Vpc Config - VPC configuration for the training job. See
vpcConfig below.
- algorithm
Specification TrainingJob Algorithm Specification - Algorithm-related parameters of the training job. See
algorithmSpecificationbelow. Conflicts withserverlessJobConfig. - arn string
- ARN of the Training Job.
- checkpoint
Config TrainingJob Checkpoint Config - Location of checkpoints during training. See
checkpointConfigbelow. Conflicts withserverlessJobConfig. - debug
Hook TrainingConfig Job Debug Hook Config - Configuration for debugging rules. See
debugHookConfigbelow. Conflicts withserverlessJobConfig. - debug
Rule TrainingConfigurations Job Debug Rule Configuration[] - List of debug rule configurations. Maximum of 20. See
debugRuleConfigurationsbelow. - delete
Model booleanPackages On Destroy - Whether to delete model packages in the configured model package group when the training job is destroyed. Default is
false. - delete
Vpc booleanEnis On Destroy - Whether to delete detached VPC ENIs SageMaker may leave behind when the training job is destroyed. Default is
false. - enable
Inter booleanContainer Traffic Encryption - Whether to encrypt inter-container traffic. When enabled, communications between containers are encrypted.
- enable
Managed booleanSpot Training - Whether to use managed spot training. Optimizes the cost of training by using Amazon EC2 Spot Instances. Conflicts with
serverlessJobConfig. - enable
Network booleanIsolation - Whether to isolate the training container from the network. No inbound or outbound network calls can be made.
- environment {[key: string]: string}
- Map of environment variables to set in the training container. Maximum of 100 entries. Conflicts with
serverlessJobConfig. - experiment
Config TrainingJob Experiment Config - Associates a SageMaker AI Experiment or Trial to the training job. See
experimentConfigbelow. Conflicts withserverlessJobConfig. - hyper
Parameters {[key: string]: string} - Map of hyperparameters for the training algorithm. Maximum of 100 entries.
- infra
Check TrainingConfig Job Infra Check Config - Infrastructure health check configuration. See
infraCheckConfigbelow. - input
Data TrainingConfigs Job Input Data Config[] - List of input data channel configurations for the training job. Maximum of 20. See
inputDataConfigbelow. - mlflow
Config TrainingJob Mlflow Config - MLflow integration configuration. See
mlflowConfigbelow. - model
Package TrainingConfig Job Model Package Config - Model package configuration. Requires
serverlessJobConfig. SeemodelPackageConfigbelow. - output
Data TrainingConfig Job Output Data Config Location of the output data from the training job. See
outputDataConfig below. The following arguments are optional:
- profiler
Config TrainingJob Profiler Config - Configuration for the profiler. See
profilerConfigbelow. Conflicts withserverlessJobConfig. - profiler
Rule TrainingConfigurations Job Profiler Rule Configuration[] - List of profiler rule configurations. Maximum of 20. See
profilerRuleConfigurationsbelow. Conflicts withserverlessJobConfig. - region string
- Region where this resource will be managed. Defaults to the Region set in the provider configuration.
- remote
Debug TrainingConfig Job Remote Debug Config - Configuration for remote debugging. See
remoteDebugConfigbelow. - resource
Config TrainingJob Resource Config - Resources for the training job, including compute instances and storage volumes. See
resourceConfigbelow. - retry
Strategy TrainingJob Retry Strategy - Number of times to retry the job if it fails. See
retryStrategybelow. Conflicts withserverlessJobConfig. - role
Arn string - ARN of the IAM role that SageMaker AI assumes to perform tasks on your behalf during training.
- serverless
Job TrainingConfig Job Serverless Job Config - Configuration for serverless training jobs using foundation models. Conflicts with
algorithmSpecification, enableManagedSpotTraining, environment, retryStrategy, checkpointConfig, debugHookConfig, experimentConfig, profilerConfig, profilerRuleConfigurations, and tensorBoardOutputConfig. See serverlessJobConfig below. - session
Chaining TrainingConfig Job Session Chaining Config - Configuration for session tag chaining. See
sessionChainingConfigbelow. - stopping
Condition TrainingJob Stopping Condition - {[key: string]: string}
- Map of tags to assign to the resource. If configured with a provider
defaultTagsconfiguration block present, tags with matching keys will overwrite those defined at the provider-level. - {[key: string]: string}
- Map of tags assigned to the resource, including those inherited from the provider
defaultTagsconfiguration block. - tensor
Board TrainingOutput Config Job Tensor Board Output Config - Configuration for TensorBoard output. See
tensorBoardOutputConfigbelow. Conflicts withserverlessJobConfig. - timeouts
Training
Job Timeouts - training
Job stringName - Name of the training job. Must be between 1 and 63 characters, start with a letter or number, and contain only letters, numbers, and hyphens.
- vpc
Config TrainingJob Vpc Config - VPC configuration for the training job. See
vpcConfigbelow.
- algorithm_
specification TrainingJob Algorithm Specification Args - Algorithm-related parameters of the training job. See
algorithmSpecificationbelow. Conflicts withserverlessJobConfig. - arn str
- ARN of the Training Job.
- checkpoint_
config TrainingJob Checkpoint Config Args - Location of checkpoints during training. See
checkpointConfigbelow. Conflicts withserverlessJobConfig. - debug_
hook_ Trainingconfig Job Debug Hook Config Args - Configuration for debugging rules. See
debugHookConfigbelow. Conflicts withserverlessJobConfig. - debug_
rule_ Sequence[Trainingconfigurations Job Debug Rule Configuration Args] - List of debug rule configurations. Maximum of 20. See
debugRuleConfigurationsbelow. - delete_
model_ boolpackages_ on_ destroy - Whether to delete model packages in the configured model package group when the training job is destroyed. Default is
false. - delete_
vpc_ boolenis_ on_ destroy - Whether to delete detached VPC ENIs SageMaker may leave behind when the training job is destroyed. Default is
false. - enable_
inter_ boolcontainer_ traffic_ encryption - Whether to encrypt inter-container traffic. When enabled, communications between containers are encrypted.
- enable_
managed_ boolspot_ training - Whether to use managed spot training. Optimizes the cost of training by using Amazon EC2 Spot Instances. Conflicts with
serverlessJobConfig. - enable_
network_ boolisolation - Whether to isolate the training container from the network. No inbound or outbound network calls can be made.
- environment Mapping[str, str]
- Map of environment variables to set in the training container. Maximum of 100 entries. Conflicts with
serverlessJobConfig. - experiment_
config TrainingJob Experiment Config Args - Associates a SageMaker AI Experiment or Trial to the training job. See
experimentConfigbelow. Conflicts withserverlessJobConfig. - hyper_
parameters Mapping[str, str] - Map of hyperparameters for the training algorithm. Maximum of 100 entries.
- infra_
check_ Trainingconfig Job Infra Check Config Args - Infrastructure health check configuration. See
infraCheckConfigbelow. - input_
data_ Sequence[Trainingconfigs Job Input Data Config Args] - List of input data channel configurations for the training job. Maximum of 20. See
inputDataConfigbelow. - mlflow_
config TrainingJob Mlflow Config Args - MLflow integration configuration. See
mlflowConfigbelow. - model_
package_ Trainingconfig Job Model Package Config Args - Model package configuration. Requires
serverlessJobConfig. SeemodelPackageConfigbelow. - output_
data_ Trainingconfig Job Output Data Config Args Location of the output data from the training job. See
outputDataConfig below. The following arguments are optional:
- profiler_
config TrainingJob Profiler Config Args - Configuration for the profiler. See
profilerConfigbelow. Conflicts withserverlessJobConfig. - profiler_
rule_ Sequence[Trainingconfigurations Job Profiler Rule Configuration Args] - List of profiler rule configurations. Maximum of 20. See
profilerRuleConfigurationsbelow. Conflicts withserverlessJobConfig. - region str
- Region where this resource will be managed. Defaults to the Region set in the provider configuration.
- remote_
debug_ Trainingconfig Job Remote Debug Config Args - Configuration for remote debugging. See
remoteDebugConfigbelow. - resource_
config TrainingJob Resource Config Args - Resources for the training job, including compute instances and storage volumes. See
resourceConfigbelow. - retry_
strategy TrainingJob Retry Strategy Args - Number of times to retry the job if it fails. See
retryStrategybelow. Conflicts withserverlessJobConfig. - role_
arn str - ARN of the IAM role that SageMaker AI assumes to perform tasks on your behalf during training.
- serverless_
job_ Trainingconfig Job Serverless Job Config Args - Configuration for serverless training jobs using foundation models. Conflicts with
algorithmSpecification, enableManagedSpotTraining, environment, retryStrategy, checkpointConfig, debugHookConfig, experimentConfig, profilerConfig, profilerRuleConfigurations, and tensorBoardOutputConfig. See serverlessJobConfig below. - session_
chaining_ Trainingconfig Job Session Chaining Config Args - Configuration for session tag chaining. See
sessionChainingConfigbelow. - stopping_
condition TrainingJob Stopping Condition Args - Mapping[str, str]
- Map of tags to assign to the resource. If configured with a provider
defaultTagsconfiguration block present, tags with matching keys will overwrite those defined at the provider-level. - Mapping[str, str]
- Map of tags assigned to the resource, including those inherited from the provider
defaultTagsconfiguration block. - tensor_
board_ Trainingoutput_ config Job Tensor Board Output Config Args - Configuration for TensorBoard output. See
tensorBoardOutputConfigbelow. Conflicts withserverlessJobConfig. - timeouts
Training
Job Timeouts Args - training_
job_ strname - Name of the training job. Must be between 1 and 63 characters, start with a letter or number, and contain only letters, numbers, and hyphens.
- vpc_
config TrainingJob Vpc Config Args - VPC configuration for the training job. See
vpcConfigbelow.
- algorithm
Specification Property Map - Algorithm-related parameters of the training job. See
algorithmSpecificationbelow. Conflicts withserverlessJobConfig. - arn String
- ARN of the Training Job.
- checkpoint
Config Property Map - Location of checkpoints during training. See
checkpointConfigbelow. Conflicts withserverlessJobConfig. - debug
Hook Property MapConfig - Configuration for debugging rules. See
debugHookConfigbelow. Conflicts withserverlessJobConfig. - debug
Rule List<Property Map>Configurations - List of debug rule configurations. Maximum of 20. See
debugRuleConfigurationsbelow. - delete
Model BooleanPackages On Destroy - Whether to delete model packages in the configured model package group when the training job is destroyed. Default is
false. - delete
Vpc BooleanEnis On Destroy - Whether to delete detached VPC ENIs SageMaker may leave behind when the training job is destroyed. Default is
false. - enable
Inter BooleanContainer Traffic Encryption - Whether to encrypt inter-container traffic. When enabled, communications between containers are encrypted.
- enable
Managed BooleanSpot Training - Whether to use managed spot training. Optimizes the cost of training by using Amazon EC2 Spot Instances. Conflicts with
serverlessJobConfig. - enable
Network BooleanIsolation - Whether to isolate the training container from the network. No inbound or outbound network calls can be made.
- environment Map<String>
- Map of environment variables to set in the training container. Maximum of 100 entries. Conflicts with
serverlessJobConfig. - experiment
Config Property Map - Associates a SageMaker AI Experiment or Trial to the training job. See
experimentConfigbelow. Conflicts withserverlessJobConfig. - hyper
Parameters Map<String> - Map of hyperparameters for the training algorithm. Maximum of 100 entries.
- infra
Check Property MapConfig - Infrastructure health check configuration. See
infraCheckConfigbelow. - input
Data List<Property Map>Configs - List of input data channel configurations for the training job. Maximum of 20. See
inputDataConfigbelow. - mlflow
Config Property Map - MLflow integration configuration. See
mlflowConfigbelow. - model
Package Property MapConfig - Model package configuration. Requires
serverlessJobConfig. SeemodelPackageConfigbelow. - output
Data Property MapConfig Location of the output data from the training job. See
outputDataConfig below. The following arguments are optional:
- profiler
Config Property Map - Configuration for the profiler. See
profilerConfigbelow. Conflicts withserverlessJobConfig. - profiler
Rule List<Property Map>Configurations - List of profiler rule configurations. Maximum of 20. See
profilerRuleConfigurationsbelow. Conflicts withserverlessJobConfig. - region String
- Region where this resource will be managed. Defaults to the Region set in the provider configuration.
- remote
Debug Property MapConfig - Configuration for remote debugging. See
remoteDebugConfigbelow. - resource
Config Property Map - Resources for the training job, including compute instances and storage volumes. See
resourceConfigbelow. - retry
Strategy Property Map - Number of times to retry the job if it fails. See
retryStrategybelow. Conflicts withserverlessJobConfig. - role
Arn String - ARN of the IAM role that SageMaker AI assumes to perform tasks on your behalf during training.
- serverless
Job Property MapConfig - Configuration for serverless training jobs using foundation models. Conflicts with
algorithmSpecification, enableManagedSpotTraining, environment, retryStrategy, checkpointConfig, debugHookConfig, experimentConfig, profilerConfig, profilerRuleConfigurations, and tensorBoardOutputConfig. See serverlessJobConfig below. - session
Chaining Property MapConfig - Configuration for session tag chaining. See
sessionChainingConfigbelow. - stopping
Condition Property Map - Map<String>
- Map of tags to assign to the resource. If configured with a provider
defaultTagsconfiguration block present, tags with matching keys will overwrite those defined at the provider-level. - Map<String>
- Map of tags assigned to the resource, including those inherited from the provider
defaultTagsconfiguration block. - tensor
Board Property MapOutput Config - Configuration for TensorBoard output. See
tensorBoardOutputConfigbelow. Conflicts withserverlessJobConfig. - timeouts Property Map
- training
Job StringName - Name of the training job. Must be between 1 and 63 characters, start with a letter or number, and contain only letters, numbers, and hyphens.
- vpc
Config Property Map - VPC configuration for the training job. See
vpcConfigbelow.
Supporting Types
TrainingJobAlgorithmSpecification, TrainingJobAlgorithmSpecificationArgs
- Algorithm
Name string - Name or ARN of the algorithm resource to use for the training job.
- Container
Arguments List<string> - List of arguments for the container entrypoint. Maximum of 100 entries.
- Container
Entrypoints List<string> - List of entrypoint commands for the container. Maximum of 100 entries.
- Enable
Sagemaker boolMetrics Time Series - Whether to enable SageMaker AI metrics time series collection.
- Metric
Definitions List<TrainingJob Algorithm Specification Metric Definition> - List of metric definitions for the training job. Maximum of 40. Use this to extract custom metrics from your own training container logs. SageMaker can still publish built-in metrics for built-in algorithms and supported prebuilt images when this block is omitted. See
metricDefinitionsbelow. - Training
Image string - Registry path of the Docker image that contains the training algorithm.
- Training
Image TrainingConfig Job Algorithm Specification Training Image Config - Training image configuration. See
trainingImageConfigbelow. - Training
Input stringMode - Input mode for the training data. Valid values:
File, Pipe, FastFile.
- Algorithm
Name string - Name or ARN of the algorithm resource to use for the training job.
- Container
Arguments []string - List of arguments for the container entrypoint. Maximum of 100 entries.
- Container
Entrypoints []string - List of entrypoint commands for the container. Maximum of 100 entries.
- Enable
Sagemaker boolMetrics Time Series - Whether to enable SageMaker AI metrics time series collection.
- Metric
Definitions []TrainingJob Algorithm Specification Metric Definition - List of metric definitions for the training job. Maximum of 40. Use this to extract custom metrics from your own training container logs. SageMaker can still publish built-in metrics for built-in algorithms and supported prebuilt images when this block is omitted. See
metricDefinitionsbelow. - Training
Image string - Registry path of the Docker image that contains the training algorithm.
- Training
Image TrainingConfig Job Algorithm Specification Training Image Config - Training image configuration. See
trainingImageConfigbelow. - Training
Input stringMode - Input mode for the training data. Valid values:
File, Pipe, FastFile.
- algorithm
Name String - Name or ARN of the algorithm resource to use for the training job.
- container
Arguments List<String> - List of arguments for the container entrypoint. Maximum of 100 entries.
- container
Entrypoints List<String> - List of entrypoint commands for the container. Maximum of 100 entries.
- enable
Sagemaker BooleanMetrics Time Series - Whether to enable SageMaker AI metrics time series collection.
- metric
Definitions List<TrainingJob Algorithm Specification Metric Definition> - List of metric definitions for the training job. Maximum of 40. Use this to extract custom metrics from your own training container logs. SageMaker can still publish built-in metrics for built-in algorithms and supported prebuilt images when this block is omitted. See
metricDefinitionsbelow. - training
Image String - Registry path of the Docker image that contains the training algorithm.
- training
Image TrainingConfig Job Algorithm Specification Training Image Config - Training image configuration. See
trainingImageConfigbelow. - training
Input StringMode - Input mode for the training data. Valid values:
File, Pipe, FastFile.
- algorithm
Name string - Name or ARN of the algorithm resource to use for the training job.
- container
Arguments string[] - List of arguments for the container entrypoint. Maximum of 100 entries.
- container
Entrypoints string[] - List of entrypoint commands for the container. Maximum of 100 entries.
- enable
Sagemaker booleanMetrics Time Series - Whether to enable SageMaker AI metrics time series collection.
- metric
Definitions TrainingJob Algorithm Specification Metric Definition[] - List of metric definitions for the training job. Maximum of 40. Use this to extract custom metrics from your own training container logs. SageMaker can still publish built-in metrics for built-in algorithms and supported prebuilt images when this block is omitted. See
metricDefinitionsbelow. - training
Image string - Registry path of the Docker image that contains the training algorithm.
- training
Image TrainingConfig Job Algorithm Specification Training Image Config - Training image configuration. See
trainingImageConfigbelow. - training
Input stringMode - Input mode for the training data. Valid values:
File, Pipe, FastFile.
- algorithm_
name str - Name or ARN of the algorithm resource to use for the training job.
- container_
arguments Sequence[str] - List of arguments for the container entrypoint. Maximum of 100 entries.
- container_
entrypoints Sequence[str] - List of entrypoint commands for the container. Maximum of 100 entries.
- enable_
sagemaker_ boolmetrics_ time_ series - Whether to enable SageMaker AI metrics time series collection.
- metric_
definitions Sequence[TrainingJob Algorithm Specification Metric Definition] - List of metric definitions for the training job. Maximum of 40. Use this to extract custom metrics from your own training container logs. SageMaker can still publish built-in metrics for built-in algorithms and supported prebuilt images when this block is omitted. See
metricDefinitionsbelow. - training_
image str - Registry path of the Docker image that contains the training algorithm.
- training_
image_ Trainingconfig Job Algorithm Specification Training Image Config - Training image configuration. See
trainingImageConfigbelow. - training_
input_ strmode - Input mode for the training data. Valid values:
File, Pipe, FastFile.
- algorithm
Name String - Name or ARN of the algorithm resource to use for the training job.
- container
Arguments List<String> - List of arguments for the container entrypoint. Maximum of 100 entries.
- container
Entrypoints List<String> - List of entrypoint commands for the container. Maximum of 100 entries.
- enable
Sagemaker BooleanMetrics Time Series - Whether to enable SageMaker AI metrics time series collection.
- metric
Definitions List<Property Map> - List of metric definitions for the training job. Maximum of 40. Use this to extract custom metrics from your own training container logs. SageMaker can still publish built-in metrics for built-in algorithms and supported prebuilt images when this block is omitted. See
metricDefinitionsbelow. - training
Image String - Registry path of the Docker image that contains the training algorithm.
- training
Image Property MapConfig - Training image configuration. See
trainingImageConfigbelow. - training
Input StringMode - Input mode for the training data. Valid values:
File, Pipe, FastFile.
TrainingJobAlgorithmSpecificationMetricDefinition, TrainingJobAlgorithmSpecificationMetricDefinitionArgs
TrainingJobAlgorithmSpecificationTrainingImageConfig, TrainingJobAlgorithmSpecificationTrainingImageConfigArgs
- Training
Repository stringAccess Mode - Access mode for the training image repository.
- Training
Repository TrainingAuth Config Job Algorithm Specification Training Image Config Training Repository Auth Config - Authentication configuration for the training image repository. See
trainingRepositoryAuthConfigbelow.
- Training
Repository stringAccess Mode - Access mode for the training image repository.
- Training
Repository TrainingAuth Config Job Algorithm Specification Training Image Config Training Repository Auth Config - Authentication configuration for the training image repository. See
trainingRepositoryAuthConfigbelow.
- training
Repository StringAccess Mode - Access mode for the training image repository.
- training
Repository TrainingAuth Config Job Algorithm Specification Training Image Config Training Repository Auth Config - Authentication configuration for the training image repository. See
trainingRepositoryAuthConfigbelow.
- training
Repository stringAccess Mode - Access mode for the training image repository.
- training
Repository TrainingAuth Config Job Algorithm Specification Training Image Config Training Repository Auth Config - Authentication configuration for the training image repository. See
trainingRepositoryAuthConfigbelow.
- training_
repository_ straccess_ mode - Access mode for the training image repository.
- training_
repository_ Trainingauth_ config Job Algorithm Specification Training Image Config Training Repository Auth Config - Authentication configuration for the training image repository. See
trainingRepositoryAuthConfigbelow.
- training
Repository StringAccess Mode - Access mode for the training image repository.
- training
Repository Property MapAuth Config - Authentication configuration for the training image repository. See
trainingRepositoryAuthConfigbelow.
TrainingJobAlgorithmSpecificationTrainingImageConfigTrainingRepositoryAuthConfig, TrainingJobAlgorithmSpecificationTrainingImageConfigTrainingRepositoryAuthConfigArgs
- Training
Repository stringCredentials Provider Arn - ARN of the Lambda function that provides credentials to authenticate to the private Docker registry.
- Training
Repository stringCredentials Provider Arn - ARN of the Lambda function that provides credentials to authenticate to the private Docker registry.
- training
Repository StringCredentials Provider Arn - ARN of the Lambda function that provides credentials to authenticate to the private Docker registry.
- training
Repository stringCredentials Provider Arn - ARN of the Lambda function that provides credentials to authenticate to the private Docker registry.
- training_
repository_ strcredentials_ provider_ arn - ARN of the Lambda function that provides credentials to authenticate to the private Docker registry.
- training
Repository StringCredentials Provider Arn - ARN of the Lambda function that provides credentials to authenticate to the private Docker registry.
TrainingJobCheckpointConfig, TrainingJobCheckpointConfigArgs
- s3_
uri str - S3 URI where checkpoints are stored.
- local_
path str - Local path where checkpoints are written.
TrainingJobDebugHookConfig, TrainingJobDebugHookConfigArgs
- S3Output
Path string - S3 URI where debug output is stored.
- Collection
Configurations List<TrainingJob Debug Hook Config Collection Configuration> - List of tensor collections to configure for the debug hook. Maximum of 20. See
collectionConfigurationsbelow. - Hook
Parameters Dictionary<string, string> - Map of parameters for the debug hook. Maximum of 20 entries.
- Local
Path string - Local path where debug output is written.
- S3Output
Path string - S3 URI where debug output is stored.
- Collection
Configurations []TrainingJob Debug Hook Config Collection Configuration - List of tensor collections to configure for the debug hook. Maximum of 20. See
collectionConfigurationsbelow. - Hook
Parameters map[string]string - Map of parameters for the debug hook. Maximum of 20 entries.
- Local
Path string - Local path where debug output is written.
- s3Output
Path String - S3 URI where debug output is stored.
- collection
Configurations List<TrainingJob Debug Hook Config Collection Configuration> - List of tensor collections to configure for the debug hook. Maximum of 20. See
collectionConfigurationsbelow. - hook
Parameters Map<String,String> - Map of parameters for the debug hook. Maximum of 20 entries.
- local
Path String - Local path where debug output is written.
- s3Output
Path string - S3 URI where debug output is stored.
- collection
Configurations TrainingJob Debug Hook Config Collection Configuration[] - List of tensor collections to configure for the debug hook. Maximum of 20. See
collectionConfigurationsbelow. - hook
Parameters {[key: string]: string} - Map of parameters for the debug hook. Maximum of 20 entries.
- local
Path string - Local path where debug output is written.
- s3_
output_ strpath - S3 URI where debug output is stored.
- collection_
configurations Sequence[TrainingJob Debug Hook Config Collection Configuration] - List of tensor collections to configure for the debug hook. Maximum of 20. See
collectionConfigurationsbelow. - hook_
parameters Mapping[str, str] - Map of parameters for the debug hook. Maximum of 20 entries.
- local_
path str - Local path where debug output is written.
- s3Output
Path String - S3 URI where debug output is stored.
- collection
Configurations List<Property Map> - List of tensor collections to configure for the debug hook. Maximum of 20. See
collectionConfigurationsbelow. - hook
Parameters Map<String> - Map of parameters for the debug hook. Maximum of 20 entries.
- local
Path String - Local path where debug output is written.
TrainingJobDebugHookConfigCollectionConfiguration, TrainingJobDebugHookConfigCollectionConfigurationArgs
- Collection
Name string - Name of the tensor collection.
- Collection
Parameters Dictionary<string, string> - Map of parameters for the tensor collection.
- Collection
Name string - Name of the tensor collection.
- Collection
Parameters map[string]string - Map of parameters for the tensor collection.
- collection
Name String - Name of the tensor collection.
- collection
Parameters Map<String,String> - Map of parameters for the tensor collection.
- collection
Name string - Name of the tensor collection.
- collection
Parameters {[key: string]: string} - Map of parameters for the tensor collection.
- collection_
name str - Name of the tensor collection.
- collection_
parameters Mapping[str, str] - Map of parameters for the tensor collection.
- collection
Name String - Name of the tensor collection.
- collection
Parameters Map<String> - Map of parameters for the tensor collection.
TrainingJobDebugRuleConfiguration, TrainingJobDebugRuleConfigurationArgs
- Rule
Configuration stringName - Name of the rule configuration. Must be between 1 and 256 characters.
- Rule
Evaluator stringImage - Docker image URI for the rule evaluator.
- Instance
Type string - Instance type to deploy for the debug rule evaluation. Valid values are SageMaker AI processing instance types.
- Local
Path string - Local path where debug rule output is written.
- Rule
Parameters Dictionary<string, string> - Map of parameters for the rule configuration. Maximum of 100 entries.
- S3Output
Path string - S3 URI where rule output is stored.
- Volume
Size intIn Gb - Size of the storage volume for the rule evaluator, in GB.
- Rule
Configuration stringName - Name of the rule configuration. Must be between 1 and 256 characters.
- Rule
Evaluator stringImage - Docker image URI for the rule evaluator.
- Instance
Type string - Instance type to deploy for the debug rule evaluation. Valid values are SageMaker AI processing instance types.
- Local
Path string - Local path where debug rule output is written.
- Rule
Parameters map[string]string - Map of parameters for the rule configuration. Maximum of 100 entries.
- S3Output
Path string - S3 URI where rule output is stored.
- Volume
Size intIn Gb - Size of the storage volume for the rule evaluator, in GB.
- rule
Configuration StringName - Name of the rule configuration. Must be between 1 and 256 characters.
- rule
Evaluator StringImage - Docker image URI for the rule evaluator.
- instance
Type String - Instance type to deploy for the debug rule evaluation. Valid values are SageMaker AI processing instance types.
- local
Path String - Local path where debug rule output is written.
- rule
Parameters Map<String,String> - Map of parameters for the rule configuration. Maximum of 100 entries.
- s3Output
Path String - S3 URI where rule output is stored.
- volume
Size IntegerIn Gb - Size of the storage volume for the rule evaluator, in GB.
- rule
Configuration stringName - Name of the rule configuration. Must be between 1 and 256 characters.
- rule
Evaluator stringImage - Docker image URI for the rule evaluator.
- instance
Type string - Instance type to deploy for the debug rule evaluation. Valid values are SageMaker AI processing instance types.
- local
Path string - Local path where debug rule output is written.
- rule
Parameters {[key: string]: string} - Map of parameters for the rule configuration. Maximum of 100 entries.
- s3Output
Path string - S3 URI where rule output is stored.
- volume
Size numberIn Gb - Size of the storage volume for the rule evaluator, in GB.
- rule_
configuration_ strname - Name of the rule configuration. Must be between 1 and 256 characters.
- rule_
evaluator_ strimage - Docker image URI for the rule evaluator.
- instance_
type str - Instance type to deploy for the debug rule evaluation. Valid values are SageMaker AI processing instance types.
- local_
path str - Local path where debug rule output is written.
- rule_
parameters Mapping[str, str] - Map of parameters for the rule configuration. Maximum of 100 entries.
- s3_
output_ strpath - S3 URI where rule output is stored.
- volume_
size_ intin_ gb - Size of the storage volume for the rule evaluator, in GB.
- rule
Configuration StringName - Name of the rule configuration. Must be between 1 and 256 characters.
- rule
Evaluator StringImage - Docker image URI for the rule evaluator.
- instance
Type String - Instance type to deploy for the debug rule evaluation. Valid values are SageMaker AI processing instance types.
- local
Path String - Local path where debug rule output is written.
- rule
Parameters Map<String> - Map of parameters for the rule configuration. Maximum of 100 entries.
- s3Output
Path String - S3 URI where rule output is stored.
- volume
Size NumberIn Gb - Size of the storage volume for the rule evaluator, in GB.
TrainingJobExperimentConfig, TrainingJobExperimentConfigArgs
- Experiment
Name string - Name of the SageMaker AI Experiment to associate with.
- Run
Name string - Name of the Experiment Run to associate with.
- Trial
Component stringDisplay Name - Display name for the trial component.
- Trial
Name string - Name of the SageMaker AI Trial to associate with.
- Experiment
Name string - Name of the SageMaker AI Experiment to associate with.
- Run
Name string - Name of the Experiment Run to associate with.
- Trial
Component stringDisplay Name - Display name for the trial component.
- Trial
Name string - Name of the SageMaker AI Trial to associate with.
- experiment
Name String - Name of the SageMaker AI Experiment to associate with.
- run
Name String - Name of the Experiment Run to associate with.
- trial
Component StringDisplay Name - Display name for the trial component.
- trial
Name String - Name of the SageMaker AI Trial to associate with.
- experiment
Name string - Name of the SageMaker AI Experiment to associate with.
- run
Name string - Name of the Experiment Run to associate with.
- trial
Component stringDisplay Name - Display name for the trial component.
- trial
Name string - Name of the SageMaker AI Trial to associate with.
- experiment_
name str - Name of the SageMaker AI Experiment to associate with.
- run_
name str - Name of the Experiment Run to associate with.
- trial_
component_ strdisplay_ name - Display name for the trial component.
- trial_
name str - Name of the SageMaker AI Trial to associate with.
- experiment
Name String - Name of the SageMaker AI Experiment to associate with.
- run
Name String - Name of the Experiment Run to associate with.
- trial
Component StringDisplay Name - Display name for the trial component.
- trial
Name String - Name of the SageMaker AI Trial to associate with.
TrainingJobInfraCheckConfig, TrainingJobInfraCheckConfigArgs
- Enable
Infra boolCheck - Whether to enable infrastructure health checks before training.
- Enable
Infra boolCheck - Whether to enable infrastructure health checks before training.
- enable
Infra BooleanCheck - Whether to enable infrastructure health checks before training.
- enable
Infra booleanCheck - Whether to enable infrastructure health checks before training.
- enable_
infra_ boolcheck - Whether to enable infrastructure health checks before training.
- enable
Infra BooleanCheck - Whether to enable infrastructure health checks before training.
TrainingJobInputDataConfig, TrainingJobInputDataConfigArgs
- ChannelName string - Name of the channel. Must be between 1 and 64 characters.
- CompressionType string - Compression type for the input data. Valid values: `None`, `Gzip`.
- ContentType string - MIME type of the input data.
- DataSource TrainingJobInputDataConfigDataSource - Location of the channel data. See `dataSource` below.
- InputMode string - Input mode for the channel data. Valid values: `File`, `Pipe`, `FastFile`.
- RecordWrapperType string - Record wrapper type. Valid values: `None`, `RecordIO`.
- ShuffleConfig TrainingJobInputDataConfigShuffleConfig - Configuration for shuffling data in the channel. See `shuffleConfig` below.
- Channel
Name string - Name of the channel. Must be between 1 and 64 characters.
- Compression
Type string - Compression type for the input data. Valid values:
None,Gzip. - Content
Type string - MIME type of the input data.
- Data
Source TrainingJob Input Data Config Data Source - Location of the channel data. See
dataSourcebelow. - Input
Mode string - Input mode for the channel data. Valid values:
File,Pipe,FastFile. - Record
Wrapper stringType - Record wrapper type. Valid values:
None,RecordIO. - Shuffle
Config TrainingJob Input Data Config Shuffle Config - Configuration for shuffling data in the channel. See
shuffleConfigbelow.
- channel
Name String - Name of the channel. Must be between 1 and 64 characters.
- compression
Type String - Compression type for the input data. Valid values:
None,Gzip. - content
Type String - MIME type of the input data.
- data
Source TrainingJob Input Data Config Data Source - Location of the channel data. See
dataSourcebelow. - input
Mode String - Input mode for the channel data. Valid values:
File,Pipe,FastFile. - record
Wrapper StringType - Record wrapper type. Valid values:
None,RecordIO. - shuffle
Config TrainingJob Input Data Config Shuffle Config - Configuration for shuffling data in the channel. See
shuffleConfigbelow.
- channel
Name string - Name of the channel. Must be between 1 and 64 characters.
- compression
Type string - Compression type for the input data. Valid values:
None,Gzip. - content
Type string - MIME type of the input data.
- data
Source TrainingJob Input Data Config Data Source - Location of the channel data. See
dataSourcebelow. - input
Mode string - Input mode for the channel data. Valid values:
File,Pipe,FastFile. - record
Wrapper stringType - Record wrapper type. Valid values:
None,RecordIO. - shuffle
Config TrainingJob Input Data Config Shuffle Config - Configuration for shuffling data in the channel. See
shuffleConfigbelow.
- channel_name str - Name of the channel. Must be between 1 and 64 characters.
- compression_type str - Compression type for the input data. Valid values: `None`, `Gzip`.
- content_type str - MIME type of the input data.
- data_source TrainingJobInputDataConfigDataSource - Location of the channel data. See `dataSource` below.
- input_mode str - Input mode for the channel data. Valid values: `File`, `Pipe`, `FastFile`.
- record_wrapper_type str - Record wrapper type. Valid values: `None`, `RecordIO`.
- shuffle_config TrainingJobInputDataConfigShuffleConfig - Configuration for shuffling data in the channel. See `shuffleConfig` below.
- channel
Name String - Name of the channel. Must be between 1 and 64 characters.
- compression
Type String - Compression type for the input data. Valid values:
None,Gzip. - content
Type String - MIME type of the input data.
- data
Source Property Map - Location of the channel data. See
dataSourcebelow. - input
Mode String - Input mode for the channel data. Valid values:
File,Pipe,FastFile. - record
Wrapper StringType - Record wrapper type. Valid values:
None,RecordIO. - shuffle
Config Property Map - Configuration for shuffling data in the channel. See
shuffleConfigbelow.
TrainingJobInputDataConfigDataSource, TrainingJobInputDataConfigDataSourceArgs
- FileSystemDataSource TrainingJobInputDataConfigDataSourceFileSystemDataSource - File system data source. See `fileSystemDataSource` below.
- S3DataSource TrainingJobInputDataConfigDataSourceS3DataSource - S3 data source. See `s3DataSource` below.
- File
System TrainingData Source Job Input Data Config Data Source File System Data Source - File system data source. See
fileSystemDataSourcebelow. - S3Data
Source TrainingJob Input Data Config Data Source S3Data Source - S3 data source. See
s3DataSourcebelow.
- file
System TrainingData Source Job Input Data Config Data Source File System Data Source - File system data source. See
fileSystemDataSourcebelow. - s3Data
Source TrainingJob Input Data Config Data Source S3Data Source - S3 data source. See
s3DataSourcebelow.
- file
System TrainingData Source Job Input Data Config Data Source File System Data Source - File system data source. See
fileSystemDataSourcebelow. - s3Data
Source TrainingJob Input Data Config Data Source S3Data Source - S3 data source. See
s3DataSourcebelow.
- file_system_data_source TrainingJobInputDataConfigDataSourceFileSystemDataSource - File system data source. See `fileSystemDataSource` below.
- s3_data_source TrainingJobInputDataConfigDataSourceS3DataSource - S3 data source. See `s3DataSource` below.
- fileSystemDataSource Property Map - File system data source. See `fileSystemDataSource` below.
- s3DataSource Property Map - S3 data source. See `s3DataSource` below.
TrainingJobInputDataConfigDataSourceFileSystemDataSource, TrainingJobInputDataConfigDataSourceFileSystemDataSourceArgs
- DirectoryPath string - Full path to the directory on the file system.
- FileSystemAccessMode string - Access mode for the file system. Valid values: `ro`, `rw`.
- FileSystemId string - File system ID.
- FileSystemType string - File system type. Valid values: `EFS`, `FSxLustre`.
- Directory
Path string - Full path to the directory on the file system.
- File
System stringAccess Mode - Access mode for the file system. Valid values:
ro,rw. - File
System stringId - File system ID.
- File
System stringType - File system type. Valid values:
EFS,FSxLustre.
- directory
Path String - Full path to the directory on the file system.
- file
System StringAccess Mode - Access mode for the file system. Valid values:
ro,rw. - file
System StringId - File system ID.
- file
System StringType - File system type. Valid values:
EFS,FSxLustre.
- directory
Path string - Full path to the directory on the file system.
- file
System stringAccess Mode - Access mode for the file system. Valid values:
ro,rw. - file
System stringId - File system ID.
- file
System stringType - File system type. Valid values:
EFS,FSxLustre.
- directory_
path str - Full path to the directory on the file system.
- file_
system_ straccess_ mode - Access mode for the file system. Valid values:
ro,rw. - file_
system_ strid - File system ID.
- file_
system_ strtype - File system type. Valid values:
EFS,FSxLustre.
- directory
Path String - Full path to the directory on the file system.
- file
System StringAccess Mode - Access mode for the file system. Valid values:
ro,rw. - file
System StringId - File system ID.
- file
System StringType - File system type. Valid values:
EFS,FSxLustre.
TrainingJobInputDataConfigDataSourceS3DataSource, TrainingJobInputDataConfigDataSourceS3DataSourceArgs
- S3DataType string - S3 data type. Valid values: `ManifestFile`, `S3Prefix`, `AugmentedManifestFile`.
- S3Uri string - S3 URI of the data.
- AttributeNames List<string> - List of attribute names to include in the training dataset. Maximum of 16.
- HubAccessConfig TrainingJobInputDataConfigDataSourceS3DataSourceHubAccessConfig - SageMaker AI Hub access configuration. See `hubAccessConfig` below.
- InstanceGroupNames List<string> - List of instance group names for the training data distribution. Maximum of 5.
- ModelAccessConfig TrainingJobInputDataConfigDataSourceS3DataSourceModelAccessConfig - Model access configuration. See `modelAccessConfig` below.
- S3DataDistributionType string - Distribution type for S3 data. Valid values: `FullyReplicated`, `ShardedByS3Key`.
- S3Data
Type string - S3 data type. Valid values:
ManifestFile,S3Prefix,AugmentedManifestFile. - S3Uri string
- S3 URI of the data.
- Attribute
Names []string - List of attribute names to include in the training dataset. Maximum of 16.
- Hub
Access TrainingConfig Job Input Data Config Data Source S3Data Source Hub Access Config - SageMaker AI Hub access configuration. See
hubAccessConfigbelow. - Instance
Group []stringNames - List of instance group names for the training data distribution. Maximum of 5.
- Model
Access TrainingConfig Job Input Data Config Data Source S3Data Source Model Access Config - Model access configuration. See
modelAccessConfigbelow. - S3Data
Distribution stringType - Distribution type for S3 data. Valid values:
FullyReplicated,ShardedByS3Key.
- s3Data
Type String - S3 data type. Valid values:
ManifestFile,S3Prefix,AugmentedManifestFile. - s3Uri String
- S3 URI of the data.
- attribute
Names List<String> - List of attribute names to include in the training dataset. Maximum of 16.
- hub
Access TrainingConfig Job Input Data Config Data Source S3Data Source Hub Access Config - SageMaker AI Hub access configuration. See
hubAccessConfigbelow. - instance
Group List<String>Names - List of instance group names for the training data distribution. Maximum of 5.
- model
Access TrainingConfig Job Input Data Config Data Source S3Data Source Model Access Config - Model access configuration. See
modelAccessConfigbelow. - s3Data
Distribution StringType - Distribution type for S3 data. Valid values:
FullyReplicated,ShardedByS3Key.
- s3Data
Type string - S3 data type. Valid values:
ManifestFile,S3Prefix,AugmentedManifestFile. - s3Uri string
- S3 URI of the data.
- attribute
Names string[] - List of attribute names to include in the training dataset. Maximum of 16.
- hub
Access TrainingConfig Job Input Data Config Data Source S3Data Source Hub Access Config - SageMaker AI Hub access configuration. See
hubAccessConfigbelow. - instance
Group string[]Names - List of instance group names for the training data distribution. Maximum of 5.
- model
Access TrainingConfig Job Input Data Config Data Source S3Data Source Model Access Config - Model access configuration. See
modelAccessConfigbelow. - s3Data
Distribution stringType - Distribution type for S3 data. Valid values:
FullyReplicated,ShardedByS3Key.
- s3_data_type str - S3 data type. Valid values: `ManifestFile`, `S3Prefix`, `AugmentedManifestFile`.
- s3_uri str - S3 URI of the data.
- attribute_names Sequence[str] - List of attribute names to include in the training dataset. Maximum of 16.
- hub_access_config TrainingJobInputDataConfigDataSourceS3DataSourceHubAccessConfig - SageMaker AI Hub access configuration. See `hubAccessConfig` below.
- instance_group_names Sequence[str] - List of instance group names for the training data distribution. Maximum of 5.
- model_access_config TrainingJobInputDataConfigDataSourceS3DataSourceModelAccessConfig - Model access configuration. See `modelAccessConfig` below.
- s3_data_distribution_type str - Distribution type for S3 data. Valid values: `FullyReplicated`, `ShardedByS3Key`.
- s3DataType String - S3 data type. Valid values: `ManifestFile`, `S3Prefix`, `AugmentedManifestFile`.
- s3Uri String - S3 URI of the data.
- attributeNames List<String> - List of attribute names to include in the training dataset. Maximum of 16.
- hubAccessConfig Property Map - SageMaker AI Hub access configuration. See `hubAccessConfig` below.
- instanceGroupNames List<String> - List of instance group names for the training data distribution. Maximum of 5.
- modelAccessConfig Property Map - Model access configuration. See `modelAccessConfig` below.
- s3DataDistributionType String - Distribution type for S3 data. Valid values: `FullyReplicated`, `ShardedByS3Key`.
TrainingJobInputDataConfigDataSourceS3DataSourceHubAccessConfig, TrainingJobInputDataConfigDataSourceS3DataSourceHubAccessConfigArgs
- Hub
Content stringArn - ARN of the hub content.
- Hub
Content stringArn - ARN of the hub content.
- hub
Content StringArn - ARN of the hub content.
- hub
Content stringArn - ARN of the hub content.
- hub_
content_ strarn - ARN of the hub content.
- hub
Content StringArn - ARN of the hub content.
TrainingJobInputDataConfigDataSourceS3DataSourceModelAccessConfig, TrainingJobInputDataConfigDataSourceS3DataSourceModelAccessConfigArgs
- Accept
Eula bool - Whether to accept the model EULA.
- Accept
Eula bool - Whether to accept the model EULA.
- accept
Eula Boolean - Whether to accept the model EULA.
- accept
Eula boolean - Whether to accept the model EULA.
- accept_
eula bool - Whether to accept the model EULA.
- accept
Eula Boolean - Whether to accept the model EULA.
TrainingJobInputDataConfigShuffleConfig, TrainingJobInputDataConfigShuffleConfigArgs
- Seed int - Seed value used to shuffle the training data.
- Seed int - Seed value used to shuffle the training data.
- seed Integer - Seed value used to shuffle the training data.
- seed number - Seed value used to shuffle the training data.
- seed int - Seed value used to shuffle the training data.
- seed Number - Seed value used to shuffle the training data.
TrainingJobMlflowConfig, TrainingJobMlflowConfigArgs
- Mlflow
Resource stringArn - ARN of the MLflow tracking server.
- Mlflow
Experiment stringName - Name of the MLflow experiment.
- Mlflow
Run stringName - Name of the MLflow run.
- Mlflow
Resource stringArn - ARN of the MLflow tracking server.
- Mlflow
Experiment stringName - Name of the MLflow experiment.
- Mlflow
Run stringName - Name of the MLflow run.
- mlflow
Resource StringArn - ARN of the MLflow tracking server.
- mlflow
Experiment StringName - Name of the MLflow experiment.
- mlflow
Run StringName - Name of the MLflow run.
- mlflow
Resource stringArn - ARN of the MLflow tracking server.
- mlflow
Experiment stringName - Name of the MLflow experiment.
- mlflow
Run stringName - Name of the MLflow run.
- mlflow_
resource_ strarn - ARN of the MLflow tracking server.
- mlflow_
experiment_ strname - Name of the MLflow experiment.
- mlflow_
run_ strname - Name of the MLflow run.
- mlflow
Resource StringArn - ARN of the MLflow tracking server.
- mlflow
Experiment StringName - Name of the MLflow experiment.
- mlflow
Run StringName - Name of the MLflow run.
TrainingJobModelPackageConfig, TrainingJobModelPackageConfigArgs
- Model
Package stringGroup Arn - ARN of the model package group.
- Source
Model stringPackage Arn - ARN of the source model package.
- Model
Package stringGroup Arn - ARN of the model package group.
- Source
Model stringPackage Arn - ARN of the source model package.
- model
Package StringGroup Arn - ARN of the model package group.
- source
Model StringPackage Arn - ARN of the source model package.
- model
Package stringGroup Arn - ARN of the model package group.
- source
Model stringPackage Arn - ARN of the source model package.
- model_
package_ strgroup_ arn - ARN of the model package group.
- source_
model_ strpackage_ arn - ARN of the source model package.
- model
Package StringGroup Arn - ARN of the model package group.
- source
Model StringPackage Arn - ARN of the source model package.
TrainingJobOutputDataConfig, TrainingJobOutputDataConfigArgs
- S3OutputPath string - S3 URI where output data is stored.
- CompressionType string - Output compression type. Valid values: `GZIP`, `NONE`.
- KmsKeyId string - KMS key ID used to encrypt the output data.
- S3Output
Path string - S3 URI where output data is stored.
- Compression
Type string - Output compression type. Valid values:
GZIP,NONE. - Kms
Key stringId - KMS key ID used to encrypt the output data.
- s3Output
Path String - S3 URI where output data is stored.
- compression
Type String - Output compression type. Valid values:
GZIP,NONE. - kms
Key StringId - KMS key ID used to encrypt the output data.
- s3Output
Path string - S3 URI where output data is stored.
- compression
Type string - Output compression type. Valid values:
GZIP,NONE. - kms
Key stringId - KMS key ID used to encrypt the output data.
- s3_
output_ strpath - S3 URI where output data is stored.
- compression_
type str - Output compression type. Valid values:
GZIP,NONE. - kms_
key_ strid - KMS key ID used to encrypt the output data.
- s3Output
Path String - S3 URI where output data is stored.
- compression
Type String - Output compression type. Valid values:
GZIP,NONE. - kms
Key StringId - KMS key ID used to encrypt the output data.
TrainingJobProfilerConfig, TrainingJobProfilerConfigArgs
- DisableProfiler bool - Whether to disable the profiler.
- ProfilingIntervalInMilliseconds int - Time interval in milliseconds for capturing system metrics. Valid values: `100`, `200`, `500`, `1000`, `5000`, `60000`.
- ProfilingParameters Dictionary<string, string> - Map of profiling parameters. Maximum of 20 entries.
- S3OutputPath string - S3 URI where profiler output is stored.
- Disable
Profiler bool - Whether to disable the profiler.
- Profiling
Interval intIn Milliseconds - Time interval in milliseconds for capturing system metrics. Valid values:
100,200,500,1000,5000,60000. - Profiling
Parameters map[string]string - Map of profiling parameters. Maximum of 20 entries.
- S3Output
Path string - S3 URI where profiler output is stored.
- disable
Profiler Boolean - Whether to disable the profiler.
- profiling
Interval IntegerIn Milliseconds - Time interval in milliseconds for capturing system metrics. Valid values:
100,200,500,1000,5000,60000. - profiling
Parameters Map<String,String> - Map of profiling parameters. Maximum of 20 entries.
- s3Output
Path String - S3 URI where profiler output is stored.
- disable
Profiler boolean - Whether to disable the profiler.
- profiling
Interval numberIn Milliseconds - Time interval in milliseconds for capturing system metrics. Valid values:
100,200,500,1000,5000,60000. - profiling
Parameters {[key: string]: string} - Map of profiling parameters. Maximum of 20 entries.
- s3Output
Path string - S3 URI where profiler output is stored.
- disable_
profiler bool - Whether to disable the profiler.
- profiling_
interval_ intin_ milliseconds - Time interval in milliseconds for capturing system metrics. Valid values:
100,200,500,1000,5000,60000. - profiling_
parameters Mapping[str, str] - Map of profiling parameters. Maximum of 20 entries.
- s3_
output_ strpath - S3 URI where profiler output is stored.
- disable
Profiler Boolean - Whether to disable the profiler.
- profiling
Interval NumberIn Milliseconds - Time interval in milliseconds for capturing system metrics. Valid values:
100,200,500,1000,5000,60000. - profiling
Parameters Map<String> - Map of profiling parameters. Maximum of 20 entries.
- s3Output
Path String - S3 URI where profiler output is stored.
TrainingJobProfilerRuleConfiguration, TrainingJobProfilerRuleConfigurationArgs
- Rule
Configuration stringName - Name of the profiler rule configuration. Must be between 1 and 256 characters.
- Rule
Evaluator stringImage - Docker image URI for the profiler rule evaluator.
- Instance
Type string - Instance type to deploy for the profiler rule evaluation. Valid values are SageMaker AI processing instance types.
- Local
Path string - Local path where profiler rule output is written.
- Rule
Parameters Dictionary<string, string> - Map of parameters for the profiler rule. Maximum of 100 entries.
- S3Output
Path string - S3 URI where profiler rule output is stored.
- Volume
Size intIn Gb - Size of the storage volume for the profiler rule evaluator, in GB.
- Rule
Configuration stringName - Name of the profiler rule configuration. Must be between 1 and 256 characters.
- Rule
Evaluator stringImage - Docker image URI for the profiler rule evaluator.
- Instance
Type string - Instance type to deploy for the profiler rule evaluation. Valid values are SageMaker AI processing instance types.
- Local
Path string - Local path where profiler rule output is written.
- Rule
Parameters map[string]string - Map of parameters for the profiler rule. Maximum of 100 entries.
- S3Output
Path string - S3 URI where profiler rule output is stored.
- Volume
Size intIn Gb - Size of the storage volume for the profiler rule evaluator, in GB.
- rule
Configuration StringName - Name of the profiler rule configuration. Must be between 1 and 256 characters.
- rule
Evaluator StringImage - Docker image URI for the profiler rule evaluator.
- instance
Type String - Instance type to deploy for the profiler rule evaluation. Valid values are SageMaker AI processing instance types.
- local
Path String - Local path where profiler rule output is written.
- rule
Parameters Map<String,String> - Map of parameters for the profiler rule. Maximum of 100 entries.
- s3Output
Path String - S3 URI where profiler rule output is stored.
- volume
Size IntegerIn Gb - Size of the storage volume for the profiler rule evaluator, in GB.
- rule
Configuration stringName - Name of the profiler rule configuration. Must be between 1 and 256 characters.
- rule
Evaluator stringImage - Docker image URI for the profiler rule evaluator.
- instance
Type string - Instance type to deploy for the profiler rule evaluation. Valid values are SageMaker AI processing instance types.
- local
Path string - Local path where profiler rule output is written.
- rule
Parameters {[key: string]: string} - Map of parameters for the profiler rule. Maximum of 100 entries.
- s3Output
Path string - S3 URI where profiler rule output is stored.
- volume
Size numberIn Gb - Size of the storage volume for the profiler rule evaluator, in GB.
- rule_
configuration_ strname - Name of the profiler rule configuration. Must be between 1 and 256 characters.
- rule_
evaluator_ strimage - Docker image URI for the profiler rule evaluator.
- instance_
type str - Instance type to deploy for the profiler rule evaluation. Valid values are SageMaker AI processing instance types.
- local_
path str - Local path where profiler rule output is written.
- rule_
parameters Mapping[str, str] - Map of parameters for the profiler rule. Maximum of 100 entries.
- s3_
output_ strpath - S3 URI where profiler rule output is stored.
- volume_
size_ intin_ gb - Size of the storage volume for the profiler rule evaluator, in GB.
- rule
Configuration StringName - Name of the profiler rule configuration. Must be between 1 and 256 characters.
- rule
Evaluator StringImage - Docker image URI for the profiler rule evaluator.
- instance
Type String - Instance type to deploy for the profiler rule evaluation. Valid values are SageMaker AI processing instance types.
- local
Path String - Local path where profiler rule output is written.
- rule
Parameters Map<String> - Map of parameters for the profiler rule. Maximum of 100 entries.
- s3Output
Path String - S3 URI where profiler rule output is stored.
- volume
Size NumberIn Gb - Size of the storage volume for the profiler rule evaluator, in GB.
TrainingJobRemoteDebugConfig, TrainingJobRemoteDebugConfigArgs
- Enable
Remote boolDebug - Whether to enable remote debugging for the training job.
- Enable
Remote boolDebug - Whether to enable remote debugging for the training job.
- enable
Remote BooleanDebug - Whether to enable remote debugging for the training job.
- enable
Remote booleanDebug - Whether to enable remote debugging for the training job.
- enable_
remote_ booldebug - Whether to enable remote debugging for the training job.
- enable
Remote BooleanDebug - Whether to enable remote debugging for the training job.
TrainingJobResourceConfig, TrainingJobResourceConfigArgs
- InstanceCount int - Number of ML compute instances to use. Conflicts with `instanceGroups`.
- InstanceGroups List<TrainingJobResourceConfigInstanceGroup> - List of instance groups for heterogeneous cluster training. Maximum of 5. Conflicts with `instanceCount`, `instanceType`, and `keepAlivePeriodInSeconds`. See `instanceGroups` below.
- InstancePlacementConfig TrainingJobResourceConfigInstancePlacementConfig - Instance placement configuration. See `instancePlacementConfig` below.
- InstanceType string - ML compute instance type. Conflicts with `instanceGroups`.
- KeepAlivePeriodInSeconds int - Time in seconds to keep instances alive after training completes, for warm pool reuse. Valid values: 0–3600. Conflicts with `instanceGroups`.
- TrainingPlanArn string - ARN of the training plan to use.
- VolumeKmsKeyId string - KMS key ID used to encrypt data on the storage volume.
- VolumeSizeInGb int - Size of the storage volume attached to each instance, in GB.
- Instance
Count int - Number of ML compute instances to use. Conflicts with
instanceGroups. - Instance
Groups []TrainingJob Resource Config Instance Group - List of instance groups for heterogeneous cluster training. Maximum of 5. Conflicts with
instanceCount,instanceType, andkeepAlivePeriodInSeconds. SeeinstanceGroupsbelow. - Instance
Placement TrainingConfig Job Resource Config Instance Placement Config - Instance placement configuration. See
instancePlacementConfigbelow. - Instance
Type string - ML compute instance type. Conflicts with
instanceGroups. - Keep
Alive intPeriod In Seconds - Time in seconds to keep instances alive after training completes, for warm pool reuse. Valid values: 0–3600. Conflicts with
instanceGroups. - Training
Plan stringArn - ARN of the training plan to use.
- Volume
Kms stringKey Id - KMS key ID used to encrypt data on the storage volume.
- Volume
Size intIn Gb - Size of the storage volume attached to each instance, in GB.
- instance
Count Integer - Number of ML compute instances to use. Conflicts with
instanceGroups. - instance
Groups List<TrainingJob Resource Config Instance Group> - List of instance groups for heterogeneous cluster training. Maximum of 5. Conflicts with
instanceCount,instanceType, andkeepAlivePeriodInSeconds. SeeinstanceGroupsbelow. - instance
Placement TrainingConfig Job Resource Config Instance Placement Config - Instance placement configuration. See
instancePlacementConfigbelow. - instance
Type String - ML compute instance type. Conflicts with
instanceGroups. - keep
Alive IntegerPeriod In Seconds - Time in seconds to keep instances alive after training completes, for warm pool reuse. Valid values: 0–3600. Conflicts with
instanceGroups. - training
Plan StringArn - ARN of the training plan to use.
- volume
Kms StringKey Id - KMS key ID used to encrypt data on the storage volume.
- volume
Size IntegerIn Gb - Size of the storage volume attached to each instance, in GB.
- instance
Count number - Number of ML compute instances to use. Conflicts with
instanceGroups. - instance
Groups TrainingJob Resource Config Instance Group[] - List of instance groups for heterogeneous cluster training. Maximum of 5. Conflicts with
instanceCount,instanceType, andkeepAlivePeriodInSeconds. SeeinstanceGroupsbelow. - instance
Placement TrainingConfig Job Resource Config Instance Placement Config - Instance placement configuration. See
instancePlacementConfigbelow. - instance
Type string - ML compute instance type. Conflicts with
instanceGroups. - keep
Alive numberPeriod In Seconds - Time in seconds to keep instances alive after training completes, for warm pool reuse. Valid values: 0–3600. Conflicts with
instanceGroups. - training
Plan stringArn - ARN of the training plan to use.
- volume
Kms stringKey Id - KMS key ID used to encrypt data on the storage volume.
- volume
Size numberIn Gb - Size of the storage volume attached to each instance, in GB.
- instance_count int - Number of ML compute instances to use. Conflicts with `instanceGroups`.
- instance_groups Sequence[TrainingJobResourceConfigInstanceGroup] - List of instance groups for heterogeneous cluster training. Maximum of 5. Conflicts with `instanceCount`, `instanceType`, and `keepAlivePeriodInSeconds`. See `instanceGroups` below.
- instance_placement_config TrainingJobResourceConfigInstancePlacementConfig - Instance placement configuration. See `instancePlacementConfig` below.
- instance_type str - ML compute instance type. Conflicts with `instanceGroups`.
- keep_alive_period_in_seconds int - Time in seconds to keep instances alive after training completes, for warm pool reuse. Valid values: 0–3600. Conflicts with `instanceGroups`.
- training_plan_arn str - ARN of the training plan to use.
- volume_kms_key_id str - KMS key ID used to encrypt data on the storage volume.
- volume_size_in_gb int - Size of the storage volume attached to each instance, in GB.
- instanceCount Number - Number of ML compute instances to use. Conflicts with `instanceGroups`.
- instanceGroups List<Property Map> - List of instance groups for heterogeneous cluster training. Maximum of 5. Conflicts with `instanceCount`, `instanceType`, and `keepAlivePeriodInSeconds`. See `instanceGroups` below.
- instancePlacementConfig Property Map - Instance placement configuration. See `instancePlacementConfig` below.
- instanceType String - ML compute instance type. Conflicts with `instanceGroups`.
- keepAlivePeriodInSeconds Number - Time in seconds to keep instances alive after training completes, for warm pool reuse. Valid values: 0–3600. Conflicts with `instanceGroups`.
- trainingPlanArn String - ARN of the training plan to use.
- volumeKmsKeyId String - KMS key ID used to encrypt data on the storage volume.
- volumeSizeInGb Number - Size of the storage volume attached to each instance, in GB.
TrainingJobResourceConfigInstanceGroup, TrainingJobResourceConfigInstanceGroupArgs
- InstanceCount int - Number of instances in the group.
- InstanceGroupName string - Name of the instance group.
- InstanceType string - ML compute instance type for the group.
- InstanceCount int - Number of instances in the group.
- InstanceGroupName string - Name of the instance group.
- InstanceType string - ML compute instance type for the group.
- instanceCount Integer - Number of instances in the group.
- instanceGroupName String - Name of the instance group.
- instanceType String - ML compute instance type for the group.
- instanceCount number - Number of instances in the group.
- instanceGroupName string - Name of the instance group.
- instanceType string - ML compute instance type for the group.
- instance_count int - Number of instances in the group.
- instance_group_name str - Name of the instance group.
- instance_type str - ML compute instance type for the group.
- instanceCount Number - Number of instances in the group.
- instanceGroupName String - Name of the instance group.
- instanceType String - ML compute instance type for the group.
TrainingJobResourceConfigInstancePlacementConfig, TrainingJobResourceConfigInstancePlacementConfigArgs
- EnableMultipleJobs bool - Whether to enable multiple jobs on the same instance.
- PlacementSpecifications List<TrainingJobResourceConfigInstancePlacementConfigPlacementSpecification> - Placement specifications for instance placement. See `placementSpecifications` below.
- EnableMultipleJobs bool - Whether to enable multiple jobs on the same instance.
- PlacementSpecifications []TrainingJobResourceConfigInstancePlacementConfigPlacementSpecification - Placement specifications for instance placement. See `placementSpecifications` below.
- enableMultipleJobs Boolean - Whether to enable multiple jobs on the same instance.
- placementSpecifications List<TrainingJobResourceConfigInstancePlacementConfigPlacementSpecification> - Placement specifications for instance placement. See `placementSpecifications` below.
- enableMultipleJobs boolean - Whether to enable multiple jobs on the same instance.
- placementSpecifications TrainingJobResourceConfigInstancePlacementConfigPlacementSpecification[] - Placement specifications for instance placement. See `placementSpecifications` below.
- enable_multiple_jobs bool - Whether to enable multiple jobs on the same instance.
- placement_specifications Sequence[TrainingJobResourceConfigInstancePlacementConfigPlacementSpecification] - Placement specifications for instance placement. See `placementSpecifications` below.
- enableMultipleJobs Boolean - Whether to enable multiple jobs on the same instance.
- placementSpecifications List<Property Map> - Placement specifications for instance placement. See `placementSpecifications` below.
TrainingJobResourceConfigInstancePlacementConfigPlacementSpecification, TrainingJobResourceConfigInstancePlacementConfigPlacementSpecificationArgs
- InstanceCount int - Number of instances in the placement.
- UltraServerId string - Ultra server ID for the placement.
- InstanceCount int - Number of instances in the placement.
- UltraServerId string - Ultra server ID for the placement.
- instanceCount Integer - Number of instances in the placement.
- ultraServerId String - Ultra server ID for the placement.
- instanceCount number - Number of instances in the placement.
- ultraServerId string - Ultra server ID for the placement.
- instance_count int - Number of instances in the placement.
- ultra_server_id str - Ultra server ID for the placement.
- instanceCount Number - Number of instances in the placement.
- ultraServerId String - Ultra server ID for the placement.
TrainingJobRetryStrategy, TrainingJobRetryStrategyArgs
- MaximumRetryAttempts int - Maximum number of retry attempts. Valid values: 1–30.
- MaximumRetryAttempts int - Maximum number of retry attempts. Valid values: 1–30.
- maximumRetryAttempts Integer - Maximum number of retry attempts. Valid values: 1–30.
- maximumRetryAttempts number - Maximum number of retry attempts. Valid values: 1–30.
- maximum_retry_attempts int - Maximum number of retry attempts. Valid values: 1–30.
- maximumRetryAttempts Number - Maximum number of retry attempts. Valid values: 1–30.
TrainingJobServerlessJobConfig, TrainingJobServerlessJobConfigArgs
- BaseModelArn string - ARN of the base foundation model from the SageMaker AI Public Hub.
- JobType string - Serverless job type. Valid values: `FINE_TUNING`, `EVALUATION`, `DISTILLATION`.
- AcceptEula bool - Whether to accept the model EULA.
- CustomizationTechnique string - Customization technique to apply. Valid values: `FINE_TUNING`, `DOMAIN_ADAPTION`.
- EvaluationType string - Evaluation type. Valid values: `AUTOMATIC`, `HUMAN`, `NONE`.
- EvaluatorArn string - ARN of the evaluator.
- Peft string - Parameter-Efficient Fine-Tuning (PEFT) method. Valid values: `LORA`.
- BaseModelArn string - ARN of the base foundation model from the SageMaker AI Public Hub.
- JobType string - Serverless job type. Valid values: `FINE_TUNING`, `EVALUATION`, `DISTILLATION`.
- AcceptEula bool - Whether to accept the model EULA.
- CustomizationTechnique string - Customization technique to apply. Valid values: `FINE_TUNING`, `DOMAIN_ADAPTION`.
- EvaluationType string - Evaluation type. Valid values: `AUTOMATIC`, `HUMAN`, `NONE`.
- EvaluatorArn string - ARN of the evaluator.
- Peft string - Parameter-Efficient Fine-Tuning (PEFT) method. Valid values: `LORA`.
- baseModelArn String - ARN of the base foundation model from the SageMaker AI Public Hub.
- jobType String - Serverless job type. Valid values: `FINE_TUNING`, `EVALUATION`, `DISTILLATION`.
- acceptEula Boolean - Whether to accept the model EULA.
- customizationTechnique String - Customization technique to apply. Valid values: `FINE_TUNING`, `DOMAIN_ADAPTION`.
- evaluationType String - Evaluation type. Valid values: `AUTOMATIC`, `HUMAN`, `NONE`.
- evaluatorArn String - ARN of the evaluator.
- peft String - Parameter-Efficient Fine-Tuning (PEFT) method. Valid values: `LORA`.
- baseModelArn string - ARN of the base foundation model from the SageMaker AI Public Hub.
- jobType string - Serverless job type. Valid values: `FINE_TUNING`, `EVALUATION`, `DISTILLATION`.
- acceptEula boolean - Whether to accept the model EULA.
- customizationTechnique string - Customization technique to apply. Valid values: `FINE_TUNING`, `DOMAIN_ADAPTION`.
- evaluationType string - Evaluation type. Valid values: `AUTOMATIC`, `HUMAN`, `NONE`.
- evaluatorArn string - ARN of the evaluator.
- peft string - Parameter-Efficient Fine-Tuning (PEFT) method. Valid values: `LORA`.
- base_model_arn str - ARN of the base foundation model from the SageMaker AI Public Hub.
- job_type str - Serverless job type. Valid values: `FINE_TUNING`, `EVALUATION`, `DISTILLATION`.
- accept_eula bool - Whether to accept the model EULA.
- customization_technique str - Customization technique to apply. Valid values: `FINE_TUNING`, `DOMAIN_ADAPTION`.
- evaluation_type str - Evaluation type. Valid values: `AUTOMATIC`, `HUMAN`, `NONE`.
- evaluator_arn str - ARN of the evaluator.
- peft str - Parameter-Efficient Fine-Tuning (PEFT) method. Valid values: `LORA`.
- baseModelArn String - ARN of the base foundation model from the SageMaker AI Public Hub.
- jobType String - Serverless job type. Valid values: `FINE_TUNING`, `EVALUATION`, `DISTILLATION`.
- acceptEula Boolean - Whether to accept the model EULA.
- customizationTechnique String - Customization technique to apply. Valid values: `FINE_TUNING`, `DOMAIN_ADAPTION`.
- evaluationType String - Evaluation type. Valid values: `AUTOMATIC`, `HUMAN`, `NONE`.
- evaluatorArn String - ARN of the evaluator.
- peft String - Parameter-Efficient Fine-Tuning (PEFT) method. Valid values: `LORA`.
TrainingJobSessionChainingConfig, TrainingJobSessionChainingConfigArgs
- EnableSessionTagChaining bool - Whether to enable session tag chaining for the training job.
- EnableSessionTagChaining bool - Whether to enable session tag chaining for the training job.
- enableSessionTagChaining Boolean - Whether to enable session tag chaining for the training job.
- enableSessionTagChaining boolean - Whether to enable session tag chaining for the training job.
- enable_session_tag_chaining bool - Whether to enable session tag chaining for the training job.
- enableSessionTagChaining Boolean - Whether to enable session tag chaining for the training job.
TrainingJobStoppingCondition, TrainingJobStoppingConditionArgs
- MaxPendingTimeInSeconds int - Maximum time in seconds a training job can be pending before it is stopped. Valid values: 7200–2419200.
- MaxRuntimeInSeconds int - Maximum time in seconds the training job can run before it is stopped.
- MaxWaitTimeInSeconds int - Maximum time in seconds to wait for a managed spot training job to complete.
- MaxPendingTimeInSeconds int - Maximum time in seconds a training job can be pending before it is stopped. Valid values: 7200–2419200.
- MaxRuntimeInSeconds int - Maximum time in seconds the training job can run before it is stopped.
- MaxWaitTimeInSeconds int - Maximum time in seconds to wait for a managed spot training job to complete.
- maxPendingTimeInSeconds Integer - Maximum time in seconds a training job can be pending before it is stopped. Valid values: 7200–2419200.
- maxRuntimeInSeconds Integer - Maximum time in seconds the training job can run before it is stopped.
- maxWaitTimeInSeconds Integer - Maximum time in seconds to wait for a managed spot training job to complete.
- maxPendingTimeInSeconds number - Maximum time in seconds a training job can be pending before it is stopped. Valid values: 7200–2419200.
- maxRuntimeInSeconds number - Maximum time in seconds the training job can run before it is stopped.
- maxWaitTimeInSeconds number - Maximum time in seconds to wait for a managed spot training job to complete.
- max_pending_time_in_seconds int - Maximum time in seconds a training job can be pending before it is stopped. Valid values: 7200–2419200.
- max_runtime_in_seconds int - Maximum time in seconds the training job can run before it is stopped.
- max_wait_time_in_seconds int - Maximum time in seconds to wait for a managed spot training job to complete.
- maxPendingTimeInSeconds Number - Maximum time in seconds a training job can be pending before it is stopped. Valid values: 7200–2419200.
- maxRuntimeInSeconds Number - Maximum time in seconds the training job can run before it is stopped.
- maxWaitTimeInSeconds Number - Maximum time in seconds to wait for a managed spot training job to complete.
TrainingJobTensorBoardOutputConfig, TrainingJobTensorBoardOutputConfigArgs
- S3OutputPath string - S3 URI where TensorBoard output is stored.
- LocalPath string - Local path where TensorBoard output is written.
- S3OutputPath string - S3 URI where TensorBoard output is stored.
- LocalPath string - Local path where TensorBoard output is written.
- s3OutputPath String - S3 URI where TensorBoard output is stored.
- localPath String - Local path where TensorBoard output is written.
- s3OutputPath string - S3 URI where TensorBoard output is stored.
- localPath string - Local path where TensorBoard output is written.
- s3_output_path str - S3 URI where TensorBoard output is stored.
- local_path str - Local path where TensorBoard output is written.
- s3OutputPath String - S3 URI where TensorBoard output is stored.
- localPath String - Local path where TensorBoard output is written.
TrainingJobTimeouts, TrainingJobTimeoutsArgs
- Create string
- A string that can be parsed as a duration consisting of numbers and unit suffixes, such as "30s" or "2h45m". Valid time units are "s" (seconds), "m" (minutes), "h" (hours).
- Delete string
- A string that can be parsed as a duration consisting of numbers and unit suffixes, such as "30s" or "2h45m". Valid time units are "s" (seconds), "m" (minutes), "h" (hours). Setting a timeout for a Delete operation is only applicable if changes are saved into state before the destroy operation occurs.
- Update string
- A string that can be parsed as a duration consisting of numbers and unit suffixes, such as "30s" or "2h45m". Valid time units are "s" (seconds), "m" (minutes), "h" (hours).
- Create string
- A string that can be parsed as a duration consisting of numbers and unit suffixes, such as "30s" or "2h45m". Valid time units are "s" (seconds), "m" (minutes), "h" (hours).
- Delete string
- A string that can be parsed as a duration consisting of numbers and unit suffixes, such as "30s" or "2h45m". Valid time units are "s" (seconds), "m" (minutes), "h" (hours). Setting a timeout for a Delete operation is only applicable if changes are saved into state before the destroy operation occurs.
- Update string
- A string that can be parsed as a duration consisting of numbers and unit suffixes, such as "30s" or "2h45m". Valid time units are "s" (seconds), "m" (minutes), "h" (hours).
- create String
- A string that can be parsed as a duration consisting of numbers and unit suffixes, such as "30s" or "2h45m". Valid time units are "s" (seconds), "m" (minutes), "h" (hours).
- delete String
- A string that can be parsed as a duration consisting of numbers and unit suffixes, such as "30s" or "2h45m". Valid time units are "s" (seconds), "m" (minutes), "h" (hours). Setting a timeout for a Delete operation is only applicable if changes are saved into state before the destroy operation occurs.
- update String
- A string that can be parsed as a duration consisting of numbers and unit suffixes, such as "30s" or "2h45m". Valid time units are "s" (seconds), "m" (minutes), "h" (hours).
- create string
- A string that can be parsed as a duration consisting of numbers and unit suffixes, such as "30s" or "2h45m". Valid time units are "s" (seconds), "m" (minutes), "h" (hours).
- delete string
- A string that can be parsed as a duration consisting of numbers and unit suffixes, such as "30s" or "2h45m". Valid time units are "s" (seconds), "m" (minutes), "h" (hours). Setting a timeout for a Delete operation is only applicable if changes are saved into state before the destroy operation occurs.
- update string
- A string that can be parsed as a duration consisting of numbers and unit suffixes, such as "30s" or "2h45m". Valid time units are "s" (seconds), "m" (minutes), "h" (hours).
- create str
- A string that can be parsed as a duration consisting of numbers and unit suffixes, such as "30s" or "2h45m". Valid time units are "s" (seconds), "m" (minutes), "h" (hours).
- delete str
- A string that can be parsed as a duration consisting of numbers and unit suffixes, such as "30s" or "2h45m". Valid time units are "s" (seconds), "m" (minutes), "h" (hours). Setting a timeout for a Delete operation is only applicable if changes are saved into state before the destroy operation occurs.
- update str
- A string that can be parsed as a duration consisting of numbers and unit suffixes, such as "30s" or "2h45m". Valid time units are "s" (seconds), "m" (minutes), "h" (hours).
- create String
- A string that can be parsed as a duration consisting of numbers and unit suffixes, such as "30s" or "2h45m". Valid time units are "s" (seconds), "m" (minutes), "h" (hours).
- delete String
- A string that can be parsed as a duration consisting of numbers and unit suffixes, such as "30s" or "2h45m". Valid time units are "s" (seconds), "m" (minutes), "h" (hours). Setting a timeout for a Delete operation is only applicable if changes are saved into state before the destroy operation occurs.
- update String
- A string that can be parsed as a duration consisting of numbers and unit suffixes, such as "30s" or "2h45m". Valid time units are "s" (seconds), "m" (minutes), "h" (hours).
TrainingJobVpcConfig, TrainingJobVpcConfigArgs
- SecurityGroupIds List<string> - List of VPC security group IDs. Maximum of 5.
- Subnets List<string> - List of subnet IDs. Maximum of 16.
- SecurityGroupIds []string - List of VPC security group IDs. Maximum of 5.
- Subnets []string - List of subnet IDs. Maximum of 16.
- securityGroupIds List<String> - List of VPC security group IDs. Maximum of 5.
- subnets List<String> - List of subnet IDs. Maximum of 16.
- securityGroupIds string[] - List of VPC security group IDs. Maximum of 5.
- subnets string[] - List of subnet IDs. Maximum of 16.
- security_group_ids Sequence[str] - List of VPC security group IDs. Maximum of 5.
- subnets Sequence[str] - List of subnet IDs. Maximum of 16.
- securityGroupIds List<String> - List of VPC security group IDs. Maximum of 5.
- subnets List<String> - List of subnet IDs. Maximum of 16.
Import
Identity Schema
Required
trainingJobName - (String) Name of the Training Job.
Optional
accountId - (String) AWS Account where this resource is managed.
region - (String) Region where this resource is managed.
Using pulumi import, import SageMaker AI Training Job using the trainingJobName. For example:
$ pulumi import aws:sagemaker/trainingJob:TrainingJob example my-training-job
To learn more about importing existing cloud resources, see Importing resources.
Package Details
- Repository
- AWS Classic pulumi/pulumi-aws
- License
- Apache-2.0
- Notes
- This Pulumi package is based on the `aws` Terraform Provider.
published on Thursday, Apr 30, 2026 by Pulumi
