Proper way to utilize >1 GPUs in a single machine?

@thealanjason my current solution for 4 GPUs with Docker is as follows. The idea is to assign a GPU ID to each job and to have a separate process for each GPU, which runs its assigned jobs:

// Define 4 "identical" processes, one for each GPU ID
// GPU 0 variant of the process (one of four otherwise identical definitions).
// Directives use the process-directive form (no `=`); the `=` form is
// configuration-file syntax.
process run_my_process_0 {
    // Limit the container's CPUs to params.cpus and expose only GPU device 0 to it.
    containerOptions "--cpus=${params.cpus} --gpus '\"device=0\"' "
    // At most one task of this process runs at a time, so tasks of this
    // process never run concurrently on the GPU.
    maxForks 1
    ...
}

// GPU 1 variant of the process (one of four otherwise identical definitions).
// Directives use the process-directive form (no `=`); the `=` form is
// configuration-file syntax.
process run_my_process_1 {
    // Limit the container's CPUs to params.cpus and expose only GPU device 1 to it.
    containerOptions "--cpus=${params.cpus} --gpus '\"device=1\"' "
    // At most one task of this process runs at a time, so tasks of this
    // process never run concurrently on the GPU.
    maxForks 1
    ...
}

// GPU 2 variant of the process (one of four otherwise identical definitions).
// Directives use the process-directive form (no `=`); the `=` form is
// configuration-file syntax.
process run_my_process_2 {
    // Limit the container's CPUs to params.cpus and expose only GPU device 2 to it.
    containerOptions "--cpus=${params.cpus} --gpus '\"device=2\"' "
    // At most one task of this process runs at a time, so tasks of this
    // process never run concurrently on the GPU.
    maxForks 1
    ...
}

// GPU 3 variant of the process (one of four otherwise identical definitions).
// Directives use the process-directive form (no `=`); the `=` form is
// configuration-file syntax.
process run_my_process_3 {
    // Limit the container's CPUs to params.cpus and expose only GPU device 3 to it.
    containerOptions "--cpus=${params.cpus} --gpus '\"device=3\"' "
    // At most one task of this process runs at a time, so tasks of this
    // process never run concurrently on the GPU.
    maxForks 1
    ...
}

// Sub-workflow that dispatches each incoming item to the process bound to its GPU.
// Each item of `input_set` is indexed as a list: element 0 is forwarded to the
// process, element 2 is the GPU assignment (0-3) used for routing.
workflow run_my_process_channels {
    take:
    input_set

    main:
    // Split the input into one output channel per GPU assignment
    jobs = input_set.branch { job ->
        id0: job[2] == 0
        id1: job[2] == 1
        id2: job[2] == 2
        id3: job[2] == 3
    }

    // Feed every branch to the process pinned to the matching GPU,
    // passing along only the first element of each item
    run_my_process_0(jobs.id0.map { job -> job[0] })
    run_my_process_1(jobs.id1.map { job -> job[0] })
    run_my_process_2(jobs.id2.map { job -> job[0] })
    run_my_process_3(jobs.id3.map { job -> job[0] })
}

// Entry workflow: tags every *.txt file under params.input with a running job ID
// and a GPU assignment, then hands the tagged items to the dispatching sub-workflow.
workflow {
    // Running counter; incremented before use, so the first file gets job ID 1
    def job_id = 0
    new_inputs = Channel
        .fromPath("${params.input}/*.txt")
        .map { txt ->
            job_id += 1
            // [file, job ID, GPU assignment = job ID modulo 4]
            [txt, job_id, job_id % 4]
        }

    run_my_process_channels(new_inputs)
}