diff --git a/CHANGELOG.md b/CHANGELOG.md index c203de8d5..237465f0a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## Unreleased + +### Fixes +* Fixes multi-node tasks defined in a job file. + ## v0.25.0 ### Breaking change diff --git a/crates/hyperqueue/src/client/commands/submit/command.rs b/crates/hyperqueue/src/client/commands/submit/command.rs index 1774ce989..c1fb4e74d 100644 --- a/crates/hyperqueue/src/client/commands/submit/command.rs +++ b/crates/hyperqueue/src/client/commands/submit/command.rs @@ -695,8 +695,6 @@ pub async fn submit_computation( .unwrap_or_else(|| "job".to_string()) }; - // Force task_dir for multi node tasks (for a place where to create node file) - let task_dir = task_dir | (resources.n_nodes > 0); let resources = ResourceRequestVariants::new(smallvec![resources]); let args: Vec = commands.into_iter().map(|arg| arg.into()).collect(); diff --git a/crates/hyperqueue/src/worker/start/program.rs b/crates/hyperqueue/src/worker/start/program.rs index 39392cbde..37523479f 100644 --- a/crates/hyperqueue/src/worker/start/program.rs +++ b/crates/hyperqueue/src/worker/start/program.rs @@ -86,7 +86,7 @@ pub(super) fn build_program_task( pin_program(&mut program, build_ctx.allocation(), pin_mode, &build_ctx)?; - let task_dir = if task_dir { + let task_dir = if task_dir || !build_ctx.node_list().is_empty() { let task_dir = TempDir::with_prefix_in("t", &build_ctx.worker_configuration().work_dir) .map_err(|error| { format!( diff --git a/tests/test_jobfile.py b/tests/test_jobfile.py index 605433b35..3f803e030 100644 --- a/tests/test_jobfile.py +++ b/tests/test_jobfile.py @@ -440,3 +440,23 @@ def test_job_file_stream(hq_env: HqEnv, tmp_path): wait_for_job_state(hq_env, 1, "FINISHED") result = hq_env.command(["output-log", "output", "cat", "1", "stdout"]) assert result == "Hello\n" + + +def test_job_file_multinode(hq_env: HqEnv, tmp_path): + hq_env.start_server() + hq_env.start_worker() + hq_env.start_worker() + 
tmp_path.joinpath("job.toml").write_text( + """ + [[task]] + id = 0 + command = ["bash", "-c", "echo ${HQ_NUM_NODES}; cat ${HQ_NODE_FILE}"] + [[task.request]] + n_nodes = 2 + """ + ) + hq_env.command(["job", "submit-file", "job.toml"]) + wait_for_job_state(hq_env, 1, "FINISHED") + with open(default_task_output(1)) as f: + lines = sorted(f.read().rstrip().split("\n")) + assert lines == ["2", "worker1", "worker2"]