[{"data":1,"prerenderedAt":447},["ShallowReactive",2],{"project-droid-slam-hpc-port":3},{"id":4,"title":5,"body":6,"date":427,"description":428,"extension":429,"icon":430,"image":431,"live":432,"meta":433,"navigation":123,"path":434,"repo":432,"seo":435,"stem":436,"tags":437,"__hash__":446},"projects\u002Fprojects\u002Fdroid-slam-hpc-port.md","DROID-SLAM HPC Port",{"type":7,"value":8,"toc":414},"minimark",[9,14,25,29,32,66,70,77,82,261,265,280,284,295,299,306,352,367,371,374,381,384,388,403,407,410],[10,11,13],"h2",{"id":12},"overview","Overview",[15,16,17,24],"p",{},[18,19,23],"a",{"href":20,"rel":21},"https:\u002F\u002Fgithub.com\u002Fprinceton-vl\u002FDROID-SLAM",[22],"nofollow","DROID-SLAM"," is a deep-learning-based visual odometry system developed at Princeton. During my research internship at the University of Cape Town (UCT), I was tasked with making it run reliably on UCT's HPC cluster and building tooling to make the whole workflow manageable from a local machine.",[10,26,28],{"id":27},"the-challenges","The Challenges",[15,30,31],{},"Running deep-learning pipelines on an HPC cluster is rarely plug-and-play:",[33,34,35,48,54,60],"ul",{},[36,37,38,42,43,47],"li",{},[39,40,41],"strong",{},"No X server"," — the cluster has no display server. DROID-SLAM's built-in visualiser cannot run; ",[44,45,46],"code",{},"--disable_vis"," is mandatory, which meant the standard reconstruction export path was broken",[36,49,50,53],{},[39,51,52],{},"No outbound internet on compute nodes"," — all dependencies and container images must be pre-staged",[36,55,56,59],{},[39,57,58],{},"SLURM job queues"," — GPU jobs are submitted as batch scripts with no interactive debugging",[36,61,62,65],{},[39,63,64],{},"Dependency isolation"," — DROID-SLAM requires specific PyTorch and CUDA extension versions incompatible with the cluster's global modules",[10,67,69],{"id":68},"droidslamcli-the-go-cli","DROIDSLAMCLI — The Go CLI",[15,71,72,73,76],{},"The core deliverable was a full ",[39,74,75],{},"Go CLI application"," built with the Cobra and Viper frameworks. It wraps the entire DROID-SLAM workflow — validating inputs, templating SLURM scripts, uploading data to the cluster, submitting jobs, monitoring progress, and pulling results back — all over SSH\u002FSFTP from a local machine.",[78,79,81],"h3",{"id":80},"commands","Commands",[83,84,89],"pre",{"className":85,"code":86,"language":87,"meta":88,"style":88},"language-bash shiki shiki-themes github-light github-dark","# Submit an inference job\ndroidslamcli infer --config config.yaml\n\n# Submit a training job\ndroidslamcli train --config config.yaml\n\n# Check running jobs\ndroidslamcli status\n\n# Stream stdout from a specific job\ndroidslamcli status --jobID=\u003CjobID>\n\n# Extract results for a specific job\ndroidslamcli extract infer --jobId=\u003CjobID> --location=.\u002Fresults --config config.yaml\n\n# Extract results for all jobs\ndroidslamcli extract infer -a --location=.\u002Fresults --config config.yaml\n","bash","",[44,90,91,100,118,125,131,143,148,154,162,167,173,194,199,205,232,237,243],{"__ignoreMap":88},[92,93,96],"span",{"class":94,"line":95},"line",1,[92,97,99],{"class":98},"sJ8bj","# Submit an inference job\n",[92,101,103,107,111,115],{"class":94,"line":102},2,[92,104,106],{"class":105},"sScJk","droidslamcli",[92,108,110],{"class":109},"sZZnC"," infer",[92,112,114],{"class":113},"sj4cs"," --config",[92,116,117],{"class":109}," config.yaml\n",[92,119,121],{"class":94,"line":120},3,[92,122,124],{"emptyLinePlaceholder":123},true,"\n",[92,126,128],{"class":94,"line":127},4,[92,129,130],{"class":98},"# Submit a training job\n",[92,132,134,136,139,141],{"class":94,"line":133},5,[92,135,106],{"class":105},[92,137,138],{"class":109}," train",[92,140,114],{"class":113},[92,142,117],{"class":109},[92,144,146],{"class":94,"line":145},6,[92,147,124],{"emptyLinePlaceholder":123},[92,149,151],{"class":94,"line":150},7,[92,152,153],{"class":98},"# Check running jobs\n",[92,155,157,159],{"class":94,"line":156},8,[92,158,106],{"class":105},[92,160,161],{"class":109}," status\n",[92,163,165],{"class":94,"line":164},9,[92,166,124],{"emptyLinePlaceholder":123},[92,168,170],{"class":94,"line":169},10,[92,171,172],{"class":98},"# Stream stdout from a specific job\n",[92,174,176,178,181,184,188,191],{"class":94,"line":175},11,[92,177,106],{"class":105},[92,179,180],{"class":109}," status",[92,182,183],{"class":113}," --jobID=",[92,185,187],{"class":186},"szBVR","\u003C",[92,189,190],{"class":113},"jobID",[92,192,193],{"class":186},">\n",[92,195,197],{"class":94,"line":196},12,[92,198,124],{"emptyLinePlaceholder":123},[92,200,202],{"class":94,"line":201},13,[92,203,204],{"class":98},"# Extract results for a specific job\n",[92,206,208,210,213,215,218,220,222,225,228,230],{"class":94,"line":207},14,[92,209,106],{"class":105},[92,211,212],{"class":109}," extract",[92,214,110],{"class":109},[92,216,217],{"class":113}," --jobId=",[92,219,187],{"class":186},[92,221,190],{"class":113},[92,223,224],{"class":186},">",[92,226,227],{"class":113}," --location=.\u002Fresults",[92,229,114],{"class":113},[92,231,117],{"class":109},[92,233,235],{"class":94,"line":234},15,[92,236,124],{"emptyLinePlaceholder":123},[92,238,240],{"class":94,"line":239},16,[92,241,242],{"class":98},"# Extract results for all jobs\n",[92,244,246,248,250,252,255,257,259],{"class":94,"line":245},17,[92,247,106],{"class":105},[92,249,212],{"class":109},[92,251,110],{"class":109},[92,253,254],{"class":113}," -a",[92,256,227],{"class":113},[92,258,114],{"class":113},[92,260,117],{"class":109},[78,262,264],{"id":263},"ssh-sftp","SSH \u002F SFTP",[15,266,267,268,271,272,275,276,279],{},"The CLI communicates with the HPC cluster entirely over SSH. Commands like ",[44,269,270],{},"sbatch",", ",[44,273,274],{},"squeue",", and path setup are executed as remote shell commands, while files (images, calibration data, model weights) are transferred and results retrieved via SFTP. This meant the entire workflow — from job submission to results extraction — could be driven from a local Windows or macOS machine without needing to manually ",[44,277,278],{},"scp"," files or log in to the cluster.",[78,281,283],{"id":282},"singularity-instead-of-docker","Singularity instead of Docker",[15,285,286,287,290,291,294],{},"Most HPC clusters don't allow Docker because it requires daemon-level (root) access, which is a security risk in a shared multi-user environment. Instead, the cluster supports ",[39,288,289],{},"Singularity",", which runs containers as the current user with no elevated privileges. The CLI and SLURM runner scripts were built around Singularity — pulling the DROID-SLAM Docker image from DockerHub and converting it to a Singularity Image File (",[44,292,293],{},".sif",") on first use, then reusing the cached image for subsequent jobs.",[78,296,298],{"id":297},"inference-pipeline","Inference Pipeline",[15,300,301,302,305],{},"When ",[44,303,304],{},"droidslamcli infer"," is run:",[307,308,309,323,329,335,344],"ol",{},[36,310,311,314,315,318,319,322],{},[39,312,313],{},"Validates"," local input files — PNG images directory, ",[44,316,317],{},".txt"," calibration file, ",[44,320,321],{},".pth"," model weights",[36,324,325,328],{},[39,326,327],{},"Generates a UUID"," as the job ID for isolation",[36,330,331,334],{},[39,332,333],{},"Creates the remote directory structure"," and uploads calibration file, model weights, and all images via SFTP",[36,336,337,340,341],{},[39,338,339],{},"Templates"," the SLURM header and runner scripts with config values (account, time limit, GPU type, partition) using Go's ",[44,342,343],{},"text\u002Ftemplate",[36,345,346,349,350],{},[39,347,348],{},"Submits"," the job via ",[44,351,270],{},[15,353,354,355,358,359,362,363,366],{},"The SLURM runner script on the compute node then downloads the Singularity image, clones the ",[44,356,357],{},"headless_changes"," fork, compiles it inside the container, and executes inference with ",[44,360,361],{},"--save_headless --disable_vis",". Results are pulled back to the local machine via ",[44,364,365],{},"droidslamcli extract",".",[10,368,370],{"id":369},"forking-droid-slam-making-it-headless","Forking DROID-SLAM — Making it Headless",[15,372,373],{},"DROID-SLAM was never designed to run without a screen. Its visualiser — the part that shows you the 3D reconstruction being built in real time — and the code that actually saves the output were tangled together. On an HPC cluster there's no screen, so the visualiser crashes immediately, and with it, any chance of getting results out.",[15,375,376,377,380],{},"The fix was to fork the project and separate these two concerns. I wrote a new headless export module that does everything the visualiser does internally — reading the 3D data DROID-SLAM builds up during processing — but instead of drawing it on screen, it just saves the output straight to files: a point cloud of the reconstructed scene and a record of where the camera was at each moment. A new ",[44,378,379],{},"--save_headless"," flag was added so you could opt into this behaviour from the command line.",[15,382,383],{},"A few other papercuts were fixed along the way: the output path logic had a bug that was nesting results in an unintended subfolder, the trajectory data wasn't being saved at all, and the training pipeline was hardcoded to only work with one specific dataset format. That last fix meant the system could now be trained on custom-captured footage rather than being locked to a single public dataset.",[10,385,387],{"id":386},"results","Results",[33,389,390,397,400],{},[36,391,392,393,396],{},"Successfully ran DROID-SLAM inference on the ",[39,394,395],{},"TUM RGB-D"," dataset on the HPC cluster's A100 GPU nodes",[36,398,399],{},"Reproduced trajectory results matching the paper's benchmarks",[36,401,402],{},"The fork, CLI, and containerised workflow were documented and handed off for continued use by the UCT robotics research group",[10,404,406],{"id":405},"learnings","Learnings",[15,408,409],{},"This project touched a wide stack — SLURM job scheduling, Singularity containers, SSH automation in Go, and the internals of a deep-learning visual odometry system. The headless export problem was the most interesting challenge: to solve it I had to understand how DROID-SLAM builds up its 3D reconstruction internally and find a way to get that data out without any of the display infrastructure it was designed around.",[411,412,413],"style",{},"html pre.shiki code .sJ8bj, html code.shiki .sJ8bj{--shiki-default:#6A737D;--shiki-dark:#6A737D}html pre.shiki code .sScJk, html code.shiki .sScJk{--shiki-default:#6F42C1;--shiki-dark:#B392F0}html pre.shiki code .sZZnC, html code.shiki .sZZnC{--shiki-default:#032F62;--shiki-dark:#9ECBFF}html pre.shiki code .sj4cs, html code.shiki .sj4cs{--shiki-default:#005CC5;--shiki-dark:#79B8FF}html pre.shiki code .szBVR, html code.shiki .szBVR{--shiki-default:#D73A49;--shiki-dark:#F97583}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .dark .shiki span {color: var(--shiki-dark);background: var(--shiki-dark-bg);font-style: var(--shiki-dark-font-style);font-weight: var(--shiki-dark-font-weight);text-decoration: var(--shiki-dark-text-decoration);}html.dark .shiki span {color: var(--shiki-dark);background: var(--shiki-dark-bg);font-style: var(--shiki-dark-font-style);font-weight: var(--shiki-dark-font-weight);text-decoration: var(--shiki-dark-text-decoration);}",{"title":88,"searchDepth":102,"depth":102,"links":415},[416,417,418,424,425,426],{"id":12,"depth":102,"text":13},{"id":27,"depth":102,"text":28},{"id":68,"depth":102,"text":69,"children":419},[420,421,422,423],{"id":80,"depth":120,"text":81},{"id":263,"depth":120,"text":264},{"id":282,"depth":120,"text":283},{"id":297,"depth":120,"text":298},{"id":369,"depth":102,"text":370},{"id":386,"depth":102,"text":387},{"id":405,"depth":102,"text":406},"Jul 2023","Ported the DROID-SLAM visual-odometry framework to UCT's HPC environment and built a full Go CLI for executing, managing, and extracting inference and training jobs over SSH.","md","memory","\u002Fassets\u002Fproj-hpc.jpg",null,{},"\u002Fprojects\u002Fdroid-slam-hpc-port",{"title":5,"description":428},"projects\u002Fdroid-slam-hpc-port",[438,439,440,441,442,443,444,289,445],"Python","Docker","SLAM","Linux","Go","HPC","SLURM","SSH","55JWuPR6RHe4i6KSYWk5r0hielir1iTSCO67WfHnngE",1780176250363]