{"id":"0790995368","title":"Forward Deployed Engineer (GPU Clusters)","posted_at":"2026-05-07T22:46:42.000Z","apply_url":"https://job-boards.greenhouse.io/togetherai/jobs/5121371007","locations":["San Francisco, CA"],"employment_type":"full_time","workplace_type":null,"seniority_level":"mid","description_language":"en","source_name":"greenhouse","source_url":"https://job-boards.greenhouse.io/togetherai/jobs/5121371007","salary":{"min":270000,"max":300000,"currency":"USD","period":"year","display":"$270,000–$300,000"},"job_summary":"Together AI is a research-driven artificial intelligence company focused on lowering the cost of modern AI systems. This Forward Deployed Engineer role acts as a technical partner to strategic customers, specializing in large-scale GPU infrastructure, cluster orchestration, and performance optimization.","job_description":{"responsibilities":["Design and execute rigorous pre-handover test suites for cluster stability","Act as the primary technical point of contact for model labs to tune orchestration layers","Profile and debug low-level bottlenecks in networking and storage systems","Build reference designs and out-of-the-box configurations for training frameworks","Lead complex benchmarking exercises to demonstrate performance impacts","Influence hardware and software roadmaps by surfacing performance gaps"],"minimum_qualifications":["5+ years in a technical role with a focus on Large-Scale GPU Infrastructure","Hands-on experience with Kubernetes or SLURM","Expert knowledge of InfiniBand, RoCE, and NVLink","Familiarity with parallel file systems and object storage","Ability to run and interpret training benchmarks","Proficiency in Python, shell scripting, and automation tools like Ansible"],"preferred_qualifications":["Experience with GPU-operator and device plugins","Familiarity with VAST or Weka parallel file systems"]},"visa_sponsorship":null,"experience_years_min":5,"job_address":null,"job_city":"San Francisco","job_state":"CA","job_country":"US","location_lat":37.7667845,"location_lng":-122.4025894,"keywords":["infrastructure","communication","intelligence","large-scale","performance","competitive","engineering","algorithms","Kubernetes","automation","onboarding","datasets","hardware","Platform","roadmaps","customer","research","leading","privacy","design","Growth","Python","debug","Sales","GPU","AI","CX"],"company":{"name":"Together AI","logo_url":"https://img.logo.dev/together.ai?token=pk_fWx5G5QrQMm-0Ud8BW3mBg&size=64&format=png","description":"Together AI builds and operates high-performance inference and reinforcement learning systems designed to make large-scale AI models faster, cheaper, and more capable.","website_url":"https://together.ai","linkedin_url":"https://www.linkedin.com/company/togethercomputer","glassdoor_url":null,"x_url":"https://x.com/togethercompute","instagram_url":null,"youtube_url":"https://www.youtube.com/@togetherdotai","github_url":"https://github.com/togethercomputer","huggingface_url":"https://huggingface.co/togethercomputer","tiktok_url":null,"crunchbase_url":null,"facebook_url":null,"employee_count_range":"201-500","employee_count":335,"founded_year":2022,"headquarters":{"address":"251 Rhode Island Street Suite 205, San Francisco, CA 94103, United States","city":"San Francisco, CA","country":"US","lat":37.7879363,"lng":-122.4075201},"industry":"other","company_type":"startup","total_funding_usd":534000000,"locations":["Amsterdam","Amsterdam, Netherlands","London, United Kingdom","San Francisco, CA"]}}