How to Build High-Performance GPU-Accelerated Simulations and Differentiable Physics Workflows Using NVIDIA Warp Kernels

angles = np.linspace(0.0, 2.0 * np.pi, n_particles, endpoint=False, dtype=np.float32)
px0_np = 0.4 * np.cos(angles).astype(np.float32)
py0_np = (0.7 + 0.15 * np.sin(angles)).astype(np.float32)
vx0_np = (-0.8 * np.sin(angles)).astype(np.float32)
vy0_np = (0.8 * np.cos(angles)).astype(np.float32)

px0_wp = wp.array(px0_np, dtype=wp.float32, device=device)
py0_wp = wp.array(py0_np, dtype=wp.float32, device=device)
vx0_wp = wp.array(vx0_np, dtype=wp.float32, device=device)
vy0_wp = wp.array(vy0_np, dtype=wp.float32, device=device)

state_size = (steps + 1) * n_particles
px_wp = wp.empty(state_size, dtype=wp.float32, device=device)
py_wp = wp.empty(state_size, dtype=wp.float32, device=device)
vx_wp = wp.empty(state_size, dtype=wp.float32, device=device)
vy_wp = wp.empty(state_size, dtype=wp.float32, device=device)

wp.launch(
kernel=init_particles_kernel,
dim=n_particles,
inputs=[n_particles, px0_wp, py0_wp, vx0_wp, vy0_wp],
outputs=[px_wp, py_wp, vx_wp, vy_wp],
device=device,
)

wp.launch(
kernel=simulate_particles_kernel,
dim=steps * n_particles,
inputs=[n_particles, dt, gravity, damping, bounce, radius],
outputs=[px_wp, py_wp, vx_wp, vy_wp],
device=device,
)
wp.synchronize()

px_traj = px_wp.numpy().reshape(steps + 1, n_particles)
py_traj = py_wp.numpy().reshape(steps + 1, n_particles)

sample_ids = np.linspace(0, n_particles – 1, 16, dtype=int)
plt.figure(figsize=(8, 6))
for idx in sample_ids:
plt.plot(px_traj[:, idx], py_traj[:, idx], linewidth=1.5)
plt.axhline(radius, linestyle=”–“)
plt.xlim(-1.05, 1.05)
plt.ylim(0.0, 1.25)
plt.title(f”Warp particle trajectories on {device}”)
plt.xlabel(“x”)
plt.ylabel(“y”)
plt.show()

proj_steps = 180
proj_dt = np.float32(0.025)
proj_g = np.float32(-9.8)
target_x = np.float32(3.8)
target_y = np.float32(0.0)

vx_value = np.float32(2.0)
vy_value = np.float32(6.5)
lr = 0.08
iters = 60

loss_history = []
vx_history = []
vy_history = []

for it in range(iters):
init_vx_wp = wp.array(np.array([vx_value], dtype=np.float32), dtype=wp.float32, device=device, requires_grad=True)
init_vy_wp = wp.array(np.array([vy_value], dtype=np.float32), dtype=wp.float32, device=device, requires_grad=True)

x_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device, requires_grad=True)
y_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device, requires_grad=True)
vx_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device, requires_grad=True)
vy_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device, requires_grad=True)
loss_wp = wp.zeros(1, dtype=wp.float32, device=device, requires_grad=True)

tape = wp.Tape()
with tape:
wp.launch(
kernel=init_projectile_kernel,
dim=1,
inputs=[],
outputs=[x_hist_wp, y_hist_wp, vx_hist_wp, vy_hist_wp, init_vx_wp, init_vy_wp],
device=device,
)
wp.launch(
kernel=projectile_step_kernel,
dim=proj_steps,
inputs=[proj_dt, proj_g],
outputs=[x_hist_wp, y_hist_wp, vx_hist_wp, vy_hist_wp],
device=device,
)
wp.launch(
kernel=projectile_loss_kernel,
dim=1,
inputs=[proj_steps, target_x, target_y],
outputs=[x_hist_wp, y_hist_wp, loss_wp],
device=device,
)

tape.backward(loss=loss_wp)
wp.synchronize()

current_loss = float(loss_wp.numpy()[0])
grad_vx = float(init_vx_wp.grad.numpy()[0])
grad_vy = float(init_vy_wp.grad.numpy()[0])

vx_value = np.float32(vx_value – lr * grad_vx)
vy_value = np.float32(vy_value – lr * grad_vy)

loss_history.append(current_loss)
vx_history.append(float(vx_value))
vy_history.append(float(vy_value))

if it % 10 == 0 or it == iters – 1:
print(f”iter={it:02d} loss={current_loss:.6f} vx={vx_value:.4f} vy={vy_value:.4f} grad=({grad_vx:.4f}, {grad_vy:.4f})”)

final_init_vx_wp = wp.array(np.array([vx_value], dtype=np.float32), dtype=wp.float32, device=device)
final_init_vy_wp = wp.array(np.array([vy_value], dtype=np.float32), dtype=wp.float32, device=device)
x_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device)
y_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device)
vx_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device)
vy_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device)

wp.launch(
kernel=init_projectile_kernel,
dim=1,
inputs=[],
outputs=[x_hist_wp, y_hist_wp, vx_hist_wp, vy_hist_wp, final_init_vx_wp, final_init_vy_wp],
device=device,
)
wp.launch(
kernel=projectile_step_kernel,
dim=proj_steps,
inputs=[proj_dt, proj_g],
outputs=[x_hist_wp, y_hist_wp, vx_hist_wp, vy_hist_wp],
device=device,
)
wp.synchronize()

x_path = x_hist_wp.numpy()
y_path = y_hist_wp.numpy()

fig = plt.figure(figsize=(15, 4))

ax1 = fig.add_subplot(1, 3, 1)
ax1.plot(loss_history)
ax1.set_title(“Optimization loss”)
ax1.set_xlabel(“Iteration”)
ax1.set_ylabel(“Squared distance”)

ax2 = fig.add_subplot(1, 3, 2)
ax2.plot(vx_history, label=”vx”)
ax2.plot(vy_history, label=”vy”)
ax2.set_title(“Learned initial velocity”)
ax2.set_xlabel(“Iteration”)
ax2.legend()

ax3 = fig.add_subplot(1, 3, 3)
ax3.plot(x_path, y_path, linewidth=2)
ax3.scatter([target_x], [target_y], s=80, marker=”x”)
ax3.set_title(“Differentiable projectile trajectory”)
ax3.set_xlabel(“x”)
ax3.set_ylabel(“y”)
ax3.set_ylim(-0.1, max(1.0, float(np.max(y_path)) + 0.3))

plt.tight_layout()
plt.show()

final_dx = float(x_path[-1] – target_x)
final_dy = float(y_path[-1] – target_y)
final_dist = math.sqrt(final_dx * final_dx + final_dy * final_dy)
print(f”Final target miss distance: {final_dist:.6f}”)
print(f”Optimized initial velocity: vx={vx_value:.6f}, vy={vy_value:.6f}”)

Source link

How to Build High-Performance GPU-Accelerated Simulations and Differentiable Physics Workflows Using NVIDIA Warp Kernels

You may also like