<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<!--
  Sitemap (sitemaps.org 0.9 protocol) for https://niyunsheng.github.io/.
  Entries with a lastmod are listed newest first; the trailing entries
  (archives, explore, guestbook, search) carry no lastmod.
  Each loc value is kept on a single line: whitespace inside loc is part
  of the element text, so a URL must never be wrapped across lines.
  NOTE(review): this file looks machine-generated. If so, apply formatting
  changes in the generator or its template rather than by hand. Confirm.
-->
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:xhtml="http://www.w3.org/1999/xhtml">
  <url>
    <loc>https://niyunsheng.github.io/tech/</loc>
    <lastmod>2026-03-22T08:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/bank-conflict/</loc>
    <lastmod>2026-03-22T08:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/categories/</loc>
    <lastmod>2026-03-22T08:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/cuda/</loc>
    <lastmod>2026-03-22T08:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/gemm/</loc>
    <lastmod>2026-03-22T08:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/categories/kernels/</loc>
    <lastmod>2026-03-22T08:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/progressive-cuda-gemm-optimization-from-memory-bound-to-swizzling/</loc>
    <lastmod>2026-03-22T08:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/shared-memory/</loc>
    <lastmod>2026-03-22T08:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/swizzling/</loc>
    <lastmod>2026-03-22T08:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/</loc>
    <lastmod>2026-03-22T08:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/</loc>
    <lastmod>2026-03-22T08:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/cp/</loc>
    <lastmod>2026-03-21T15:41:55+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/categories/distributed-system/</loc>
    <lastmod>2026-03-21T15:41:55+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/dp/</loc>
    <lastmod>2026-03-21T15:41:55+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/llm/</loc>
    <lastmod>2026-03-21T15:41:55+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/loss-reduction-in-distributed-training/</loc>
    <lastmod>2026-03-21T15:41:55+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/video-generation/</loc>
    <lastmod>2026-03-21T15:41:55+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/computing-global-gradient-norm-in-distributed-training-tp-dp_shard-dp_replicate-ep-and-pp/</loc>
    <lastmod>2026-03-21T07:02:02+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/ep/</loc>
    <lastmod>2026-03-21T07:02:02+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/fsdp/</loc>
    <lastmod>2026-03-21T07:02:02+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/gradient-clipping/</loc>
    <lastmod>2026-03-21T07:02:02+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/hsdp/</loc>
    <lastmod>2026-03-21T07:02:02+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/pp/</loc>
    <lastmod>2026-03-21T07:02:02+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/tp/</loc>
    <lastmod>2026-03-21T07:02:02+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/demystifying-flashattention-forward-backward-and-triton-implementation/</loc>
    <lastmod>2026-03-15T22:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/flash-attention/</loc>
    <lastmod>2026-03-15T22:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/triton/</loc>
    <lastmod>2026-03-15T22:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/dit/</loc>
    <lastmod>2026-02-25T22:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/flow-matching/</loc>
    <lastmod>2026-02-25T22:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/rope/</loc>
    <lastmod>2026-02-25T22:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/series/</loc>
    <lastmod>2026-02-25T22:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/the-devil-in-the-details-engineering-tricks-for-sota-video-models/</loc>
    <lastmod>2026-02-25T22:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/training-stability/</loc>
    <lastmod>2026-02-25T22:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/categories/video-generation/</loc>
    <lastmod>2026-02-25T22:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/series/video-generation-theory/</loc>
    <lastmod>2026-02-25T22:00:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/deep-dive-into-triton-gemm-optimization-from-naive-tiling-to-hopper-tma/</loc>
    <lastmod>2026-02-11T00:39:00+08:00</lastmod>
  </url>
  <!-- NOTE(review): "swizzing" sits alongside the "swizzling" tag listed
       earlier; possibly a misspelled tag in the page source. Confirm and
       fix at the source, since the URL below may be live. -->
  <url>
    <loc>https://niyunsheng.github.io/tags/swizzing/</loc>
    <lastmod>2026-02-11T00:39:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/tiling/</loc>
    <lastmod>2026-02-11T00:39:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/tma/</loc>
    <lastmod>2026-02-11T00:39:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/flops/</loc>
    <lastmod>2026-02-09T23:20:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/h200/</loc>
    <lastmod>2026-02-09T23:20:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/mbu/</loc>
    <lastmod>2026-02-09T23:20:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/mfu/</loc>
    <lastmod>2026-02-09T23:20:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/recomputation/</loc>
    <lastmod>2026-02-09T23:20:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/roofline/</loc>
    <lastmod>2026-02-09T23:20:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/roofline-analysis-of-llms-on-h200-performance-modeling-and-recomputation-strategies/</loc>
    <lastmod>2026-02-09T23:20:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/categories/system-optimization/</loc>
    <lastmod>2026-02-09T23:20:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/classifier-free-guidance/</loc>
    <lastmod>2026-02-03T23:30:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/ddpm/</loc>
    <lastmod>2026-02-03T23:30:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/from-ddpm-to-flow-matching-the-evolution-of-generative-trajectories/</loc>
    <lastmod>2026-02-03T23:30:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/from-dit-to-hunyuan-the-evolution-of-adaln-zero-in-generative-models/</loc>
    <lastmod>2026-02-02T23:30:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/beyond-theoretical-flops-analyzing-mfu-hfu-and-attention-overhead-in-transformers/</loc>
    <lastmod>2026-01-31T09:30:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/hfu/</loc>
    <lastmod>2026-01-31T09:30:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/series/3d-sparse-attention/</loc>
    <lastmod>2026-01-26T23:30:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/sparse-attention/</loc>
    <lastmod>2026-01-26T23:30:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/visualization/</loc>
    <lastmod>2026-01-26T23:30:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/visualizing-3d-attention-bridging-the-gap-between-1d-sequences-and-3d-space/</loc>
    <lastmod>2026-01-26T23:30:00+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/a100/</loc>
    <lastmod>2026-01-25T23:21:35+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/b300/</loc>
    <lastmod>2026-01-25T23:21:35+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/gpu-network-constants/</loc>
    <lastmod>2026-01-25T23:21:35+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/categories/hardware/</loc>
    <lastmod>2026-01-25T23:21:35+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/infiniband/</loc>
    <lastmod>2026-01-25T23:21:35+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/tags/nvidia/</loc>
    <lastmod>2026-01-25T23:21:35+08:00</lastmod>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/archives/</loc>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/explore/</loc>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/guestbook/</loc>
  </url>
  <url>
    <loc>https://niyunsheng.github.io/search/</loc>
  </url>
</urlset>