Files
composable_kernel/include/ck_tile/host/reference/reference_gemm.hpp
Yi DING c28e65c6bf [CK_TILE] Add mxfp4 flatmm (#3080)
* Squashed commit of the following:

commit 3e1a851dad834776efbe4fe365ac82c4ed312010
Author: Ding, Yi <yi.ding@amd.com>
Date:   Thu Oct 23 06:10:54 2025 +0000

    Fix & clean after rebase

commit 1edf485092f44411da9a1796a4a6b72d5cdb67c6
Author: Ding, Yi <yi.ding@amd.com>
Date:   Wed Oct 22 10:46:13 2025 +0000

    Squashed commit of the following:

    commit 5276b28a51dac7b5d2106fbae8e78de190ee0de1
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Mon Sep 22 02:04:27 2025 -0500

        fix bandwidth calculation

    commit d645bb20c6d879154c30ecd82bbff4d2a9206750
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Mon Sep 22 00:58:59 2025 -0500

        updates

    commit 0fa7e6b88aaf81a36034aa7607746de295de4263
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Fri Sep 19 00:39:46 2025 -0500

        fix a bug, set the A DS_read preload size to 4 for MXFP4

    commit 50cafa824e2267f2b2f0dfeeb93e69a673630c61
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Thu Sep 18 01:19:03 2025 -0500

        fix a_wrap preload issue for large MPerBlock.

    commit e6333bbbc6ef540e24f92095040085f1ed59041e
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Wed Sep 17 21:34:03 2025 -0500

        optimized the VGPR repack issue for MXFP4

    commit e99e4932c401b9f6d1893dd5044c2827d6b3f145
    Author: Gino Lu <gino.lu@amd.com>
    Date:   Wed Sep 17 04:19:44 2025 -0500

        fix time error

    commit 4586ce6da7fba0514f2e01a8124c76b7d494e124
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Wed Sep 17 03:58:00 2025 -0500

        updated, function passed.

    commit c4f25e7579573db5681b9160f6bdb1349f3566f1
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Tue Sep 16 22:21:39 2025 -0500

        fix, function partially passed

    commit a51b56eb6b00b99a4e8d2802dbf5b5b5277b54d8
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Tue Sep 16 03:01:12 2025 -0500

        fix, reference function passed, next check kernel function

    commit 5b02643ebab18960e8f9ba66c6bd2f91774f9cae
    Author: Gino Lu <gino.lu@amd.com>
    Date:   Tue Sep 16 02:29:01 2025 -0500

        let pack/unpack return pk_fp4_t

    commit 76d37c5d4b17530e95c6fced31bff66a35d54b8f
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Mon Sep 15 20:50:26 2025 -0500

        fix

    commit e5be3e162b9a20e5355bd556d2b27afb6d8bf085
    Author: Gino Lu <gino.lu@amd.com>
    Date:   Mon Sep 15 05:51:06 2025 -0500

        fix bug

    commit 39a024efe4aa773df589712b1290803bb5ab5d1d
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Mon Sep 15 04:02:05 2025 -0500

        fix core dump issue, function is not correct.

    commit 16c49d268cfe065b5112b960b2d852b26552686a
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Mon Sep 15 03:03:02 2025 -0500

        updates, build pass

    commit fe7a961852dee6eff3be3cf1e0d0fabec5cd42ee
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Mon Sep 15 00:05:18 2025 -0500

        updates

    commit aaf9fe8022a72df59e04e4d5886dca3ba9c23400
    Author: Gino Lu <gino.lu@amd.com>
    Date:   Sun Sep 14 23:40:28 2025 -0500

        fix bug

    commit a3da89290e1553b85fbf1171c07e93ac0f5584db
    Author: Gino Lu <gino.lu@amd.com>
    Date:   Fri Sep 12 03:28:50 2025 -0500

        fix interface

    commit c5ff747e72d877461ba61dc19a0fe15527d3161e
    Author: Gino Lu <gino.lu@amd.com>
    Date:   Fri Sep 12 02:53:50 2025 -0500

        add interface in warp_gemm_impl

    commit 0a48d369e601cc798589fc59e0784bdbfc0a22f9
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Wed Sep 10 05:03:08 2025 -0500

        updates some fixes.

    commit aaa2beca30ff5546d171a2028d1894fd4e131d4e
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Tue Sep 9 04:37:42 2025 -0500

        fix after merge ginolu/add_wgmfma_dispatcher

    commit bf87449b09cba690922b2f3f78ba39bf1b1e472e
    Merge: 05ab58e3d 991d7fdbb
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Mon Sep 8 22:09:15 2025 -0500

        Merge remote-tracking branch 'origin/ginolu/add_wgmfma_dispatcher' into mtgu/cktile_mxfp4_flatmm_dev

    commit 05ab58e3de2b708aceda63d704089c0fa89437ae
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Mon Sep 8 21:42:47 2025 -0500

        update mx flatmm tail pipeline

    commit 991d7fdbb726d65091a91b5cc2800f798a6661fc
    Merge: ad046084a 41ee8fe31
    Author: Gino Lu <gino.lu@amd.com>
    Date:   Mon Sep 8 19:10:23 2025 -0500

        Merge branch 'develop' into ginolu/add_wgmfma_dispatcher

    commit ad046084a2f6e4ebf0cd8b47d0d72b74815061fa
    Author: Gino Lu <gino.lu@amd.com>
    Date:   Mon Sep 8 19:09:55 2025 -0500

        fix type error

    commit 42e16b43a035364a42789d7ce45a1e6a7d1d2609
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Mon Sep 8 04:01:40 2025 -0500

        update hotloop pipeline

    commit c2f69745346545087c8ce24acaba2961bb93ef0b
    Merge: adbeeb90b 91db4cec3
    Author: Gino Lu <gino.lu@amd.com>
    Date:   Fri Sep 5 04:22:26 2025 -0500

        Merge branch 'develop' into ginolu/add_wgmfma_dispatcher

    commit adbeeb90be1533f8aeb8c1d5aea6470d45a455a0
    Author: Gino Lu <gino.lu@amd.com>
    Date:   Fri Sep 5 04:21:26 2025 -0500

        fix clang format

    commit e2378ac393bb79ac80a8eef84677bffce86d9e0a
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Wed Sep 3 10:00:54 2025 -0500

        some updates

    commit bdc18a2269db49ff88e1ef1db30f83ea430d7544
    Merge: 6c5cea2b7 b3886a6d8
    Author: asleepzzz <hanwen.chang@amd.com>
    Date:   Wed Sep 3 13:22:03 2025 +0800

        Merge branch 'develop' into ginolu/add_wgmfma_dispatcher

    commit 6c5cea2b7a306f5d0ad346cb9baf6370ea2a73fe
    Author: Gino Lu <gino.lu@amd.com>
    Date:   Mon Sep 1 02:11:02 2025 -0500

        fix vec size error

    commit 76d1dfa352087dfd5867c8909b73726d3a1e853e
    Author: Gino Lu <gino.lu@amd.com>
    Date:   Mon Sep 1 01:23:39 2025 -0500

        fix format error

    commit a9061aaa1b4bfaa9db102c75b9d74863f39708a9
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Sat Aug 30 03:19:07 2025 -0500

        update codes

    commit 0caa184a271a8824ef40f87de456d0fa2500c8ad
    Author: mtgu0705 <mtgu@amd.com>
    Date:   Fri Aug 29 11:27:33 2025 -0500

        init ck_tile mxfp4 flatmm

    commit 5d46a6635f04bd69b76f7eda1438862e271b987a
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Thu Aug 28 08:02:50 2025 +0000

        Add bias for f16xf4 moe_flatmm

    commit dd112dc302d17f541737671a3ac557d7c09ff969
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Wed Aug 27 13:39:47 2025 +0000

        update case construction

    commit b1aca68a073d82c7b3c7bb53286e5f415999edc1
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Tue Aug 26 12:32:29 2025 +0000

        support swiglu activaion and use rcpf to accelerate silu

    commit 49235bd42349a84fc2ebd7ad0b100cc2545bb80a
    Author: Gino Lu <gino.lu@amd.com>
    Date:   Tue Aug 26 02:33:55 2025 -0500

        first commit

    commit c169e39d6381b932cf7098cc118db29df91da1cb
    Author: root <root@smci355-ccs-aus-m02-25.cs-aus.dcgpu>
    Date:   Fri Aug 22 04:01:59 2025 -0500

        add line to last

    commit 318f9bf317306454941bbf394c1940023edcf0ac
    Author: root <root@smci355-ccs-aus-m02-25.cs-aus.dcgpu>
    Date:   Fri Aug 22 03:20:46 2025 -0500

        adjust A_LDS descriptor to avoid bankconflict

    commit 9d066120ed068d6d102da25d619e170a28a04d18
    Author: root <root@smci355-ccs-aus-m02-25.cs-aus.dcgpu>
    Date:   Thu Aug 21 09:46:52 2025 -0500

        enable hotloop

    commit 61a895e6b821798970afffd0e9432a21e2f04df8
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Thu Aug 21 09:12:21 2025 +0000

        support atomic_pk_add_bf16 on gfx950

    commit 9f14864e45f21d8c1bc70a94988fb86c2c0017d8
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Thu Aug 21 06:58:55 2025 +0000

        use int64_t as expert stride to avoid overflow

    commit e63af46b32e1139a1e59dee6f46b9971047c4026
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Wed Aug 20 13:53:32 2025 +0000

        use v4i32 as the storage type for B to avoid repack operation

    commit 6cf0224dd8a229bf2be726ca861c736c9b5f5415
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Wed Aug 20 06:40:03 2025 +0000

        add pk_fp4_t and e8m0_t support for amd_buffer_load_impl

    commit 67a591f2240b0b035029edad904627f98b3839fd
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Wed Aug 20 04:39:14 2025 +0000

        optimize cvt_pkf4_to_f16 implementation

    commit 51c7126e77e9b17af694eaa57040e487f9d443e8
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Tue Aug 19 14:56:46 2025 +0000

        optimize A_LDS descriptor to avoid bankconflict

    commit c113160f326353290a2878d7b8febf7daed91d71
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Mon Aug 18 18:43:37 2025 +0000

        fix gate-up when GU_NRepeat > 1

    commit a45ca0e9934ca4bb9114f65621d5c9582d937a45
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Mon Aug 18 17:28:11 2025 +0000

        add fp16xf4 moe

    commit dc8c8e484804f7bca10c8f0764540af3b5884e83
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Sun Aug 17 17:51:18 2025 +0000

        rename example

    commit b177c967141cfdc401d3f36bf17830fe99893600
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Fri Aug 15 06:20:46 2025 +0000

        remove additional check when e8m0->float

    commit d467f9688c3d35f391e15089135edb1ad1d38b05
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Thu Aug 14 09:34:12 2025 +0000

        eliminate repeat dequant

    commit 1b20674b26ab3ce6bd2f710dd729fd4cc0f79428
    Merge: faa3c0278 7d02625e7
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Wed Aug 13 16:51:49 2025 +0000

        Merge remote-tracking branch 'origin/moe_flatmm' into feat-mixed_input_flatmm

    commit faa3c0278cf11b7105a4302dea3a4416520b2cc7
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Wed Aug 13 16:16:48 2025 +0000

        update f16xMXF4

    commit a2a2e1dab05501cc2136133236c01c08d51db4ea
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Wed Aug 13 10:48:53 2025 +0000

        update scale-preshuffle for MXF4

    commit eac9667feb899419dda1628164c092b969852660
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Mon Aug 11 11:24:34 2025 +0000

        update

    commit 7d02625e7678882af653f52c2a4ddaf64568a41c
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Mon Aug 11 08:38:23 2025 +0000

        optimize gemm2 atomic_add pattern

    commit d5f3c3e3ec72d0e6739467c4dc0b4e209f6d1192
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Mon Aug 11 07:59:47 2025 +0000

        update scale for mxfp4

    commit 15db198084614466bd4cfd4943fcb549cab2069a
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Mon Aug 11 07:56:14 2025 +0000

        update case construction

    commit 5dff349d82a5f70b6eea821d2622df51f90ef200
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Mon Aug 11 06:03:06 2025 +0000

        update granularity control

    commit d32cdc52144f65ec473f4ec8e45ea23968811184
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Mon Aug 11 03:42:46 2025 +0000

        fix TileConfig

    commit 26f38c5716304ee5f84e5c4f6f88144d9f3dddaf
    Author: Gino Lu <gino.lu@amd.com>
    Date:   Thu Aug 7 21:37:28 2025 +0800

        Add e8m0 scaled convert into CK_TILE (#2617)

        * first commit

        * remove redundent code

        * modify according to comments.

        * fix type_convert error with scaled_type_convert

    commit 419041478745f65dfec18859e75a13d975089519
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Fri Aug 8 20:19:16 2025 +0000

        add mixed_prec fp16xfp4

    commit 92e2a8b0308b9b107df9d2fd63a961efce706402
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Thu Aug 7 09:22:04 2025 +0000

        debug mixed_prec flatmm

    commit dea3ce80496ebcb00512979f0c3bb897f25e11a5
    Merge: fde443bc3 b4f45fe14
    Author: lalala-sh <Jiaxing.Wen@amd.com>
    Date:   Wed Aug 6 16:49:47 2025 +0800

        Merge pull request #2626 from ROCm/felix/flatmm_fix_splitk

        fix split k

    commit d480e8150358cc4ef8b05e25afe299141fad4fde
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Wed Aug 6 08:33:33 2025 +0000

        add moe_flatmm

    commit b4f45fe14d11569f34de40c8a205cd6760b61357
    Author: coderfeli <coderfeli@163.com>
    Date:   Wed Aug 6 02:45:31 2025 +0000

        fix split k

    commit fde443bc38fe60e52195817ecb2c7b20d772eedb
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Mon Aug 4 07:16:36 2025 +0000

        fix flatmm with scaling when WarpTileM == 32

    commit 5a0667afa889a5af8c6b8509232eabd50cf5efef
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Fri Aug 1 11:01:23 2025 +0000

        optimize scaling epilogue

    commit 5c3502bbf71833c6f6f7d4a1cc4f4fd93811f522
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Fri Aug 1 07:28:38 2025 +0000

        fix wrong config for fp8 scaling

    commit eb2d0653cdb86603cb11539cbac466b6431b58b7
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Wed Jul 30 06:20:30 2025 +0000

        prune debug message

    commit 0c089cb56343a39e02a1ee38e9cabeb71ba35e92
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Wed Jul 30 04:52:08 2025 +0000

        fix compile error

    commit 61759ca30ce3787f70e228c3919b3e4d354016dd
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Tue Jul 29 15:42:58 2025 +0000

        Add persistent option on flatmm for tuning

    commit b36dc5dd55f15fc1ce8eb21637bdec862e56a883
    Author: AMD-dteng <dteng@amd.com>
    Date:   Tue Jul 29 22:48:00 2025 +0800

        update pipeline v1: add atomic IGLP schedule

    commit f886f26994454fc2b4fc3433c86bf699767a2a7c
    Author: lalala-sh <Jiaxing.Wen@amd.com>
    Date:   Thu Jul 24 09:09:27 2025 +0000

        fix error log throwing

    commit 4b4686ab144daa9061fbda17f3df4c17600c8e9a
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Mon Jul 28 08:24:51 2025 +0000

        crz idea

    commit 7099af44a81be41431ba70ae60827b60116d02d2
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Sun Jul 27 11:57:38 2025 +0000

        Add permuteN optimzization when NRepeat % 2 == 0 on flatmm

    commit b147524c92e69a267337c8e48b6e64bcb1483551
    Author: sjfeng <j514681085@icloud.com>
    Date:   Sun Jul 27 17:24:08 2025 +0800

        try to remove c_shuffle_lds

    commit 2dd94f59d1a7740a5689e1713ed45588cd0d55dd
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Fri Jul 25 07:41:48 2025 +0000

        fix loop-dim mismatch and improve c_shuffle alu parallelism

    commit 4e93f0c5e27806adc070e4caa81661069295751c
    Merge: 3f12ef5aa 0eb7455f1
    Author: lalala-sh <Jiaxing.Wen@amd.com>
    Date:   Thu Jul 24 08:46:51 2025 +0000

        merge flatmm -scale

    commit 3f12ef5aa52ced1bff3bfb57b878358330e9e095
    Author: lalala-sh <Jiaxing.Wen@amd.com>
    Date:   Thu Jul 24 16:19:58 2025 +0800

        revert delete of inc file

    commit 08c3a0d184d7581dc5be364f5b36f16fb4a8d6fa
    Author: solin <bingzhou@amd.com>
    Date:   Thu Jul 24 04:38:16 2025 +0000

        reorg  flatmm code

    commit 0eb7455f106604d5254ed16b0daeda68e2a148e3
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Wed Jul 23 19:12:31 2025 +0000

        fix flatmm syntax error on gfx950

    commit 695ff87e68fdcbe28452c1805cd4dbb643c45495
    Author: Feng Shijie <Shijie.Feng@amd.com>
    Date:   Wed Jul 23 19:04:22 2025 +0000

        support flatmm scaling

    commit e3c29d9dea8758db96b998982ccc8bd1c4e8298d
    Author: valarLip <340077269@qq.com>
    Date:   Wed Jul 23 08:44:12 2025 +0000

        merge flatmm pipe v0 from dteng_flatmm_opt

    commit 425c366fa4c30426ff36cade89b39fd8cb7b9732
    Author: lalala-sh <Jiaxing.Wen@amd.com>
    Date:   Wed Jul 23 15:38:12 2025 +0800

        build pass

    commit 6b377a9481535696de40f175d7e2159263d21bdc
    Author: lalala-sh <Jiaxing.Wen@amd.com>
    Date:   Wed Jul 23 07:20:26 2025 +0000

         fix bug

    commit b6dc58d1ea676fe480c0243ae098c875498f6d6a
    Author: lalala-sh <Jiaxing.Wen@amd.com>
    Date:   Wed Jul 23 15:01:53 2025 +0800

        sync

    commit 904359f401866ee810484e6b8f5b46d79d9e25c8
    Author: valarLip <340077269@qq.com>
    Date:   Tue Jul 22 08:09:35 2025 +0000

        adaptive scheduler instead of Macro definition

    commit f29916c17228c17de9923aab62e7d72d7a30f4e9
    Author: lalala-sh <Jiaxing.Wen@amd.com>
    Date:   Thu Jul 17 08:40:35 2025 +0000

        fix tail handler bug

    commit e2c60a90929fec955d91db909d50db538d58363b
    Author: lalala-sh <Jiaxing.Wen@amd.com>
    Date:   Wed Jul 16 10:12:19 2025 +0000

        merge from dteng_flatmm_opt

---------

Co-authored-by: lalala-sh <Jiaxing.Wen@amd.com>
Co-authored-by: AMD-dteng <dteng@amd.com>
Co-authored-by: solin <bingzhou@amd.com>
Co-authored-by: sjfeng <j514681085@icloud.com>
Co-authored-by: valarLip <340077269@qq.com>
Co-authored-by: asleepzzz <hanwen.chang@amd.com>
Co-authored-by: Feng Shijie <Shijie.Feng@amd.com>
Co-authored-by: coderfeli <coderfeli@163.com>
Co-authored-by: Gino Lu <gino.lu@amd.com>
Co-authored-by: mtgu0705 <mtgu@amd.com>

* Fix crash on small M

* Apply suggestion from @Copilot

---------

Co-authored-by: lalala-sh <Jiaxing.Wen@amd.com>
Co-authored-by: AMD-dteng <dteng@amd.com>
Co-authored-by: solin <bingzhou@amd.com>
Co-authored-by: sjfeng <j514681085@icloud.com>
Co-authored-by: valarLip <340077269@qq.com>
Co-authored-by: asleepzzz <hanwen.chang@amd.com>
Co-authored-by: Feng Shijie <Shijie.Feng@amd.com>
Co-authored-by: coderfeli <coderfeli@163.com>
Co-authored-by: Gino Lu <gino.lu@amd.com>
Co-authored-by: mtgu0705 <mtgu@amd.com>

[ROCm/composable_kernel commit: e135dd518d]
2025-10-31 11:29:05 +08:00

839 lines
33 KiB
C++

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <thread>
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
template <typename ADataType,
typename QDataType,
typename BDataType,
typename AccDataType,
typename CDataType,
uint32_t QuantGroupSize,
bool aquant,
typename AElementOp = ck_tile::identity,
typename BElementOp = ck_tile::identity,
typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_gemm_quant(const HostTensor<ADataType>& a_m_k,
const HostTensor<QDataType>& q,
const HostTensor<BDataType>& b_k_n,
HostTensor<CDataType>& c_m_n,
const AElementOp& a_element_op = {},
const BElementOp& b_element_op = {},
const ACCElementOp& acc_element_op = {})
{
const std::size_t M = a_m_k.get_length(0);
const std::size_t N = b_k_n.get_length(1);
const std::size_t K = a_m_k.get_length(1);
auto f_mn = [&](auto m, auto n) {
AccDataType v_acc = 0, v_block_acc = 0;
static_assert(std::is_same_v<ADataType, pk_int4_t> || std::is_same_v<ADataType, fp8_t> ||
std::is_same_v<ADataType, bf8_t>);
static_assert(std::is_same_v<BDataType, fp8_t> || std::is_same_v<BDataType, bf8_t> ||
std::is_same_v<BDataType, pk_int4_t>);
static_assert(std::is_same_v<AccDataType, float>);
static_assert(std::is_same_v<CDataType, float> ||
std::is_same_v<CDataType, ck_tile::half_t>);
for(std::size_t k = 0; k < K; ++k)
{
AccDataType v_a;
AccDataType v_b;
if constexpr(std::is_same_v<ADataType, pk_int4_t>)
{
const pk_int4_t pk_val = a_element_op(a_m_k(m, k));
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
if(k % 2 == 1)
v_a = fp32_val.hi;
else
v_a = fp32_val.lo;
}
else
{
v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
}
if constexpr(std::is_same_v<BDataType, pk_int4_t>)
{
const pk_int4_t pk_val = b_element_op(b_k_n(k, n));
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
if(k % 2 == 1)
v_b = fp32_val.hi;
else
v_b = fp32_val.lo;
}
else if constexpr(std::is_same_v<BDataType, fp8_t>)
{
v_b = fp8_to_float_raw(b_element_op(b_k_n(k, n)));
}
else
{
v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
}
v_block_acc += v_a * v_b;
// Apply group dequant scale
if((k + 1) % QuantGroupSize == 0)
{
float scale = 0.f;
index_t outer_dim = (aquant) ? m : k / QuantGroupSize;
index_t inner_dim = (aquant) ? k / QuantGroupSize : n;
if constexpr(std::is_same_v<QDataType, float>)
{
scale = q(outer_dim, inner_dim);
}
else if constexpr(std::is_same_v<QDataType, ck_tile::fp8_t>)
{
scale = fp8_to_float_raw(q(outer_dim, inner_dim));
}
else if constexpr(std::is_same_v<QDataType, ck_tile::bf8_t>)
{
scale = bf8_to_float_raw(q(outer_dim, inner_dim));
}
else
{
static_assert(false, "Unexpected Q datatype.");
}
v_block_acc *= scale;
v_acc += v_block_acc;
v_block_acc = 0;
}
}
c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
};
make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
std::cout << std::endl;
}
template <typename ADataType,
typename AQDataType,
typename BDataType,
typename BQDataType,
typename AccDataType,
typename CDataType,
typename AElementOp = ck_tile::identity,
typename BElementOp = ck_tile::identity,
typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_gemm_rowcol_quant(const HostTensor<ADataType>& a_m_k,
const HostTensor<AQDataType>& aq_m_1,
const HostTensor<BDataType>& b_k_n,
const HostTensor<BQDataType>& bq_1_n,
HostTensor<CDataType>& c_m_n,
const AElementOp& a_element_op = {},
const BElementOp& b_element_op = {},
const ACCElementOp& acc_element_op = {})
{
static_assert(std::is_same_v<ADataType, fp8_t> || std::is_same_v<ADataType, bf8_t>);
static_assert(std::is_same_v<BDataType, fp8_t> || std::is_same_v<BDataType, bf8_t>);
static_assert(std::is_same_v<AccDataType, float>);
static_assert(std::is_same_v<CDataType, float> || std::is_same_v<CDataType, ck_tile::half_t>);
static_assert(std::is_same_v<AQDataType, float> && std::is_same_v<BQDataType, float>);
const std::size_t M = a_m_k.get_length(0);
const std::size_t N = b_k_n.get_length(1);
const std::size_t K = a_m_k.get_length(1);
auto f_mn = [&](auto m, auto n) {
// Init accumulator
AccDataType v_acc = 0;
// Get row scale for A and column scale for B
float a_scale = aq_m_1(m, 0);
float b_scale = bq_1_n(0, n);
// Compute the dot product
for(std::size_t k = 0; k < K; ++k)
{
AccDataType v_a;
AccDataType v_b;
// Process A data
if constexpr(std::is_same_v<ADataType, pk_int4_t>)
{
const pk_int4_t pk_val = a_element_op(a_m_k(m, k));
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t_signed_conversion(pk_val);
if(k % 2 == 1)
v_a = fp32_val.hi;
else
v_a = fp32_val.lo;
}
else
{
v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
}
// Process B data
if constexpr(std::is_same_v<BDataType, pk_int4_t>)
{
const pk_int4_t pk_val = b_element_op(b_k_n(k, n));
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t_signed_conversion(pk_val);
if(k % 2 == 1)
v_b = fp32_val.hi;
else
v_b = fp32_val.lo;
}
else
{
v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
}
v_acc += v_a * v_b;
}
v_acc = v_acc * a_scale * b_scale;
c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
};
make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
}
template <typename ADataType,
typename AQDataType,
typename BDataType,
typename BQDataType,
typename AccDataType,
typename CDataType,
typename AElementOp = ck_tile::identity,
typename BElementOp = ck_tile::identity,
typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_gemm_tensor_quant(const HostTensor<ADataType>& a_m_k,
const HostTensor<AQDataType>& aq_1_1,
const HostTensor<BDataType>& b_k_n,
const HostTensor<BQDataType>& bq_1_1,
HostTensor<CDataType>& c_m_n,
const AElementOp& a_element_op = {},
const BElementOp& b_element_op = {},
const ACCElementOp& acc_element_op = {})
{
static_assert(std::is_same_v<ADataType, fp8_t> || std::is_same_v<ADataType, bf8_t>);
static_assert(std::is_same_v<BDataType, fp8_t> || std::is_same_v<BDataType, bf8_t>);
static_assert(std::is_same_v<AccDataType, float>);
static_assert(std::is_same_v<CDataType, float> || std::is_same_v<CDataType, ck_tile::half_t>);
static_assert(std::is_same_v<AQDataType, float> && std::is_same_v<BQDataType, float>);
const std::size_t M = a_m_k.get_length(0);
const std::size_t N = b_k_n.get_length(1);
const std::size_t K = a_m_k.get_length(1);
auto f_mn = [&](auto m, auto n) {
// Init accumulator
AccDataType v_acc = 0;
// Get scale for A and scale for B
const AccDataType a_scale = ck_tile::type_convert<AccDataType>(aq_1_1(0, 0));
const AccDataType b_scale = ck_tile::type_convert<AccDataType>(bq_1_1(0, 0));
// Compute the dot product
for(std::size_t k = 0; k < K; ++k)
{
AccDataType v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
AccDataType v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
v_acc += v_a * v_b;
}
v_acc = v_acc * a_scale * b_scale;
c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
};
make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
}
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
typename AElementOp = ck_tile::identity,
typename BElementOp = ck_tile::identity,
typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k,
const HostTensor<BDataType>& b_k_n,
HostTensor<CDataType>& c_m_n,
const AElementOp& a_element_op = {},
const BElementOp& b_element_op = {},
const ACCElementOp& acc_element_op = {})
{
const std::size_t M = a_m_k.get_length(0);
const std::size_t N = b_k_n.get_length(1);
const std::size_t K = a_m_k.get_length(1);
auto f_mn = [&](auto m, auto n) {
AccDataType v_acc = 0;
for(std::size_t k = 0; k < K; ++k)
{
AccDataType v_a;
AccDataType v_b;
if constexpr(std::is_same_v<ADataType, pk_int4_t>)
{
const pk_int4_t pk_val = a_element_op(a_m_k(m, k));
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
if(k % 2 == 1)
v_a = fp32_val.hi;
else
v_a = fp32_val.lo;
}
else
{
v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
}
if constexpr(std::is_same_v<BDataType, pk_int4_t>)
{
const pk_int4_t pk_val = b_element_op(b_k_n(k, n));
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
if(k % 2 == 1)
v_b = fp32_val.hi;
else
v_b = fp32_val.lo;
}
else
{
v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
}
v_acc += v_a * v_b;
}
c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
};
make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
}
template <typename AsDataType,
typename BsDataType,
typename DsDataType,
typename AccDataType,
typename CDataType,
typename AElementOp,
typename BElementOp,
typename CDElementOp,
typename ADataType = remove_cvref_t<std::tuple_element_t<0, AsDataType>>,
typename BDataType = remove_cvref_t<std::tuple_element_t<0, BsDataType>>,
typename DDataType = remove_cvref_t<std::tuple_element_t<0, DsDataType>>>
CK_TILE_HOST void
reference_gemm_multiple_abd(const std::array<HostTensor<ADataType>, AsDataType::size()>& as_m_k,
const std::array<HostTensor<BDataType>, BsDataType::size()>& bs_k_n,
const std::array<HostTensor<DDataType>, DsDataType::size()>& ds_m_n,
HostTensor<ADataType>& a_m_k,
HostTensor<BDataType>& b_k_n,
HostTensor<CDataType>& c_m_n,
const AElementOp& a_element_op = {},
const BElementOp& b_element_op = {},
const CDElementOp& acc_element_op = {})
{
const std::size_t M = a_m_k.get_length(0);
const std::size_t N = b_k_n.get_length(1);
const std::size_t K = a_m_k.get_length(1);
auto as_m_k_tuple =
generate_tie([&](auto idx) -> auto& { return as_m_k[idx]; }, number<AsDataType::size()>{});
auto bs_k_n_tuple =
generate_tie([&](auto idx) -> auto& { return bs_k_n[idx]; }, number<BsDataType::size()>{});
auto ds_m_n_tuple =
generate_tie([&](auto idx) -> auto& { return ds_m_n[idx]; }, number<DsDataType::size()>{});
// Apply elementwise function to A
auto a_elementwise_fn = [&](auto i, auto j) {
ck_tile::apply([&](auto&&... t) { a_element_op(a_m_k(i, j), t(i, j)...); }, as_m_k_tuple);
};
make_ParallelTensorFunctor(a_elementwise_fn, M, K)(std::thread::hardware_concurrency());
// Apply elementwise function to B
auto b_elementwise_fn = [&](auto i, auto j) {
ck_tile::apply([&](auto&&... t) { b_element_op(b_k_n(i, j), t(i, j)...); }, bs_k_n_tuple);
};
make_ParallelTensorFunctor(b_elementwise_fn, K, N)(std::thread::hardware_concurrency());
auto f_mk_kn_mn = [&](auto m, auto n) {
AccDataType v_acc = 0;
for(std::size_t k = 0; k < K; ++k)
{
ADataType v_a = a_m_k(m, k);
BDataType v_b = b_k_n(k, n);
v_acc +=
ck_tile::type_convert<AccDataType>(v_a) * ck_tile::type_convert<AccDataType>(v_b);
}
CDataType v_c = 0;
ck_tile::apply(
[&](auto&&... t) {
acc_element_op(v_c,
ck_tile::type_convert<float>(v_acc),
ck_tile::type_convert<float>(t(m, n))...);
},
ds_m_n_tuple);
c_m_n(m, n) = ck_tile::type_convert<CDataType>(v_c);
};
make_ParallelTensorFunctor(f_mk_kn_mn, M, N)(std::thread::hardware_concurrency());
}
template <typename ADataType,
typename BDataType,
typename ScaleDataType,
typename AccDataType,
typename CDataType,
typename AElementOp = ck_tile::identity,
typename BElementOp = ck_tile::identity,
typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_mx_gemm(const HostTensor<ADataType>& a_m_k,
const HostTensor<BDataType>& b_k_n,
HostTensor<CDataType>& c_m_n,
const HostTensor<ScaleDataType>& scale_a,
const HostTensor<ScaleDataType>& scale_b,
const AElementOp& = {},
const BElementOp& = {},
const ACCElementOp& = {})
{
static_assert(std::is_same_v<AElementOp, ck_tile::identity>);
static_assert(std::is_same_v<BElementOp, ck_tile::identity>);
static_assert(std::is_same_v<ACCElementOp, ck_tile::identity>);
const std::size_t M = a_m_k.get_length(0);
const std::size_t N = b_k_n.get_length(1);
const std::size_t K = a_m_k.get_length(1);
const std::size_t ScaleBlockSize = K / scale_a.get_length(1);
HostTensor<AccDataType> a_m_k_scaled({std::size_t(M), std::size_t(K)},
{std::size_t(K), std::size_t(1)});
HostTensor<AccDataType> b_k_n_scaled({std::size_t(K), std::size_t(N)},
{std::size_t(1), std::size_t(K)});
for(std::size_t m = 0; m < M; ++m)
{
for(std::size_t k = 0; k < K; ++k)
{
if constexpr(std::is_same_v<ADataType, pk_fp4_t>)
{
if(k % 2 == 1)
continue; // skip odd k
auto a_f4x2 = a_m_k(m, k);
auto a_scale = ck_tile::type_convert<AccDataType>(scale_a(m, k / ScaleBlockSize));
auto a_f4_lo =
ck_tile::type_convert<AccDataType>(a_f4x2.template unpack<>(number<0>{}));
auto a_f4_hi =
ck_tile::type_convert<AccDataType>(a_f4x2.template unpack<>(number<1>{}));
a_m_k_scaled(m, k) = a_f4_lo * a_scale;
a_m_k_scaled(m, k + 1) = a_f4_hi * a_scale;
}
}
}
for(std::size_t n = 0; n < N; n++)
{
for(std::size_t k = 0; k < K; k++)
{
if constexpr(std::is_same_v<BDataType, pk_fp4_t>)
{
if(k % 2 == 1)
continue; // skip odd k
auto b_f4x2 = b_k_n(k, n);
auto b_scale = ck_tile::type_convert<AccDataType>(scale_b(k / ScaleBlockSize, n));
auto b_f4_lo =
ck_tile::type_convert<AccDataType>(b_f4x2.template unpack<>(number<0>{}));
auto b_f4_hi =
ck_tile::type_convert<AccDataType>(b_f4x2.template unpack<>(number<1>{}));
b_k_n_scaled(k, n) = b_f4_lo * b_scale;
b_k_n_scaled(k + 1, n) = b_f4_hi * b_scale;
}
else
{
b_k_n_scaled(k, n) =
ck_tile::type_convert<AccDataType>((b_k_n(k, n))) *
ck_tile::type_convert<AccDataType>(scale_b(k / ScaleBlockSize, n));
}
}
}
// call reference gemm
reference_gemm<AccDataType, AccDataType, AccDataType, CDataType>(
a_m_k_scaled, b_k_n_scaled, c_m_n);
}
template <typename ADataType,
typename BDataType,
typename DsDataType,
typename AccDataType,
typename CDataType,
typename ACCElementOp,
typename DDataType = remove_cvref_t<std::tuple_element_t<0, DsDataType>>>
CK_TILE_HOST void
reference_gemm_multiple_d(const HostTensor<ADataType>& a_m_k,
const HostTensor<BDataType>& b_k_n,
const std::array<HostTensor<DDataType>, DsDataType::size()>& ds_m_n,
HostTensor<CDataType>& c_m_n,
const ACCElementOp& acc_element_op = {})
{
const std::size_t M = a_m_k.get_length(0);
const std::size_t N = b_k_n.get_length(1);
const std::size_t K = a_m_k.get_length(1);
auto f_mk_kn_mn = [&](auto m, auto n) {
AccDataType v_acc = 0;
for(std::size_t k = 0; k < K; ++k)
{
ADataType v_a = a_m_k(m, k);
BDataType v_b = b_k_n(k, n);
v_acc +=
ck_tile::type_convert<AccDataType>(v_a) * ck_tile::type_convert<AccDataType>(v_b);
}
CDataType v_c = 0;
if constexpr(DsDataType::size() == 0)
{
acc_element_op(v_c, ck_tile::type_convert<float>(v_acc));
}
else if constexpr(DsDataType::size() == 1)
{
acc_element_op(v_c,
ck_tile::type_convert<float>(v_acc),
ck_tile::type_convert<float>(ds_m_n[0](m, n)));
}
else if constexpr(DsDataType::size() == 2)
{
acc_element_op(v_c,
ck_tile::type_convert<float>(v_acc),
ck_tile::type_convert<float>(ds_m_n[0](m, n)),
ck_tile::type_convert<float>(ds_m_n[1](m, n)));
}
c_m_n(m, n) = ck_tile::type_convert<CDataType>(v_c);
};
make_ParallelTensorFunctor(f_mk_kn_mn, M, N)(std::thread::hardware_concurrency());
}
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
typename LayoutA,
typename LayoutB,
typename LayoutC>
__global__ void naive_gemm_kernel(ADataType* A,
BDataType* B,
CDataType* C,
ck_tile::index_t M,
ck_tile::index_t N,
ck_tile::index_t K,
ck_tile::index_t strideA,
ck_tile::index_t strideB,
ck_tile::index_t strideC)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int row = idx / N; // Compute row index
int col = idx % N; // Compute column index
if(row < M && col < N)
{
AccDataType acc = 0.0;
for(int k = 0; k < K; ++k)
{
constexpr index_t packed_size_a = ck_tile::numeric_traits<ADataType>::PackedSize;
constexpr index_t packed_size_b = ck_tile::numeric_traits<BDataType>::PackedSize;
// Adjust indexing based on matrix layout
int a_index = (std::is_same_v<LayoutA, tensor_layout::gemm::RowMajor>)
? row * strideA + k
: k * strideA + row;
int b_index = (std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
? col * strideB + k
: k * strideB + col;
AccDataType v_a;
AccDataType v_b;
if constexpr(std::is_same_v<ADataType, pk_int4_t>)
{
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(A[a_index / packed_size_a]);
if(k % 2 == 1)
v_a = fp32_val.hi;
else
v_a = fp32_val.lo;
}
else if constexpr(std::is_same_v<ADataType, pk_fp4_t>)
{
const fp32x2_t fp32_val = pk_fp4_to_fp32x2(A[a_index / packed_size_a]);
if(k % 2 == 1)
v_a = fp32_val.hi;
else
v_a = fp32_val.lo;
}
else
{
v_a = ck_tile::type_convert<AccDataType>(A[a_index]);
}
if constexpr(std::is_same_v<BDataType, pk_int4_t>)
{
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(B[b_index / packed_size_b]);
if(k % 2 == 1)
v_b = fp32_val.hi;
else
v_b = fp32_val.lo;
}
else if constexpr(std::is_same_v<BDataType, pk_fp4_t>)
{
const fp32x2_t fp32_val = pk_fp4_to_fp32x2(B[b_index / packed_size_b]);
if(k % 2 == 1)
v_b = fp32_val.hi;
else
v_b = fp32_val.lo;
}
else
{
v_b = ck_tile::type_convert<AccDataType>(B[b_index]);
}
acc += v_a * v_b;
}
int c_index = (std::is_same_v<LayoutC, tensor_layout::gemm::RowMajor>)
? row * strideC + col
: col * strideC + row;
C[c_index] = ck_tile::type_convert<CDataType>(acc);
}
}
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
typename LayoutA,
typename LayoutB,
typename LayoutC>
__global__ void blockwise_gemm_kernel(ADataType* A,
BDataType* B,
CDataType* C,
ck_tile::index_t M,
ck_tile::index_t N,
ck_tile::index_t K,
ck_tile::index_t strideA,
ck_tile::index_t strideB,
ck_tile::index_t strideC,
ck_tile::index_t scale_granularity_m,
ck_tile::index_t scale_granularity_n,
ck_tile::index_t scale_granularity_k,
float* scale_A_ptr,
float* scale_B_ptr)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int row = idx / N; // Compute row index
int col = idx % N; // Compute column index
if(row < M && col < N)
{
AccDataType acc = 0.0, acc_temp = 0.0;
index_t scale_A_stride = (M + scale_granularity_m - 1) / scale_granularity_m;
index_t scale_B_stride = (N + scale_granularity_n - 1) / scale_granularity_n;
float scale_A = 0;
float scale_B = 0;
for(int k = 0; k < K; ++k)
{
if(k % scale_granularity_k == 0)
{
// update acc
acc += acc_temp * scale_A * scale_B;
acc_temp = 0.0;
// update scale factors
scale_A = scale_A_ptr[(row / scale_granularity_m) +
(k / scale_granularity_k) * scale_A_stride];
scale_B = scale_B_ptr[(col / scale_granularity_n) +
(k / scale_granularity_k) * scale_B_stride];
}
constexpr index_t packed_size_a = ck_tile::numeric_traits<ADataType>::PackedSize;
constexpr index_t packed_size_b = ck_tile::numeric_traits<BDataType>::PackedSize;
// Adjust indexing based on matrix layout
int a_index = (std::is_same_v<LayoutA, tensor_layout::gemm::RowMajor>)
? row * strideA + k
: k * strideA + row;
int b_index = (std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
? col * strideB + k
: k * strideB + col;
AccDataType v_a;
AccDataType v_b;
if constexpr(std::is_same_v<ADataType, pk_int4_t>)
{
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(A[a_index / packed_size_a]);
if(k % 2 == 1)
v_a = fp32_val.hi;
else
v_a = fp32_val.lo;
}
else if constexpr(std::is_same_v<ADataType, pk_fp4_t>)
{
const fp32x2_t fp32_val = pk_fp4_to_fp32x2(A[a_index / packed_size_a]);
if(k % 2 == 1)
v_a = fp32_val.hi;
else
v_a = fp32_val.lo;
}
else
{
v_a = ck_tile::type_convert<AccDataType>(A[a_index]);
}
if constexpr(std::is_same_v<BDataType, pk_int4_t>)
{
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(B[b_index / packed_size_b]);
if(k % 2 == 1)
v_b = fp32_val.hi;
else
v_b = fp32_val.lo;
}
else if constexpr(std::is_same_v<BDataType, pk_fp4_t>)
{
const fp32x2_t fp32_val = pk_fp4_to_fp32x2(B[b_index / packed_size_b], 1.0f);
if(k % 2 == 1)
v_b = fp32_val.hi;
else
v_b = fp32_val.lo;
}
else
{
v_b = ck_tile::type_convert<AccDataType>(B[b_index]);
}
acc_temp += v_a * v_b;
}
// final accumulation
acc += acc_temp * scale_A * scale_B;
int c_index = (std::is_same_v<LayoutC, tensor_layout::gemm::RowMajor>)
? row * strideC + col
: col * strideC + row;
C[c_index] = ck_tile::type_convert<CDataType>(acc);
}
}
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
typename LayoutA,
typename LayoutB,
typename LayoutC>
void reference_gemm_gpu(ADataType* a_ptr,
BDataType* b_ptr,
CDataType* c_ptr,
index_t M,
index_t N,
index_t K,
index_t stride_a,
index_t stride_b,
index_t stride_c)
{
int totalElements = M * N;
int numThreadsPerBlock = 256; // Common choice for threads per block
int numBlocks = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;
naive_gemm_kernel<ADataType, BDataType, AccDataType, CDataType, LayoutA, LayoutB, LayoutC>
<<<numBlocks, numThreadsPerBlock>>>(
a_ptr, b_ptr, c_ptr, M, N, K, stride_a, stride_b, stride_c);
return;
}
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
typename LayoutA,
typename LayoutB,
typename LayoutC>
void reference_blockwise_gemm_gpu(ADataType* a_ptr,
BDataType* b_ptr,
CDataType* c_ptr,
index_t M,
index_t N,
index_t K,
index_t stride_a,
index_t stride_b,
index_t stride_c,
index_t scale_granularity_m,
index_t scale_granularity_n,
index_t scale_granularity_k,
float* scale_A_ptr,
float* scale_B_ptr)
{
int totalElements = M * N;
int numThreadsPerBlock = 256; // Common choice for threads per block
int numBlocks = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;
blockwise_gemm_kernel<ADataType, BDataType, AccDataType, CDataType, LayoutA, LayoutB, LayoutC>
<<<numBlocks, numThreadsPerBlock>>>(a_ptr,
b_ptr,
c_ptr,
M,
N,
K,
stride_a,
stride_b,
stride_c,
scale_granularity_m,
scale_granularity_n,
scale_granularity_k,
scale_A_ptr,
scale_B_ptr);
return;
}
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
typename LayoutA,
typename LayoutB,
typename LayoutC>
void reference_batched_gemm_gpu(ADataType* a_ptr,
BDataType* b_ptr,
CDataType* c_ptr,
index_t M,
index_t N,
index_t K,
index_t stride_a,
index_t stride_b,
index_t stride_c,
index_t batch_stride_A,
index_t batch_stride_B,
index_t batch_stride_C,
index_t batch_count)
{
int totalElements = M * N;
int numThreadsPerBlock = 256; // Common choice for threads per block
int numBlocks = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;
for(index_t batch_id = 0; batch_id < batch_count; ++batch_id)
{
ADataType* d_ATemp = a_ptr + batch_id * batch_stride_A;
BDataType* d_BTemp = b_ptr + batch_id * batch_stride_B;
CDataType* d_CTemp = c_ptr + batch_id * batch_stride_C;
naive_gemm_kernel<ADataType, BDataType, AccDataType, CDataType, LayoutA, LayoutB, LayoutC>
<<<numBlocks, numThreadsPerBlock>>>(
d_ATemp, d_BTemp, d_CTemp, M, N, K, stride_a, stride_b, stride_c);
}
return;
}
} // namespace ck_tile