About Social Code
aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.clang-format-include1
-rw-r--r--docs/teflon.rst13
-rw-r--r--include/drm-uapi/rknpu_ioctl.h314
-rw-r--r--include/drm-uapi/rocket_accel.h142
-rw-r--r--meson.build3
-rw-r--r--meson.options2
-rw-r--r--src/gallium/drivers/rocket/ci/rocket-rk3588-fails.txt126
-rw-r--r--src/gallium/drivers/rocket/ci/rocket-rk3588-flakes.txt5
-rw-r--r--src/gallium/drivers/rocket/ci/rocket-rk3588-skips.txt29
-rw-r--r--src/gallium/drivers/rocket/decode.py75
-rw-r--r--src/gallium/drivers/rocket/extract_registers.py121
-rw-r--r--src/gallium/drivers/rocket/gen_header.py137
-rw-r--r--src/gallium/drivers/rocket/gen_parser.py737
-rw-r--r--src/gallium/drivers/rocket/intercept.c371
-rw-r--r--src/gallium/drivers/rocket/meson.build38
-rw-r--r--src/gallium/drivers/rocket/registers.xml1179
-rw-r--r--src/gallium/drivers/rocket/rkt_coefs.c227
-rw-r--r--src/gallium/drivers/rocket/rkt_coefs.h20
-rw-r--r--src/gallium/drivers/rocket/rkt_device.c232
-rw-r--r--src/gallium/drivers/rocket/rkt_device.h75
-rw-r--r--src/gallium/drivers/rocket/rkt_ml.c631
-rw-r--r--src/gallium/drivers/rocket/rkt_ml.h151
-rw-r--r--src/gallium/drivers/rocket/rkt_regcmd.c544
-rw-r--r--src/gallium/drivers/rocket/rkt_regcmd.h15
-rw-r--r--src/gallium/drivers/rocket/rkt_task.c352
-rw-r--r--src/gallium/drivers/rocket/rkt_task.h14
-rw-r--r--src/gallium/drivers/rocket/rules-ng.xsd457
-rw-r--r--src/gallium/meson.build6
-rw-r--r--src/gallium/targets/dri/meson.build2
-rw-r--r--src/gallium/targets/dril/meson.build3
-rw-r--r--src/gallium/winsys/rocket/drm/meson.build13
-rw-r--r--src/gallium/winsys/rocket/drm/rkt_drm_public.h17
-rw-r--r--src/gallium/winsys/rocket/drm/rkt_drm_winsys.c19
33 files changed, 6064 insertions, 7 deletions
diff --git a/.clang-format-include b/.clang-format-include
index d40db0e1d0b..9f2d1dd1977 100644
--- a/.clang-format-include
+++ b/.clang-format-include
@@ -3,6 +3,7 @@
src/gallium/drivers/i915
src/gallium/drivers/r300/compiler/*
+src/gallium/drivers/rocket/**/*
src/gallium/targets/teflon/**/*
src/gallium/frontends/teflon/**/*
src/amd/vulkan/**/*
diff --git a/docs/teflon.rst b/docs/teflon.rst
index 3935207ce39..ffed65cf759 100644
--- a/docs/teflon.rst
+++ b/docs/teflon.rst
@@ -15,6 +15,9 @@ Mesa contains a TensorFlow Lite delegate that can make use of NPUs to accelerate
* - Etnaviv
- ``VeriSilicon VIPNano-SI+.8002``
- ``NXP iMX8M Plus on Toradex Verdin SoM``
+ * - Rocket
+ - ``RK3588 NPU``
+ - ``PINE64 QuartzPro64``
.. list-table:: Tested models
:header-rows: 1
@@ -25,29 +28,33 @@ Mesa contains a TensorFlow Lite delegate that can make use of NPUs to accelerate
- Status
- Inference speed on AML-A311D-CC Alta
- Inference speed on Verdin iMX8M Plus
+ - Inference speed on QuartzPro64
* - MobileNet V1
- UINT8
- http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz
- Fully supported
- ~6.6 ms
- ~7.9 ms
+ - ~18 ms
* - MobileNet V2
- UINT8
- https://storage.googleapis.com/mobilenet_v2/checkpoints/quantized_v2_224_100.tgz
- Fully supported
- ~6.9 ms
- ~8.0 ms
+ - ~21 ms
* - SSDLite MobileDet
- UINT8
- https://raw.githubusercontent.com/google-coral/test_data/master/ssdlite_mobiledet_coco_qat_postprocess.tflite
- Fully supported
- ~24.8 ms
- ~24.4 ms
+ - ~48 ms
Build
-----
-Build Mesa as usual, with the -Dteflon=true argument.
+Build Mesa as usual, with the -Dteflon=true argument. Make sure that at least one of the etnaviv or rocket Gallium drivers is enabled, as Teflon only works with these drivers.
Example instructions:
@@ -62,7 +69,7 @@ Example instructions:
# Build Mesa
~ $ cd mesa
- mesa $ meson setup build -Dgallium-drivers=etnaviv -Dvulkan-drivers= -Dteflon=true
+ mesa $ meson setup build -Dgallium-drivers=etnaviv,rocket -Dvulkan-drivers= -Dteflon=true
mesa $ meson compile -C build
Install runtime dependencies
@@ -99,7 +106,7 @@ This example script has been based from the code in https://github.com/tensorflo
~ $ cd mesa/
mesa $ TEFLON_DEBUG=verbose ETNA_MESA_DEBUG=ml_dbgs python3.10 src/gallium/frontends/teflon/tests/classification.py \
-i ~/tensorflow/assets/grace_hopper.bmp \
- -m src/gallium/targets/teflon/tests/mobilenet_v1_1.0_224_quant.tflite \
+ -m src/gallium/targets/teflon/tests/models/mobilenetv1/mobilenet_v1_1_224_quant.tflite \
-l src/gallium/frontends/teflon/tests/labels_mobilenet_quant_v1_224.txt \
-e build/src/gallium/targets/teflon/libteflon.so
diff --git a/include/drm-uapi/rknpu_ioctl.h b/include/drm-uapi/rknpu_ioctl.h
new file mode 100644
index 00000000000..54d79636b16
--- /dev/null
+++ b/include/drm-uapi/rknpu_ioctl.h
@@ -0,0 +1,314 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) Fuzhou Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#ifndef __LINUX_RKNPU_IOCTL_H
+#define __LINUX_RKNPU_IOCTL_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#if !defined(__KERNEL__)
+#define __user
+#endif
+
+#ifndef __packed
+#define __packed __attribute__((packed))
+#endif
+
+#define RKNPU_OFFSET_VERSION 0x0
+#define RKNPU_OFFSET_PC_OP_EN 0x8
+#define RKNPU_OFFSET_PC_DATA_ADDR 0x10
+#define RKNPU_OFFSET_PC_DATA_AMOUNT 0x14
+#define RKNPU_OFFSET_PC_TASK_CONTROL 0x30
+#define RKNPU_OFFSET_PC_DMA_BASE_ADDR 0x34
+#define RKNPU_OFFSET_PC_TASK_STATUS 0x3c
+
+#define RKNPU_OFFSET_INT_MASK 0x20
+#define RKNPU_OFFSET_INT_CLEAR 0x24
+#define RKNPU_OFFSET_INT_STATUS 0x28
+#define RKNPU_OFFSET_INT_RAW_STATUS 0x2c
+
+#define RKNPU_OFFSET_CLR_ALL_RW_AMOUNT 0x8010
+#define RKNPU_OFFSET_DT_WR_AMOUNT 0x8034
+#define RKNPU_OFFSET_DT_RD_AMOUNT 0x8038
+#define RKNPU_OFFSET_WT_RD_AMOUNT 0x803c
+
+#define RKNPU_OFFSET_ENABLE_MASK 0xf008
+
+#define RKNPU_INT_CLEAR 0x1ffff
+
+#define RKNPU_PC_DATA_EXTRA_AMOUNT 4
+
+#define RKNPU_STR_HELPER(x) #x
+
+#define RKNPU_GET_DRV_VERSION_STRING(MAJOR, MINOR, PATCHLEVEL) \
+ RKNPU_STR_HELPER(MAJOR) \
+ "." RKNPU_STR_HELPER(MINOR) "." RKNPU_STR_HELPER(PATCHLEVEL)
+#define RKNPU_GET_DRV_VERSION_CODE(MAJOR, MINOR, PATCHLEVEL) \
+ (MAJOR * 10000 + MINOR * 100 + PATCHLEVEL)
+#define RKNPU_GET_DRV_VERSION_MAJOR(CODE) (CODE / 10000)
+#define RKNPU_GET_DRV_VERSION_MINOR(CODE) ((CODE % 10000) / 100)
+#define RKNPU_GET_DRV_VERSION_PATCHLEVEL(CODE) (CODE % 100)
+
+/* memory type definitions. */
+enum e_rknpu_mem_type {
+ /* physically continuous memory and used as default. */
+ RKNPU_MEM_CONTIGUOUS = 0 << 0,
+ /* physically non-continuous memory. */
+ RKNPU_MEM_NON_CONTIGUOUS = 1 << 0,
+ /* non-cacheable mapping and used as default. */
+ RKNPU_MEM_NON_CACHEABLE = 0 << 1,
+ /* cacheable mapping. */
+ RKNPU_MEM_CACHEABLE = 1 << 1,
+ /* write-combine mapping. */
+ RKNPU_MEM_WRITE_COMBINE = 1 << 2,
+ /* dma attr kernel mapping */
+ RKNPU_MEM_KERNEL_MAPPING = 1 << 3,
+ /* iommu mapping */
+ RKNPU_MEM_IOMMU = 1 << 4,
+ /* zero mapping */
+ RKNPU_MEM_ZEROING = 1 << 5,
+ /* allocate secure buffer */
+ RKNPU_MEM_SECURE = 1 << 6,
+ /* allocate from non-dma32 zone */
+ RKNPU_MEM_NON_DMA32 = 1 << 7,
+ RKNPU_MEM_MASK = RKNPU_MEM_NON_CONTIGUOUS | RKNPU_MEM_CACHEABLE |
+ RKNPU_MEM_WRITE_COMBINE | RKNPU_MEM_KERNEL_MAPPING |
+ RKNPU_MEM_IOMMU | RKNPU_MEM_ZEROING |
+ RKNPU_MEM_SECURE | RKNPU_MEM_NON_DMA32
+};
+
+/* sync mode definitions. */
+enum e_rknpu_mem_sync_mode {
+ RKNPU_MEM_SYNC_TO_DEVICE = 1 << 0,
+ RKNPU_MEM_SYNC_FROM_DEVICE = 1 << 1,
+ RKNPU_MEM_SYNC_MASK =
+ RKNPU_MEM_SYNC_TO_DEVICE | RKNPU_MEM_SYNC_FROM_DEVICE
+};
+
+/* job mode definitions. */
+enum e_rknpu_job_mode {
+ RKNPU_JOB_SLAVE = 0 << 0,
+ RKNPU_JOB_PC = 1 << 0,
+ RKNPU_JOB_BLOCK = 0 << 1,
+ RKNPU_JOB_NONBLOCK = 1 << 1,
+ RKNPU_JOB_PINGPONG = 1 << 2,
+ RKNPU_JOB_FENCE_IN = 1 << 3,
+ RKNPU_JOB_FENCE_OUT = 1 << 4,
+ RKNPU_JOB_MASK = RKNPU_JOB_PC | RKNPU_JOB_NONBLOCK |
+ RKNPU_JOB_PINGPONG | RKNPU_JOB_FENCE_IN |
+ RKNPU_JOB_FENCE_OUT
+};
+
+/* action definitions */
+enum e_rknpu_action {
+ RKNPU_GET_HW_VERSION = 0,
+ RKNPU_GET_DRV_VERSION = 1,
+ RKNPU_GET_FREQ = 2,
+ RKNPU_SET_FREQ = 3,
+ RKNPU_GET_VOLT = 4,
+ RKNPU_SET_VOLT = 5,
+ RKNPU_ACT_RESET = 6,
+ RKNPU_GET_BW_PRIORITY = 7,
+ RKNPU_SET_BW_PRIORITY = 8,
+ RKNPU_GET_BW_EXPECT = 9,
+ RKNPU_SET_BW_EXPECT = 10,
+ RKNPU_GET_BW_TW = 11,
+ RKNPU_SET_BW_TW = 12,
+ RKNPU_ACT_CLR_TOTAL_RW_AMOUNT = 13,
+ RKNPU_GET_DT_WR_AMOUNT = 14,
+ RKNPU_GET_DT_RD_AMOUNT = 15,
+ RKNPU_GET_WT_RD_AMOUNT = 16,
+ RKNPU_GET_TOTAL_RW_AMOUNT = 17,
+ RKNPU_GET_IOMMU_EN = 18,
+ RKNPU_SET_PROC_NICE = 19,
+ RKNPU_POWER_ON = 20,
+ RKNPU_POWER_OFF = 21,
+};
+
+/**
+ * User-desired buffer creation information structure.
+ *
+ * @handle: The handle of the created GEM object.
+ * @flags: user request for setting memory type or cache attributes.
+ * @size: user-desired memory allocation size.
+ * - this size value would be page-aligned internally.
+ * @obj_addr: address of RKNPU memory object.
+ * @dma_addr: DMA address that is accessed by the RKNPU.
+ */
+struct rknpu_mem_create {
+ __u32 handle;
+ __u32 flags;
+ __u64 size;
+ __u64 obj_addr;
+ __u64 dma_addr;
+};
+
+/**
+ * A structure for getting a fake-offset that can be used with mmap.
+ *
+ * @handle: handle of gem object.
+ * @reserved: just padding to be 64-bit aligned.
+ * @offset: a fake-offset of gem object.
+ */
+struct rknpu_mem_map {
+ __u32 handle;
+ __u32 reserved;
+ __u64 offset;
+};
+
+/**
+ * For destroying DMA buffer
+ *
+ * @handle: handle of the buffer.
+ * @reserved: reserved for padding.
+ * @obj_addr: rknpu_mem_object addr.
+ */
+struct rknpu_mem_destroy {
+ __u32 handle;
+ __u32 reserved;
+ __u64 obj_addr;
+};
+
+/**
+ * For synchronizing DMA buffer
+ *
+ * @flags: user request for setting memory type or cache attributes.
+ * @reserved: reserved for padding.
+ * @obj_addr: address of RKNPU memory object.
+ * @offset: offset in bytes from start address of buffer.
+ * @size: size of memory region.
+ *
+ */
+struct rknpu_mem_sync {
+ __u32 flags;
+ __u32 reserved;
+ __u64 obj_addr;
+ __u64 offset;
+ __u64 size;
+};
+
+/**
+ * struct rknpu_task structure for task information
+ *
+ * @flags: flags for task
+ * @op_idx: operator index
+ * @enable_mask: enable mask
+ * @int_mask: interrupt mask
+ * @int_clear: interrupt clear
+ * @int_status: interrupt status
+ * @regcfg_amount: register config number
+ * @regcfg_offset: offset for register config
+ * @regcmd_addr: address for register command
+ *
+ */
+struct rknpu_task {
+ __u32 flags;
+ __u32 op_idx;
+ __u32 enable_mask;
+ __u32 int_mask;
+ __u32 int_clear;
+ __u32 int_status;
+ __u32 regcfg_amount;
+ __u32 regcfg_offset;
+ __u64 regcmd_addr;
+} __packed;
+
+/**
+ * struct rknpu_subcore_task structure for subcore task index
+ *
+ * @task_start: task start index
+ * @task_number: task number
+ *
+ */
+struct rknpu_subcore_task {
+ __u32 task_start;
+ __u32 task_number;
+};
+
+/**
+ * struct rknpu_submit structure for job submit
+ *
+ * @flags: flags for job submit
+ * @timeout: submit timeout
+ * @task_start: task start index
+ * @task_number: task number
+ * @task_counter: task counter
+ * @priority: submit priority
+ * @task_obj_addr: address of task object
+ * @regcfg_obj_addr: address of register config object
+ * @task_base_addr: task base address
+ * @user_data: (optional) user data
+ * @core_mask: core mask of rknpu
+ * @fence_fd: dma fence fd
+ * @subcore_task: subcore task
+ *
+ */
+struct rknpu_submit {
+ __u32 flags;
+ __u32 timeout;
+ __u32 task_start;
+ __u32 task_number;
+ __u32 task_counter;
+ __s32 priority;
+ __u64 task_obj_addr;
+ __u64 regcfg_obj_addr;
+ __u64 task_base_addr;
+ __u64 user_data;
+ __u32 core_mask;
+ __s32 fence_fd;
+ struct rknpu_subcore_task subcore_task[5];
+};
+
+/**
+ * struct rknpu_action structure for action (GET, SET or ACT)
+ *
+ * @flags: flags for action
+ * @value: GET or SET value
+ *
+ */
+struct rknpu_action {
+ __u32 flags;
+ __u32 value;
+};
+
+#define RKNPU_ACTION 0x00
+#define RKNPU_SUBMIT 0x01
+#define RKNPU_MEM_CREATE 0x02
+#define RKNPU_MEM_MAP 0x03
+#define RKNPU_MEM_DESTROY 0x04
+#define RKNPU_MEM_SYNC 0x05
+
+#define RKNPU_IOC_MAGIC 'r'
+#define RKNPU_IOW(nr, type) _IOW(RKNPU_IOC_MAGIC, nr, type)
+#define RKNPU_IOR(nr, type) _IOR(RKNPU_IOC_MAGIC, nr, type)
+#define RKNPU_IOWR(nr, type) _IOWR(RKNPU_IOC_MAGIC, nr, type)
+
+#include <drm.h>
+
+#define DRM_IOCTL_RKNPU_ACTION \
+ DRM_IOWR(DRM_COMMAND_BASE + RKNPU_ACTION, struct rknpu_action)
+#define DRM_IOCTL_RKNPU_SUBMIT \
+ DRM_IOWR(DRM_COMMAND_BASE + RKNPU_SUBMIT, struct rknpu_submit)
+#define DRM_IOCTL_RKNPU_MEM_CREATE \
+ DRM_IOWR(DRM_COMMAND_BASE + RKNPU_MEM_CREATE, struct rknpu_mem_create)
+#define DRM_IOCTL_RKNPU_MEM_MAP \
+ DRM_IOWR(DRM_COMMAND_BASE + RKNPU_MEM_MAP, struct rknpu_mem_map)
+#define DRM_IOCTL_RKNPU_MEM_DESTROY \
+ DRM_IOWR(DRM_COMMAND_BASE + RKNPU_MEM_DESTROY, struct rknpu_mem_destroy)
+#define DRM_IOCTL_RKNPU_MEM_SYNC \
+ DRM_IOWR(DRM_COMMAND_BASE + RKNPU_MEM_SYNC, struct rknpu_mem_sync)
+
+#define IOCTL_RKNPU_ACTION RKNPU_IOWR(RKNPU_ACTION, struct rknpu_action)
+#define IOCTL_RKNPU_SUBMIT RKNPU_IOWR(RKNPU_SUBMIT, struct rknpu_submit)
+#define IOCTL_RKNPU_MEM_CREATE \
+ RKNPU_IOWR(RKNPU_MEM_CREATE, struct rknpu_mem_create)
+#define IOCTL_RKNPU_MEM_MAP RKNPU_IOWR(RKNPU_MEM_MAP, struct rknpu_mem_map)
+#define IOCTL_RKNPU_MEM_DESTROY \
+ RKNPU_IOWR(RKNPU_MEM_DESTROY, struct rknpu_mem_destroy)
+#define IOCTL_RKNPU_MEM_SYNC RKNPU_IOWR(RKNPU_MEM_SYNC, struct rknpu_mem_sync)
+
+#endif
diff --git a/include/drm-uapi/rocket_accel.h b/include/drm-uapi/rocket_accel.h
new file mode 100644
index 00000000000..14b2e12b7c4
--- /dev/null
+++ b/include/drm-uapi/rocket_accel.h
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2024 Tomeu Vizoso
+ */
+#ifndef __DRM_UAPI_ROCKET_ACCEL_H__
+#define __DRM_UAPI_ROCKET_ACCEL_H__
+
+#include "drm.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define DRM_ROCKET_CREATE_BO 0x00
+#define DRM_ROCKET_SUBMIT 0x01
+#define DRM_ROCKET_PREP_BO 0x02
+#define DRM_ROCKET_FINI_BO 0x03
+
+#define DRM_IOCTL_ROCKET_CREATE_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_ROCKET_CREATE_BO, struct drm_rocket_create_bo)
+#define DRM_IOCTL_ROCKET_SUBMIT DRM_IOW(DRM_COMMAND_BASE + DRM_ROCKET_SUBMIT, struct drm_rocket_submit)
+#define DRM_IOCTL_ROCKET_PREP_BO DRM_IOW(DRM_COMMAND_BASE + DRM_ROCKET_PREP_BO, struct drm_rocket_prep_bo)
+#define DRM_IOCTL_ROCKET_FINI_BO DRM_IOW(DRM_COMMAND_BASE + DRM_ROCKET_FINI_BO, struct drm_rocket_fini_bo)
+
+/**
+ * struct drm_rocket_create_bo - ioctl argument for creating Rocket BOs.
+ *
+ */
+struct drm_rocket_create_bo {
+ /** Input: Size of the requested BO. */
+ __u32 size;
+
+ /** Output: GEM handle for the BO. */
+ __u32 handle;
+
+ /**
+ * Output: DMA address for the BO in the NPU address space. This address
+ * is private to the DRM fd and is valid for the lifetime of the GEM
+ * handle.
+ */
+ __u64 dma_address;
+
+ /** Output: Offset into the drm node to use for subsequent mmap call. */
+ __u64 offset;
+};
+
+/**
+ * struct drm_rocket_prep_bo - ioctl argument for starting CPU ownership of the BO.
+ *
+ * Takes care of waiting for any NPU jobs that might still use the BO and performs cache
+ * synchronization.
+ */
+struct drm_rocket_prep_bo {
+ /** Input: GEM handle of the buffer object. */
+ __u32 handle;
+
+ /** Reserved, must be zero. */
+ __u32 reserved;
+
+ /** Input: Amount of time to wait for NPU jobs. */
+ __s64 timeout_ns;
+};
+
+/**
+ * struct drm_rocket_fini_bo - ioctl argument for finishing CPU ownership of the BO.
+ *
+ * Synchronize caches for NPU access.
+ */
+struct drm_rocket_fini_bo {
+ /** Input: GEM handle of the buffer object. */
+ __u32 handle;
+
+ /** Reserved, must be zero. */
+ __u32 reserved;
+};
+
+/**
+ * struct drm_rocket_task - A task to be run on the NPU
+ *
+ * A task is the smallest unit of work that can be run on the NPU.
+ */
+struct drm_rocket_task {
+ /** Input: DMA address to NPU mapping of register command buffer */
+ __u32 regcmd;
+
+ /** Input: Number of commands in the register command buffer */
+ __u32 regcmd_count;
+};
+
+/**
+ * struct drm_rocket_job - A job to be run on the NPU
+ *
+ * The kernel will schedule the execution of this job taking into account its
+ * dependencies with other jobs. All tasks in the same job will be executed
+ * sequentially on the same core, to benefit from memory residency in SRAM.
+ */
+struct drm_rocket_job {
+ /** Input: Pointer to an array of struct drm_rocket_task. */
+ __u64 tasks;
+
+ /** Input: Pointer to a u32 array of the BOs that are read by the job. */
+ __u64 in_bo_handles;
+
+ /** Input: Pointer to a u32 array of the BOs that are written to by the job. */
+ __u64 out_bo_handles;
+
+ /** Input: Number of tasks passed in. */
+ __u32 task_count;
+
+ /** Input: Size in bytes of the structs in the @tasks field. */
+ __u32 task_struct_size;
+
+ /** Input: Number of input BO handles passed in (size is that times 4). */
+ __u32 in_bo_handle_count;
+
+ /** Input: Number of output BO handles passed in (size is that times 4). */
+ __u32 out_bo_handle_count;
+};
+
+/**
+ * struct drm_rocket_submit - ioctl argument for submitting commands to the NPU.
+ *
+ * The kernel will schedule the execution of these jobs in dependency order.
+ */
+struct drm_rocket_submit {
+ /** Input: Pointer to an array of struct drm_rocket_job. */
+ __u64 jobs;
+
+ /** Input: Number of jobs passed in. */
+ __u32 job_count;
+
+ /** Input: Size in bytes of the structs in the @jobs field. */
+ __u32 job_struct_size;
+
+ /** Reserved, must be zero. */
+ __u64 reserved;
+};
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __DRM_UAPI_ROCKET_ACCEL_H__ */
diff --git a/meson.build b/meson.build
index d1a5cc2cb4c..3be192356ec 100644
--- a/meson.build
+++ b/meson.build
@@ -181,7 +181,7 @@ elif gallium_drivers.contains('all')
gallium_drivers = [
'r300', 'r600', 'radeonsi', 'crocus', 'v3d', 'vc4', 'freedreno', 'etnaviv', 'i915',
'nouveau', 'svga', 'tegra', 'virgl', 'lima', 'panfrost', 'llvmpipe', 'softpipe', 'iris',
- 'zink', 'd3d12', 'asahi'
+ 'zink', 'd3d12', 'asahi', 'rocket'
]
endif
@@ -208,6 +208,7 @@ with_gallium_lima = gallium_drivers.contains('lima')
with_gallium_zink = gallium_drivers.contains('zink')
with_gallium_d3d12 = gallium_drivers.contains('d3d12')
with_gallium_asahi = gallium_drivers.contains('asahi')
+with_gallium_rocket = gallium_drivers.contains('rocket')
foreach gallium_driver : gallium_drivers
pre_args += '-DHAVE_@0@'.format(gallium_driver.to_upper())
endforeach
diff --git a/meson.options b/meson.options
index cd0e56cc429..51a644ad310 100644
--- a/meson.options
+++ b/meson.options
@@ -82,7 +82,7 @@ option(
'all', 'auto',
'asahi', 'crocus', 'd3d12', 'etnaviv', 'freedreno', 'i915', 'iris',
'lima', 'llvmpipe', 'nouveau', 'panfrost', 'r300', 'r600', 'radeonsi',
- 'softpipe', 'svga', 'tegra', 'v3d', 'vc4', 'virgl', 'zink',
+ 'rocket', 'softpipe', 'svga', 'tegra', 'v3d', 'vc4', 'virgl', 'zink',
],
description : 'List of gallium drivers to build. If this is set to auto ' +
'all drivers applicable to the target OS/architecture ' +
diff --git a/src/gallium/drivers/rocket/ci/rocket-rk3588-fails.txt b/src/gallium/drivers/rocket/ci/rocket-rk3588-fails.txt
new file mode 100644
index 00000000000..a86f73b27a7
--- /dev/null
+++ b/src/gallium/drivers/rocket/ci/rocket-rk3588-fails.txt
@@ -0,0 +1,126 @@
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_1_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_1_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_1_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_1_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_3_input_channels_120_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_5_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_5_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_1_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_1_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_1_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_112_weight_size_1_input_channels_120_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+
+Models.Op/mobiledet_086,Fail
+Models.Op/mobiledet_087,Fail
+Models.Op/mobiledet_ssdlite_mobiledet_coco_qat_postprocess,Fail
+
+Models.Op/yolox_005,Fail
+Models.Op/yolox_007,Fail
+Models.Op/yolox_008,Fail
+Models.Op/yolox_009,Fail
+Models.Op/yolox_010,Fail
+Models.Op/yolox_012,Fail
+Models.Op/yolox_014,Fail
+Models.Op/yolox_016,Fail
+Models.Op/yolox_018,Fail
+Models.Op/yolox_019,Fail
+Models.Op/yolox_021,Fail
+Models.Op/yolox_022,Fail
+Models.Op/yolox_024,Fail
+Models.Op/yolox_025,Fail
+Models.Op/yolox_031,Fail
+Models.Op/yolox_034,Fail
+Models.Op/yolox_037,Fail
+Models.Op/yolox_040,Fail
+Models.Op/yolox_046,Fail
+Models.Op/yolox_055,Fail
+Models.Op/yolox_064,Fail
+Models.Op/yolox_072,Fail
+Models.Op/yolox_073,Fail
+Models.Op/yolox_078,Fail
+Models.Op/yolox_082,Fail
+Models.Op/yolox_087,Fail
+Models.Op/yolox_091,Fail
+Models.Op/yolox_096,Fail
+Models.Op/yolox_097,Fail
+Models.Op/yolox_100,Fail
+Models.Op/yolox_101,Fail
+Models.Op/yolox_107,Fail
+Models.Op/yolox_108,Fail
+Models.Op/yolox_111,Fail
+Models.Op/yolox_112,Fail
+Models.Op/yolox_118,Fail
+Models.Op/yolox_119,Fail
+Models.Op/yolox_122,Fail
+Models.Op/yolox_123,Fail
+Models.Op/yolox_yolox,Fail \ No newline at end of file
diff --git a/src/gallium/drivers/rocket/ci/rocket-rk3588-flakes.txt b/src/gallium/drivers/rocket/ci/rocket-rk3588-flakes.txt
new file mode 100644
index 00000000000..52f9ab3b05c
--- /dev/null
+++ b/src/gallium/drivers/rocket/ci/rocket-rk3588-flakes.txt
@@ -0,0 +1,5 @@
+Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_1_stride_1_padding_same_0_is_signed_0
+Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_1_stride_2_padding_same_0_is_signed_0
+Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_1_stride_1_padding_same_1_is_signed_0
+Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_1_stride_2_padding_same_1_is_signed_0
+Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_1_stride_2_padding_same_0_is_signed_0 \ No newline at end of file
diff --git a/src/gallium/drivers/rocket/ci/rocket-rk3588-skips.txt b/src/gallium/drivers/rocket/ci/rocket-rk3588-skips.txt
new file mode 100644
index 00000000000..06fc099d2a1
--- /dev/null
+++ b/src/gallium/drivers/rocket/ci/rocket-rk3588-skips.txt
@@ -0,0 +1,29 @@
+Add.Op/.*
+AddQuant.Op/.*
+Conv2D.Op/.*
+DepthwiseConv2D.Op/.*
+FullyConnected.Op/.*
+
+# These tests below (adds) aren't well constructed and thus fail in TF
+Models.Op/mobiledet_008
+Models.Op/mobiledet_011
+Models.Op/mobiledet_014
+Models.Op/mobiledet_019
+Models.Op/mobiledet_022
+Models.Op/mobiledet_025
+Models.Op/mobiledet_032
+Models.Op/mobiledet_035
+Models.Op/mobiledet_038
+Models.Op/mobiledet_045
+Models.Op/mobiledet_049
+Models.Op/mobiledet_053
+Models.Op/mobiledet_060
+Models.Op/mobiledet_064
+Models.Op/mobiledet_068
+Models.Op/yolox_011
+Models.Op/yolox_020
+Models.Op/yolox_023
+Models.Op/yolox_026
+Models.Op/yolox_035
+Models.Op/yolox_038
+Models.Op/yolox_041
diff --git a/src/gallium/drivers/rocket/decode.py b/src/gallium/drivers/rocket/decode.py
new file mode 100644
index 00000000000..6bc4a5780c8
--- /dev/null
+++ b/src/gallium/drivers/rocket/decode.py
@@ -0,0 +1,75 @@
+#!/usr/bin/python3
+#
+# Copyright © 2024-2025 Tomeu Vizoso
+#
+# SPDX-License-Identifier: MIT
+
+import sys
+import os
+import argparse
+import struct
+from gen_parser import Parser, Reg, Enum, mask, Error
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--xml', type=str, required=True)
+ parser.add_argument('--dump', type=str, required=True)
+
+ args = parser.parse_args()
+
+ p = Parser()
+
+ try:
+ p.parse("", args.xml)
+ except Error as e:
+ print(e, file=sys.stderr)
+ exit(1)
+
+ regs = {}
+ for e in p.file:
+ if isinstance(e, Reg):
+ regs[e.offset] = e
+
+ domains = {}
+ for e in p.file:
+ if isinstance(e, Enum):
+ if e.name == "target":
+ for name, val in e.values:
+ domains[name] = val
+
+ f = open(args.dump, mode='rb')
+ for i in range(0, os.path.getsize(args.dump) // 8):
+ cmd = f.read(8)
+ (offset, value, target) = struct.unpack("<hIh", cmd)
+ if offset in regs.keys():
+ reg = regs[offset]
+
+ if (target & 0xfffffffe) != domains[reg.domain]:
+ print("WARNING: target 0x%x doesn't match register's domain 0x%x" % (target, domains[reg.domain]))
+
+ print("EMIT(REG_%s, " % regs[offset].full_name.upper(), end="")
+ first = True
+ if value == 0 or len(reg.bitset.fields) == 1:
+ print("0x%x" % value, end="")
+ else:
+ for field in reg.bitset.fields:
+ if field.type == "boolean":
+ if 1 << field.high & value:
+ if not first:
+ print(" | ", end="")
+ print("%s_%s" % (reg.full_name.upper(), field.name.upper()), end="")
+ first = False
+ elif field.type == "uint":
+ field_value = (value & mask(field.low, field.high)) >> field.low
+ if field_value != 0:
+ if not first:
+ print(" | ", end="")
+ print("%s_%s(%d)" % (reg.full_name.upper(), field.name.upper(), field_value), end="")
+ first = False
+ print(");")
+ else:
+ print("%x %x %x" % (target, offset, value))
+
+if __name__ == '__main__':
+ main()
diff --git a/src/gallium/drivers/rocket/extract_registers.py b/src/gallium/drivers/rocket/extract_registers.py
new file mode 100644
index 00000000000..c804b6afb23
--- /dev/null
+++ b/src/gallium/drivers/rocket/extract_registers.py
@@ -0,0 +1,121 @@
+#!/usr/bin/python3
+#
+# Copyright © 2024-2025 Tomeu Vizoso
+#
+# SPDX-License-Identifier: MIT
+
+import collections
+import csv
+import subprocess
+import sys
+from itertools import dropwhile
+import camelot
+
+# Load the TRM text: a ".pdf" argument is converted with `pdftotext -tsv`,
+# anything else must be a ".txt" file whose content is read directly.
+trm_file = sys.argv[1]
+if trm_file.endswith(".pdf"):
+ data = subprocess.check_output(["pdftotext", "-tsv", sys.argv[1], "-"]).decode()
+else:
+ assert(trm_file.endswith(".txt"))
+ data = open(sys.argv[1]).read()
+
+# Parse the text as tab-separated rows and keep only field 11 of each row
+# (presumably the text column of pdftotext's TSV output -- TODO confirm).
+data = csv.reader(data.splitlines(), delimiter="\t")
+data = collections.deque([x[11] for x in data])
+
+# Pop entries from the front of the deque `data` into a list while the front
+# entry is not "###FLOW###", then drop two more entries (the markers named in
+# the trailing comments) and return the collected list.
+# NOTE(review): indentation is not visible here; the two trailing popleft()
+# calls are assumed to run once after the loop -- confirm.
+def popcell(data):
+ cell = []
+ while data[0] != "###FLOW###":
+ text = data.popleft()
+ cell.append(text)
+ data.popleft() ###FLOW###
+ data.popleft() ###LINE###
+ return cell
+
+text = None
+# Discard everything in front of the first register name of interest.
+while data[0] != "RKNN_pc_operation_enable":
+ data.popleft()
+
+# Consume entries from `data` until one starts with "(0x" and return it with
+# the parentheses stripped. Returns None implicitly if `data` runs out first.
+def read_reg_offset(data):
+ while data:
+ text = data.popleft()
+ if text.startswith("(0x"):
+ return text.replace("(", "").replace(")", "")
+
+# Collect each "RKNN_*" name together with the offset that follows it. The
+# loop ends after "RKNN_global_operation_enable" has been popped and recorded.
+reg_names = []
+offsets = []
+while text != "RKNN_global_operation_enable":
+ text = data.popleft()
+
+ if text.startswith("RKNN_"):
+ reg_names.append(text)
+ offsets.append(read_reg_offset(data))
+
+print("Found %d registers in RKNN block" % len(reg_names))
+
+"""
+print(reg_names)
+print(offsets)
+sys.exit(0)
+"""
+
+# Extract tables from pages "0-60" of the input and drop the first three.
+# NOTE(review): sys.argv[1] is handed to camelot.read_pdf even when the
+# earlier branch accepted a ".txt" input -- confirm that case is intended.
+tables = camelot.read_pdf(sys.argv[1], line_scale=35, pages="0-60")
+tables = collections.deque([x.data for x in tables[3:]])
+
+# Join tables split by page breaks
+# A table counts as complete once the first cell of its last row starts with
+# "0" or a range ending in ":0"; until then the next table (minus its first
+# row) is appended to it.
+new_tables = []
+while tables:
+ new_table = tables.popleft()
+ last_bitfield = new_table[-1][0].split(" ")[0]
+ while last_bitfield != "0" and not last_bitfield.endswith(":0"):
+ second_part = tables.popleft()
+ new_table.extend(second_part[1:])
+ last_bitfield = second_part[-1][0].split(" ")[0]
+ new_tables.append(new_table)
+tables = new_tables
+print("Found %d tables in PDF" % len(tables))
+
+# Build one record per register and group the records by domain. The i-th
+# register name is paired with the i-th offset and the i-th joined table.
+domains = {}
+for i in range(0, len(reg_names)):
+ reg_name = reg_names[i]
+ # "dpu_rdma"/"ppu_rdma" are matched explicitly; otherwise the domain is
+ # the second "_"-separated component of the register name.
+ if "dpu_rdma" in reg_name:
+ domain = "dpu_rdma"
+ elif "ppu_rdma" in reg_name:
+ domain = "ppu_rdma"
+ else:
+ domain = reg_name.split("_")[1]
+ table = tables[i]
+
+ if domain not in domains.keys():
+ domains[domain] = []
+
+ reg = {}
+ reg["name"] = reg_name
+ reg["offset"] = offsets[i]
+ reg["field_names"] = []
+ reg["field_bits"] = []
+
+ # Skip the table's first row; column 0 supplies the bit position/range
+ # and the first line of column 3 supplies the field name. Fields named
+ # "reserved" are numbered so that names stay unique within a register.
+ reserved_count = 0
+ for row in table[1:]:
+ name = row[3].split('\n')[0]
+
+ if name == "reserved":
+ name = "reserved_%d" % reserved_count
+ reserved_count += 1
+
+ reg["field_bits"].append(row[0].split(' ')[0])
+ reg["field_names"].append(name)
+
+ domains[domain].append(reg)
+
+# Print the collected registers as <domain>/<reg32>/<bitfield> XML elements.
+# The register name drops its first two "_"-separated components; a bit spec
+# containing ":" becomes low/high attributes, anything else a pos attribute.
+# Every bitfield is emitted with type="uint".
+for domain in domains.keys():
+ print(' <domain name="%s" width="32">' % domain.upper())
+ for reg in domains[domain]:
+ print(' <reg32 offset="%s" name="%s">' % (reg["offset"], "_".join(reg["name"].strip().upper().split("_")[2:])))
+ for i in range(0, len(reg["field_names"])):
+ if ":" in reg["field_bits"][i]:
+ high, low = reg["field_bits"][i].split(":")
+ bits = 'low="%s" high="%s"' % (low, high)
+ else:
+ bits = 'pos="%s"' % reg["field_bits"][i]
+ print(' <bitfield name="%s" %s type="uint"/>' % (reg["field_names"][i].strip().upper(), bits))
+ print(' </reg32>')
+ print(' </domain>')
diff --git a/src/gallium/drivers/rocket/gen_header.py b/src/gallium/drivers/rocket/gen_header.py
new file mode 100644
index 00000000000..f3c6615dcb4
--- /dev/null
+++ b/src/gallium/drivers/rocket/gen_header.py
@@ -0,0 +1,137 @@
+#!/usr/bin/python3
+#
+# Copyright © 2019-2024 Google, Inc.
+# Copyright © 2024-2025 Tomeu Vizoso
+#
+# SPDX-License-Identifier: MIT
+
+import sys
+import os
+import argparse
+import time
+import datetime
+from gen_parser import Parser, Reg, Enum, mask, Error
+
+
+# Print a complete C header to stdout.
+#   args:  parsed command line; args.rnn and args.xml are passed to
+#          Parser.parse()
+#   guard: include-guard macro name wrapped around the whole output
+#   func:  callable invoked with the Parser to print the header's main body
+# The output consists of the guard, a provenance comment listing the parsed
+# XML files (size and mtime), optional copyright/license text, an assert
+# shim, the __struct_cast macro, func(p)'s output, and a generated
+# rkt_get_target() function that switches on the register offset and returns
+# the register's domain. A parse Error is printed to stderr and the process
+# exits with status 1.
+def dump_c(args, guard, func):
+ p = Parser()
+
+ try:
+ p.parse(args.rnn, args.xml)
+ except Error as e:
+ print(e, file=sys.stderr)
+ exit(1)
+
+ print("#ifndef %s\n#define %s\n" % (guard, guard))
+
+ print("""/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng gen_header.py tool in this git repository:
+http://gitlab.freedesktop.org/mesa/mesa/
+git clone https://gitlab.freedesktop.org/mesa/mesa.git
+
+The rules-ng-ng source files this header was generated from are:
+""")
+ # Pad the file paths to a common width so the size/date columns line up.
+ maxlen = 0
+ for filepath in p.xml_files:
+ maxlen = max(maxlen, len(filepath))
+ for filepath in p.xml_files:
+ pad = " " * (maxlen - len(filepath))
+ filesize = str(os.path.getsize(filepath))
+ filesize = " " * (7 - len(filesize)) + filesize
+ filetime = time.ctime(os.path.getmtime(filepath))
+ print("- " + filepath + pad + " (" + filesize + " bytes, from " + filetime + ")")
+ if p.copyright_year:
+ current_year = str(datetime.date.today().year)
+ print()
+ print("Copyright (C) %s-%s by the following authors:" % (p.copyright_year, current_year))
+ for author in p.authors:
+ print("- " + author)
+ if p.license:
+ print(p.license)
+ print("*/")
+
+ # assert() maps to BUG_ON() when the header is built with __KERNEL__.
+ print()
+ print("#ifdef __KERNEL__")
+ print("#include <linux/bug.h>")
+ print("#define assert(x) BUG_ON(!(x))")
+ print("#else")
+ print("#include <assert.h>")
+ print("#endif")
+ print()
+
+ # __struct_cast expands to nothing under C++ and to a "(struct X)" cast
+ # otherwise.
+ print("#ifdef __cplusplus")
+ print("#define __struct_cast(X)")
+ print("#else")
+ print("#define __struct_cast(X) (struct X)")
+ print("#endif")
+ print()
+
+ func(p)
+
+ # One case label per register; offsets without a case return 0.
+ print("static uint32_t rkt_get_target(uint32_t offset)")
+ print("{")
+
+ print("\tswitch(offset) {")
+ for e in p.file:
+ if isinstance(e, Reg):
+ print("\t\tcase REG_%s:" % e.full_name)
+ print("\t\t\treturn %s;" % e.domain)
+ print("\t}")
+ print("\treturn 0;")
+ print("}")
+
+ print("\n#endif /* %s */" % guard)
+
+
+# "c-defines" subcommand: emit the header produced by Parser.dump(). The
+# include guard is the XML file's base name, with "." replaced by "_",
+# upper-cased.
+def dump_c_defines(args):
+ guard = str.replace(os.path.basename(args.xml), '.', '_').upper()
+ dump_c(args, guard, lambda p: p.dump())
+
+
+# "c-pack-structs" subcommand: emit the header produced by
+# Parser.dump_structs(). The guard is built like dump_c_defines' guard with
+# a "_STRUCTS" suffix appended.
+def dump_c_pack_structs(args):
+ guard = str.replace(os.path.basename(args.xml), '.', '_').upper() + '_STRUCTS'
+ dump_c(args, guard, lambda p: p.dump_structs())
+
+
+def dump_py_defines(args):
+ p = Parser()
+
+ try:
+ p.parse(args.rnn, args.xml)
+ except Error as e:
+ print(e, file=sys.stderr)
+ exit(1)
+
+ file_name = os.path.splitext(os.path.basename(args.xml))[0]
+
+ print("from enum import IntEnum")
+ print("class %sRegs(IntEnum):" % file_name.upper())
+
+ os.path.basename(args.xml)
+
+ p.dump_regs_py()
+
+
+# Command-line entry point. --rnn and --xml are required, and exactly one
+# subcommand (c-defines, c-pack-structs or py-defines) selects the dump
+# function, which is then called with the parsed arguments.
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--rnn', type=str, required=True)
+ parser.add_argument('--xml', type=str, required=True)
+
+ subparsers = parser.add_subparsers(required=True)
+
+ parser_c_defines = subparsers.add_parser('c-defines')
+ parser_c_defines.set_defaults(func=dump_c_defines)
+
+ parser_c_pack_structs = subparsers.add_parser('c-pack-structs')
+ parser_c_pack_structs.set_defaults(func=dump_c_pack_structs)
+
+ parser_py_defines = subparsers.add_parser('py-defines')
+ parser_py_defines.set_defaults(func=dump_py_defines)
+
+ args = parser.parse_args()
+ args.func(args)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/src/gallium/drivers/rocket/gen_parser.py b/src/gallium/drivers/rocket/gen_parser.py
new file mode 100644
index 00000000000..9ab1019b26f
--- /dev/null
+++ b/src/gallium/drivers/rocket/gen_parser.py
@@ -0,0 +1,737 @@
+import xml.parsers.expat
+import sys
+import os
+import collections
+
+# Exception raised for problems found while parsing the XML; the text is
+# kept in the `message` attribute.
+class Error(Exception):
+ def __init__(self, message):
+ self.message = message
+
+# A named enumeration; `values` is a list of (name, value) tuples in the
+# order they were added.
+class Enum(object):
+ def __init__(self, name):
+ self.name = name
+ self.values = []
+
+ # Return True if an enumerator called `name` exists.
+ def has_name(self, name):
+ for (n, value) in self.values:
+ if n == name:
+ return True
+ return False
+
+ # Print the enum as a C "enum NAME { ... };" definition. Values are
+ # printed as 8-digit hex if any value exceeds 0x1000, else as decimal.
+ def dump(self):
+ use_hex = False
+ for (name, value) in self.values:
+ if value > 0x1000:
+ use_hex = True
+
+ print("enum %s {" % self.name)
+ for (name, value) in self.values:
+ if use_hex:
+ print("\t%s = 0x%08x," % (name, value))
+ else:
+ print("\t%s = %d," % (name, value))
+ print("};\n")
+
+ # Enums produce no pack struct; intentionally a no-op.
+ def dump_pack_struct(self):
+ pass
+
+# One bitfield of a register or bitset: bits low..high (inclusive), an
+# optional right shift `shr` applied to values before packing, and a type
+# that is either a builtin type name, None, or the name of a parsed enum.
+class Field(object):
+ # Validate and store the field. `parser` supplies the current register
+ # width (current_bitsize), the known enums and the error() factory used
+ # for every validation failure.
+ def __init__(self, name, low, high, shr, type, parser):
+ self.name = name
+ self.low = low
+ self.high = high
+ self.shr = shr
+ self.type = type
+
+ builtin_types = [ None, "a3xx_regid", "boolean", "uint", "hex", "int", "fixed", "ufixed", "float", "address", "waddress" ]
+
+ maxpos = parser.current_bitsize - 1
+
+ if low < 0 or low > maxpos:
+ raise parser.error("low attribute out of range: %d" % low)
+ if high < 0 or high > maxpos:
+ raise parser.error("high attribute out of range: %d" % high)
+ if high < low:
+ raise parser.error("low is greater than high: low=%d, high=%d" % (low, high))
+ if self.type == "boolean" and not low == high:
+ raise parser.error("booleans should be 1 bit fields")
+ elif self.type == "float" and not (high - low == 31 or high - low == 15):
+ raise parser.error("floats should be 16 or 32 bit fields")
+ elif not self.type in builtin_types and not self.type in parser.enums:
+ raise parser.error("unknown type '%s'" % self.type)
+
+ # Return (c_type, c_expr): the C type used for this field and the C
+ # expression that converts `var_name` into the raw bits to pack.
+ # "fixed"/"ufixed" read self.radix, which is not set by __init__ (the
+ # parser assigns it after constructing the field).
+ def ctype(self, var_name):
+ if self.type == None:
+ type = "uint32_t"
+ val = var_name
+ elif self.type == "boolean":
+ type = "bool"
+ val = var_name
+ elif self.type == "uint" or self.type == "hex" or self.type == "a3xx_regid":
+ type = "uint32_t"
+ val = var_name
+ elif self.type == "int":
+ type = "int32_t"
+ val = var_name
+ elif self.type == "fixed":
+ type = "float"
+ val = "((int32_t)(%s * %d.0))" % (var_name, 1 << self.radix)
+ elif self.type == "ufixed":
+ type = "float"
+ val = "((uint32_t)(%s * %d.0))" % (var_name, 1 << self.radix)
+ elif self.type == "float" and self.high - self.low == 31:
+ type = "float"
+ val = "fui(%s)" % var_name
+ elif self.type == "float" and self.high - self.low == 15:
+ type = "float"
+ val = "_mesa_float_to_half(%s)" % var_name
+ elif self.type in [ "address", "waddress" ]:
+ type = "uint64_t"
+ val = var_name
+ else:
+ type = "enum %s" % self.type
+ val = var_name
+
+ if self.shr > 0:
+ val = "(%s >> %d)" % (val, self.shr)
+
+ return (type, val)
+
+# Print `name` and `value` separated by tabs. The tab count shrinks as
+# `name` gets longer (its length rounded down to a multiple of 8 is
+# subtracted from 68) and is never less than one.
+def tab_to(name, value):
+ tab_count = (68 - (len(name) & ~7)) // 8
+ if tab_count <= 0:
+ tab_count = 1
+ print(name + ('\t' * tab_count) + value)
+
+# Return an integer with bits low..high (inclusive) set, e.g.
+# mask(4, 7) == 0xf0.
+def mask(low, high):
+ return ((0xffffffffffffffff >> (64 - (high + 1 - low))) << low)
+
+# Return the lower-cased identifier used for field `f` of `reg` in generated
+# structs. An unnamed field takes the register's name. Names equal to
+# "double", "float" or "int", or not starting with a letter, get a leading
+# underscore.
+def field_name(reg, f):
+ if f.name:
+ name = f.name.lower()
+ else:
+ # We hit this path when a reg is defined with no bitset fields, ie.
+ # <reg32 offset="0x88db" name="RB_BLIT_DST_ARRAY_PITCH" low="0" high="28" shr="6" type="uint"/>
+ name = reg.name.lower()
+
+ if (name in [ "double", "float", "int" ]) or not (name[0].isalpha()):
+ name = "_" + name
+
+ return name
+
+# A named collection of Field objects. `inline` starts out False and is set
+# by the parser.
+class Bitset(object):
+ # When `template` is given, start from a shallow copy of its field list.
+ def __init__(self, name, template):
+ self.name = name
+ self.inline = False
+ if template:
+ self.fields = template.fields[:]
+ else:
+ self.fields = []
+
+ # Get address field if there is one in the bitset, else return None:
+ def get_address_field(self):
+ for f in self.fields:
+ if f.type in [ "address", "waddress" ]:
+ return f
+ return None
+
+ def dump_regpair_builder(self, reg):
+ print("#ifndef NDEBUG")
+ known_mask = 0
+ for f in self.fields:
+ known_mask |= mask(f.low, f.high)
+ if f.type in [ "boolean", "address", "waddress" ]:
+ continue
+ type, val = f.ctype("fields.%s" % field_name(reg, f))
+ print(" assert((%-40s & 0x%08x) == 0);" % (val, 0xffffffff ^ mask(0 , f.high - f.low)))
+ print(" assert((%-40s & 0x%08x) == 0);" % ("fields.unknown", known_mask))
+ print("#endif\n")
+
+ print(" return (struct fd_reg_pair) {")
+ if reg.array:
+ print(" .reg = REG_%s(__i)," % reg.full_name)
+ else:
+ print(" .reg = REG_%s," % reg.full_name)
+
+ print(" .value =")
+ for f in self.fields:
+ if f.type in [ "address", "waddress" ]:
+ continue
+ else:
+ type, val = f.ctype("fields.%s" % field_name(reg, f))
+ print(" (%-40s << %2d) |" % (val, f.low))
+ value_name = "dword"
+ if reg.bit_size == 64:
+ value_name = "qword"
+ print(" fields.unknown | fields.%s," % (value_name,))
+
+ address = self.get_address_field()
+ if address:
+ print(" .bo = fields.bo,")
+ print(" .is_address = true,")
+ if f.type == "waddress":
+ print(" .bo_write = true,")
+ print(" .bo_offset = fields.bo_offset,")
+ print(" .bo_shift = %d," % address.shr)
+ print(" .bo_low = %d," % address.low)
+
+ print(" };")
+
+ # Print, for register `reg`: a C struct with one member per field (an
+ # address field contributes bo/bo_offset instead), a pack_<name>()
+ # inline function whose body comes from dump_regpair_builder(), and a
+ # variadic <name>(...) convenience macro. Does nothing without `reg`.
+ def dump_pack_struct(self, reg=None):
+ if not reg:
+ return
+
+ prefix = reg.full_name
+
+ print("struct %s {" % prefix)
+ for f in self.fields:
+ if f.type in [ "address", "waddress" ]:
+ tab_to(" __bo_type", "bo;")
+ tab_to(" uint32_t", "bo_offset;")
+ continue
+ name = field_name(reg, f)
+
+ type, val = f.ctype("var")
+
+ tab_to(" %s" % type, "%s;" % name)
+ # Fallback members for bits not covered by a named field and for
+ # passing the raw register value.
+ if reg.bit_size == 64:
+ tab_to(" uint64_t", "unknown;")
+ tab_to(" uint64_t", "qword;")
+ else:
+ tab_to(" uint32_t", "unknown;")
+ tab_to(" uint32_t", "dword;")
+ print("};\n")
+
+ if reg.array:
+ print("static inline struct fd_reg_pair\npack_%s(uint32_t __i, struct %s fields)\n{" %
+ (prefix, prefix))
+ else:
+ print("static inline struct fd_reg_pair\npack_%s(struct %s fields)\n{" %
+ (prefix, prefix))
+
+ self.dump_regpair_builder(reg)
+
+ print("\n}\n")
+
+ # With an address field the macro appends an extra "{ .reg = 0 }"
+ # initializer after the packed pair.
+ if self.get_address_field():
+ skip = ", { .reg = 0 }"
+ else:
+ skip = ""
+
+ if reg.array:
+ print("#define %s(__i, ...) pack_%s(__i, __struct_cast(%s) { __VA_ARGS__ })%s\n" %
+ (prefix, prefix, prefix, skip))
+ else:
+ print("#define %s(...) pack_%s(__struct_cast(%s) { __VA_ARGS__ })%s\n" %
+ (prefix, prefix, prefix, skip))
+
+
+ # Print the C definitions for every field, named "<prefix>_<field>"
+ # (prefix defaults to the bitset name):
+ #  - an unnamed, unshifted field starting at bit 0 that is not
+ #    float/fixed/ufixed produces nothing;
+ #  - a boolean, or an untyped single-bit field, produces one bit #define;
+ #  - anything else produces __MASK/__SHIFT defines plus an inline packer.
+ def dump(self, prefix=None):
+ if prefix == None:
+ prefix = self.name
+ for f in self.fields:
+ if f.name:
+ name = prefix + "_" + f.name
+ else:
+ name = prefix
+
+ if not f.name and f.low == 0 and f.shr == 0 and not f.type in ["float", "fixed", "ufixed"]:
+ pass
+ elif f.type == "boolean" or (f.type == None and f.low == f.high):
+ tab_to("#define %s" % name, "0x%08x" % (1 << f.low))
+ else:
+ tab_to("#define %s__MASK" % name, "0x%08x" % mask(f.low, f.high))
+ tab_to("#define %s__SHIFT" % name, "%d" % f.low)
+ type, val = f.ctype("val")
+
+ print("static inline uint32_t %s(%s val)\n{" % (name, type))
+ # The generated packer asserts that the bits dropped by shr are zero.
+ if f.shr > 0:
+ print("\tassert(!(val & 0x%x));" % mask(0, f.shr - 1))
+ print("\treturn ((%s) << %s__SHIFT) & %s__MASK;\n}" % (val, name, name))
+ print()
+
+# A register array: `length` entries spaced `stride` apart starting at
+# `offset`. `usages` is the comma-split "usage" attribute, or None.
+class Array(object):
+ # `attrs` must contain "offset", "stride" and "length" (parsed with
+ # int(x, 0), so any Python integer literal prefix is accepted); "name"
+ # and "usage" are optional.
+ def __init__(self, attrs, domain, variant):
+ if "name" in attrs:
+ self.name = attrs["name"]
+ else:
+ self.name = ""
+ self.domain = domain
+ self.variant = variant
+ self.offset = int(attrs["offset"], 0)
+ self.stride = int(attrs["stride"], 0)
+ self.length = int(attrs["length"], 0)
+ if "usage" in attrs:
+ self.usages = attrs["usage"].split(',')
+ else:
+ self.usages = None
+
+ # Print the macro computing the offset of element i0.
+ def dump(self):
+ print("#define REG_%s_%s(i0) (0x%08x + 0x%x*(i0))\n" % (self.domain, self.name, self.offset, self.stride))
+
+ # Arrays emit nothing for pack structs or regpair builders.
+ def dump_pack_struct(self):
+ pass
+
+ def dump_regpair_builder(self):
+ pass
+
+# A single register. `full_name` is "<domain>_<name>"; for a register
+# inside an array the array's name is prefixed to `name` first. The `bitset`
+# attribute is not set here; the parser assigns it after construction.
+class Reg(object):
+ def __init__(self, attrs, domain, array, bit_size):
+ self.name = attrs["name"]
+ self.domain = domain
+ self.array = array
+ self.offset = int(attrs["offset"], 0)
+ self.type = None
+ self.bit_size = bit_size
+ if array:
+ self.name = array.name + "_" + self.name
+ self.full_name = self.domain + "_" + self.name
+
+ # Print the REG_<full_name> definition: an inline function of the index
+ # for array members (array offset + register offset + stride * i0), a
+ # plain #define otherwise; then the field definitions of an inline bitset.
+ def dump(self):
+ if self.array:
+ offset = self.array.offset + self.offset
+ print("static inline uint32_t REG_%s(uint32_t i0) { return 0x%08x + 0x%x*i0; }" % (self.full_name, offset, self.array.stride))
+ else:
+ tab_to("#define REG_%s" % self.full_name, "0x%08x" % self.offset)
+
+ if self.bitset.inline:
+ self.bitset.dump(self.full_name)
+
+ # The next two methods forward to the bitset only when it is inline.
+ def dump_pack_struct(self):
+ if self.bitset.inline:
+ self.bitset.dump_pack_struct(self)
+
+ def dump_regpair_builder(self):
+ if self.bitset.inline:
+ self.bitset.dump_regpair_builder(self)
+
+ # Print one "REG_<full_name> = <offset>" member line. Only self.offset is
+ # printed; an enclosing array's offset is not added here.
+ def dump_py(self):
+ print("\tREG_%s = 0x%08x" % (self.full_name, self.offset))
+
+
+# Expat-based parser for the register XML. The current_* attributes track
+# the element being parsed; `file` collects the top-level Enum, Bitset, Array
+# and Reg objects in document order, and `xml_files` records the absolute
+# path of every file parsed.
+# Note: `stack`, `path`, `cdata`, `current_reg` and `current_enum` are not
+# initialised here; they are assigned by parse() and the element handlers.
+class Parser(object):
+ def __init__(self):
+ self.current_array = None
+ self.current_domain = None
+ self.current_prefix = None
+ self.current_prefix_type = None
+ self.current_stripe = None
+ self.current_bitset = None
+ self.current_bitsize = 32
+ # The varset attribute on the domain specifies the enum which
+ # specifies all possible hw variants:
+ self.current_varset = None
+ # Regs that have multiple variants.. we only generated the C++
+ # template based struct-packers for these
+ self.variant_regs = {}
+ # Information in which contexts regs are used, to be used in
+ # debug options
+ self.usage_regs = collections.defaultdict(list)
+ self.bitsets = {}
+ self.enums = {}
+ self.variants = set()
+ self.file = []
+ self.xml_files = []
+ self.copyright_year = None
+ self.authors = []
+ self.license = None
+
+ # Build (not raise) an Error whose text is prefixed with the file name,
+ # line and column of the innermost parser on the stack. Callers raise
+ # the returned object themselves.
+ def error(self, message):
+ parser, filename = self.stack[-1]
+ return Error("%s:%d:%d: %s" % (filename, parser.CurrentLineNumber, parser.CurrentColumnNumber, message))
+
+ # Return the name prefix for definitions in the current scope, in order
+ # of precedence: the variant itself (when the domain's prefix type is
+ # "variant" and a variant is given), "<stripe>_<domain>",
+ # "<prefix>_<domain>", or just the domain name.
+ def prefix(self, variant=None):
+ if self.current_prefix_type == "variant" and variant:
+ return variant
+ elif self.current_stripe:
+ return self.current_stripe + "_" + self.current_domain
+ elif self.current_prefix:
+ return self.current_prefix + "_" + self.current_domain
+ else:
+ return self.current_domain
+
+ # Create a Field from the element attributes and append it to the
+ # current bitset. The bit range comes from "pos" (single bit), else
+ # from "high"+"low", else it spans the whole register. "type" and "shr"
+ # are optional; "radix" is read for fixed/ufixed types. A ValueError
+ # (e.g. from a malformed integer attribute) is re-raised as a positioned
+ # Error.
+ def parse_field(self, name, attrs):
+ try:
+ if "pos" in attrs:
+ high = low = int(attrs["pos"], 0)
+ elif "high" in attrs and "low" in attrs:
+ high = int(attrs["high"], 0)
+ low = int(attrs["low"], 0)
+ else:
+ low = 0
+ high = self.current_bitsize - 1
+
+ if "type" in attrs:
+ type = attrs["type"]
+ else:
+ type = None
+
+ if "shr" in attrs:
+ shr = int(attrs["shr"], 0)
+ else:
+ shr = 0
+
+ b = Field(name, low, high, shr, type, self)
+
+ if type == "fixed" or type == "ufixed":
+ b.radix = int(attrs["radix"], 0)
+
+ self.current_bitset.fields.append(b)
+ except ValueError as e:
+ raise self.error(e)
+
+ # Return the Enum named by the element's "varset" attribute, or the
+ # enclosing domain's varset (possibly None) when the attribute is absent.
+ def parse_varset(self, attrs):
+ # Inherit the varset from the enclosing domain if not overriden:
+ varset = self.current_varset
+ if "varset" in attrs:
+ varset = self.enums[attrs["varset"]]
+ return varset
+
+ # Return the first variant named in the "variants" attribute, or None if
+ # the attribute is absent. Only the first comma-separated entry is used,
+ # and a range such as "A-B" or "A-" is reduced to its start "A". The
+ # result must be an enumerator of the applicable varset (asserted).
+ def parse_variants(self, attrs):
+ if not "variants" in attrs:
+ return None
+ variant = attrs["variants"].split(",")[0]
+ if "-" in variant:
+ variant = variant[:variant.index("-")]
+
+ varset = self.parse_varset(attrs)
+
+ assert varset.has_name(variant)
+
+ return variant
+
+ # Record `reg` in variant_regs[reg.name] under its variant, falling back
+ # to `parent_variant` when the element names none. When the register
+ # name already has an entry, the new register must have the same bit
+ # size as an existing one (asserted).
+ def add_all_variants(self, reg, attrs, parent_variant):
+ # TODO this should really handle *all* variants, including dealing
+ # with open ended ranges (ie. "A2XX,A4XX-") (we have the varset
+ # enum now to make that possible)
+ variant = self.parse_variants(attrs)
+ if not variant:
+ variant = parent_variant
+
+ if reg.name not in self.variant_regs:
+ self.variant_regs[reg.name] = {}
+ else:
+ # All variants must be same size:
+ v = next(iter(self.variant_regs[reg.name]))
+ assert self.variant_regs[reg.name][v].bit_size == reg.bit_size
+
+ self.variant_regs[reg.name][variant] = reg
+
+ # Append `reg` to usage_regs for each usage in `usages` and add the
+ # register's domain to self.variants. Does nothing when `usages` is
+ # empty or None.
+ def add_all_usages(self, reg, usages):
+ if not usages:
+ return
+
+ for usage in usages:
+ self.usage_regs[usage].append(reg)
+
+ self.variants.add(reg.domain)
+
+ # Validate the file currently being parsed against its XML schema using
+ # lxml. `schemafile` is the raw xsi:schemaLocation attribute value. The
+ # schema is looked up next to the XML file, then one directory up. A
+ # missing schema or a validation failure raises a positioned Error; if
+ # lxml cannot be imported, a notice is printed to stderr and validation
+ # is skipped.
+ def do_validate(self, schemafile):
+ try:
+ from lxml import etree
+
+ parser, filename = self.stack[-1]
+ dirname = os.path.dirname(filename)
+
+ # we expect this to look like <namespace url> schema.xsd.. I think
+ # technically it is supposed to be just a URL, but that doesn't
+ # quite match up to what we do.. Just skip over everything up to
+ # and including the first whitespace character:
+ # NOTE(review): the code uses rindex(), i.e. it cuts after the LAST
+ # space, and raises ValueError when the value has no space at all.
+ schemafile = schemafile[schemafile.rindex(" ")+1:]
+
+ # this is a bit cheezy, but the xml file to validate could be
+ # in a child director, ie. we don't really know where the schema
+ # file is, the way the rnn C code does. So if it doesn't exist
+ # just look one level up
+ if not os.path.exists(dirname + "/" + schemafile):
+ schemafile = "../" + schemafile
+
+ if not os.path.exists(dirname + "/" + schemafile):
+ raise self.error("Cannot find schema for: " + filename)
+
+ xmlschema_doc = etree.parse(dirname + "/" + schemafile)
+ xmlschema = etree.XMLSchema(xmlschema_doc)
+
+ xml_doc = etree.parse(filename)
+ if not xmlschema.validate(xml_doc):
+ error_str = str(xmlschema.error_log.filter_from_errors()[0])
+ raise self.error("Schema validation failed for: " + filename + "\n" + error_str)
+ except ImportError:
+ print("lxml not found, skipping validation", file=sys.stderr)
+
+ def do_parse(self, filename):
+ filepath = os.path.abspath(filename)
+ if filepath in self.xml_files:
+ return
+ self.xml_files.append(filepath)
+ file = open(filename, "rb")
+ parser = xml.parsers.expat.ParserCreate()
+ self.stack.append((parser, filename))
+ parser.StartElementHandler = self.start_element
+ parser.EndElementHandler = self.end_element
+ parser.CharacterDataHandler = self.character_data
+ parser.buffer_text = True
+ parser.ParseFile(file)
+ self.stack.pop()
+ file.close()
+
+ # Entry point: parse `filename`. `rnn_path` is the directory against
+ # which <import file="..."> references are resolved. Resets the parser
+ # stack before starting.
+ def parse(self, rnn_path, filename):
+ self.path = rnn_path
+ self.stack = []
+ self.do_parse(filename)
+
+ # Handle a <reg32>/<reg64> start tag: choose or create the register's
+ # bitset, construct the Reg, and register it with the file list (only
+ # for the top-level file), the variant table and the usage table.
+ def parse_reg(self, attrs, bit_size):
+ self.current_bitsize = bit_size
+ # A "type" naming a known bitset reuses it: an inline bitset is
+ # copied into a fresh per-register Bitset, a non-inline one is shared.
+ # Otherwise the register gets its own inline bitset, seeded with one
+ # unnamed field when a "type" attribute is present.
+ if "type" in attrs and attrs["type"] in self.bitsets:
+ bitset = self.bitsets[attrs["type"]]
+ if bitset.inline:
+ self.current_bitset = Bitset(attrs["name"], bitset)
+ self.current_bitset.inline = True
+ else:
+ self.current_bitset = bitset
+ else:
+ self.current_bitset = Bitset(attrs["name"], None)
+ self.current_bitset.inline = True
+ if "type" in attrs:
+ self.parse_field(None, attrs)
+
+ variant = self.parse_variants(attrs)
+ if not variant and self.current_array:
+ variant = self.current_array.variant
+
+ self.current_reg = Reg(attrs, self.prefix(variant), self.current_array, bit_size)
+ self.current_reg.bitset = self.current_bitset
+
+ # Only definitions from the top-level file (stack depth 1) are dumped.
+ if len(self.stack) == 1:
+ self.file.append(self.current_reg)
+
+ if variant is not None:
+ self.add_all_variants(self.current_reg, attrs, variant)
+
+ # The register's own "usage" attribute wins over the enclosing array's.
+ usages = None
+ if "usage" in attrs:
+ usages = attrs["usage"].split(',')
+ elif self.current_array:
+ usages = self.current_array.usages
+
+ self.add_all_usages(self.current_reg, usages)
+
+ # Expat start-tag handler: dispatch on the element name and update the
+ # parser state. Character data collected so far is reset for every
+ # element. Objects are appended to self.file only while parsing the
+ # top-level file (stack depth 1), not imported ones.
+ def start_element(self, name, attrs):
+ self.cdata = ""
+ if name == "import":
+ filename = attrs["file"]
+ self.do_parse(os.path.join(self.path, filename))
+ elif name == "domain":
+ self.current_domain = attrs["name"]
+ if "prefix" in attrs:
+ self.current_prefix = self.parse_variants(attrs)
+ self.current_prefix_type = attrs["prefix"]
+ else:
+ self.current_prefix = None
+ self.current_prefix_type = None
+ if "varset" in attrs:
+ self.current_varset = self.enums[attrs["varset"]]
+ elif name == "stripe":
+ self.current_stripe = self.parse_variants(attrs)
+ elif name == "enum":
+ self.current_enum_value = 0
+ self.current_enum = Enum(attrs["name"])
+ self.enums[attrs["name"]] = self.current_enum
+ if len(self.stack) == 1:
+ self.file.append(self.current_enum)
+ elif name == "value":
+ # NOTE(review): current_enum_value is never advanced in the code
+ # shown, so every <value> without an explicit "value" attribute
+ # gets 0 -- confirm this is intended.
+ if "value" in attrs:
+ value = int(attrs["value"], 0)
+ else:
+ value = self.current_enum_value
+ self.current_enum.values.append((attrs["name"], value))
+ elif name == "reg32":
+ self.parse_reg(attrs, 32)
+ elif name == "reg64":
+ self.parse_reg(attrs, 64)
+ elif name == "array":
+ self.current_bitsize = 32
+ variant = self.parse_variants(attrs)
+ self.current_array = Array(attrs, self.prefix(variant), variant)
+ if len(self.stack) == 1:
+ self.file.append(self.current_array)
+ elif name == "bitset":
+ self.current_bitset = Bitset(attrs["name"], None)
+ if "inline" in attrs and attrs["inline"] == "yes":
+ self.current_bitset.inline = True
+ self.bitsets[self.current_bitset.name] = self.current_bitset
+ if len(self.stack) == 1 and not self.current_bitset.inline:
+ self.file.append(self.current_bitset)
+ elif name == "bitfield" and self.current_bitset:
+ self.parse_field(attrs["name"], attrs)
+ elif name == "database":
+ self.do_validate(attrs["xsi:schemaLocation"])
+ elif name == "copyright":
+ self.copyright_year = attrs["year"]
+ elif name == "author":
+ # NOTE(review): the author's name is appended both before and
+ # after the e-mail address -- confirm the repetition is intended.
+ self.authors.append(attrs["name"] + " <" + attrs["email"] + "> " + attrs["name"])
+
+ def end_element(self, name):
+ if name == "domain":
+ self.current_domain = None
+ self.current_prefix = None
+ self.current_prefix_type = None
+ elif name == "stripe":
+ self.current_stripe = None
+ elif name == "bitset":
+ self.current_bitset = None
+ elif name == "reg32":
+ self.current_reg = None
+ elif name == "array":
+ self.current_array = None
+ elif name == "enum":
+ self.current_enum = None
+ elif name == "license":
+ self.license = self.cdata
+
+ # Expat character-data handler: accumulate text; start_element resets it.
+ def character_data(self, data):
+ self.cdata += data
+
+ # Print, inside an "#ifdef __cplusplus" section, one empty primary
+ # template "<USAGE>_REGS" per usage and one specialization per
+ # (usage, variant) pair listing the sorted register offsets. Array
+ # registers contribute one offset per element; 64-bit registers also
+ # contribute the following offset.
+ # NOTE(review): indentation is not visible in this view, so the exact
+ # nesting of the loops below could not be checked -- confirm.
+ def dump_reg_usages(self):
+ d = collections.defaultdict(list)
+ for usage, regs in self.usage_regs.items():
+ for reg in regs:
+ variants = self.variant_regs.get(reg.name)
+ if variants:
+ for variant, vreg in variants.items():
+ if reg == vreg:
+ d[(usage, variant)].append(reg)
+ else:
+ for variant in self.variants:
+ d[(usage, variant)].append(reg)
+
+ print("#ifdef __cplusplus")
+
+ for usage, regs in self.usage_regs.items():
+ print("template<chip CHIP> constexpr inline uint16_t %s_REGS[] = {};" % (usage.upper()))
+
+ for (usage, variant), regs in d.items():
+ offsets = []
+
+ for reg in regs:
+ if reg.array:
+ for i in range(reg.array.length):
+ offsets.append(reg.array.offset + reg.offset + i * reg.array.stride)
+ if reg.bit_size == 64:
+ offsets.append(offsets[-1] + 1)
+ else:
+ offsets.append(reg.offset)
+ if reg.bit_size == 64:
+ offsets.append(offsets[-1] + 1)
+
+ offsets.sort()
+
+ print("template<> constexpr inline uint16_t %s_REGS<%s>[] = {" % (usage.upper(), variant))
+ for offset in offsets:
+ print("\t%s," % hex(offset))
+ print("};")
+
+ print("#endif")
+
+ # Print all top-level definitions grouped as enums, then bitsets, then
+ # everything else (registers and arrays), followed by the per-usage
+ # register tables.
+ def dump(self):
+ enums = []
+ bitsets = []
+ regs = []
+ for e in self.file:
+ if isinstance(e, Enum):
+ enums.append(e)
+ elif isinstance(e, Bitset):
+ bitsets.append(e)
+ else:
+ regs.append(e)
+
+ for e in enums + bitsets + regs:
+ e.dump()
+
+ self.dump_reg_usages()
+
+
+ # Print one Python enum member line per top-level Reg, in file order.
+ def dump_regs_py(self):
+ regs = []
+ for e in self.file:
+ if isinstance(e, Reg):
+ regs.append(e)
+
+ for e in regs:
+ e.dump_py()
+
+
+ # For a register name that exists in more than one variant, print a
+ # C++-only struct "__<regname>" holding the union of the variants'
+ # fields, a template function that picks the builder matching the
+ # variant template argument, and a variadic convenience macro.
+ #   regname:  register name shared by the variants
+ #   variants: dict mapping variant name -> Reg
+ # NOTE(review): indentation is not visible in this view, so the nesting
+ # of the loops below could not be checked -- confirm.
+ def dump_reg_variants(self, regname, variants):
+ # Don't bother for things that only have a single variant:
+ if len(variants) == 1:
+ return
+ print("#ifdef __cplusplus")
+ print("struct __%s {" % regname)
+ # TODO be more clever.. we should probably figure out which
+ # fields have the same type in all variants (in which they
+ # appear) and stuff everything else in a variant specific
+ # sub-structure.
+ seen_fields = []
+ bit_size = 32
+ array = False
+ address = None
+ for variant in variants.keys():
+ print(" /* %s fields: */" % variant)
+ reg = variants[variant]
+ bit_size = reg.bit_size
+ array = reg.array
+ for f in reg.bitset.fields:
+ fld_name = field_name(reg, f)
+ # A field name already emitted for another variant is skipped.
+ if fld_name in seen_fields:
+ continue
+ seen_fields.append(fld_name)
+ name = fld_name.lower()
+ # Only the first address field seen contributes bo/bo_offset.
+ if f.type in [ "address", "waddress" ]:
+ if address:
+ continue
+ address = f
+ tab_to(" __bo_type", "bo;")
+ tab_to(" uint32_t", "bo_offset;")
+ continue
+ type, val = f.ctype("var")
+ tab_to(" %s" %type, "%s;" %name)
+ print(" /* fallback fields: */")
+ if bit_size == 64:
+ tab_to(" uint64_t", "unknown;")
+ tab_to(" uint64_t", "qword;")
+ else:
+ tab_to(" uint32_t", "unknown;")
+ tab_to(" uint32_t", "dword;")
+ print("};")
+ # TODO don't hardcode the varset enum name
+ varenum = "chip"
+ print("template <%s %s>" % (varenum, varenum.upper()))
+ print("static inline struct fd_reg_pair")
+ xtra = ""
+ xtravar = ""
+ if array:
+ xtra = "int __i, "
+ xtravar = "__i, "
+ print("__%s(%sstruct __%s fields) {" % (regname, xtra, regname))
+ # Generated code is an if/else-if chain over the variants that ends
+ # in an assert for an unknown variant.
+ for variant in variants.keys():
+ print(" if (%s == %s) {" % (varenum.upper(), variant))
+ reg = variants[variant]
+ reg.dump_regpair_builder()
+ print(" } else")
+ print(" assert(!\"invalid variant\");")
+ print("}")
+
+ if bit_size == 64:
+ skip = ", { .reg = 0 }"
+ else:
+ skip = ""
+
+ print("#define %s(VARIANT, %s...) __%s<VARIANT>(%s{__VA_ARGS__})%s" % (regname, xtravar, regname, xtravar, skip))
+ print("#endif /* __cplusplus */")
+
+ # Print the pack structs for every top-level definition, then the C++
+ # variant dispatchers for each register name recorded in variant_regs.
+ def dump_structs(self):
+ for e in self.file:
+ e.dump_pack_struct()
+
+ for regname in self.variant_regs:
+ self.dump_reg_variants(regname, self.variant_regs[regname])
diff --git a/src/gallium/drivers/rocket/intercept.c b/src/gallium/drivers/rocket/intercept.c
new file mode 100644
index 00000000000..6ffb8647d61
--- /dev/null
+++ b/src/gallium/drivers/rocket/intercept.c
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2025 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <linux/limits.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "drm-uapi/rknpu_ioctl.h"
+#include "rkt_registers.h"
+
+// #define GETENV 1
+
+/* One buffer object recorded by handle_mem_create() from the arguments of
+ * a MEM_CREATE ioctl. */
+struct bo {
+ /* Copied from rknpu_mem_create.handle; map_bo() passes it to MEM_MAP. */
+ int handle;
+ /* Copied from rknpu_mem_create.size; used as mmap length and range size. */
+ unsigned size;
+ /* Copied from rknpu_mem_create.obj_addr; handle_submit() matches it
+  * against rknpu_submit.task_obj_addr. */
+ uint64_t obj_addr;
+ /* Copied from rknpu_mem_create.dma_addr; find_bo() searches the range
+  * [dma_addr, dma_addr + size). */
+ uint64_t dma_addr;
+};
+
+/* Capacity of the BO table in struct context. */
+#define MAX_BOS 3000
+
+/* Process-wide state of the interposer. */
+struct context {
+ /* Not referenced by any function in this file. */
+ int dump_file;
+ /* fd of the most recent MEM_CREATE ioctl; map_bo() issues MEM_MAP and
+  * mmap() on it. */
+ int device_fd;
+ /* BOs recorded so far, in the order they were created. */
+ struct bo bos[MAX_BOS];
+ /* Number of valid entries in bos[]; also the index of the next free slot. */
+ unsigned next_handle_id;
+};
+
+struct context context = {0};
+
+/**
+ * Append one printf-style formatted message to "rknpu.log" in the current
+ * working directory.
+ *
+ * The log file is opened (created if missing) and closed again on every
+ * call. When it cannot be opened the message is dropped, instead of
+ * passing an invalid descriptor to vdprintf() and close().
+ *
+ * format  printf-style format string, followed by its arguments.
+ */
+static void
+dump_log(const char *format, ...)
+{
+   int dump_fd = open("rknpu.log", O_CREAT | O_RDWR | O_APPEND,
+                      S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+   if (dump_fd < 0)
+      return;
+
+   va_list args;
+   va_start(args, format);
+   vdprintf(dump_fd, format, args);
+   va_end(args);
+
+   close(dump_fd);
+}
+
+/* Shorthand for dump_log(). The expansion carries no trailing semicolon, so
+ * "L(...);" is exactly one statement and stays valid in an unbraced
+ * if/else. */
+#define L(...) dump_log(__VA_ARGS__)
+
+/**
+ * Map a recorded buffer object into this process.
+ *
+ * Queries the mmap offset of the BO with DRM_IOCTL_RKNPU_MEM_MAP on
+ * context.device_fd, then maps bo->size bytes of that fd at the returned
+ * offset, readable, writable and shared.
+ *
+ * Returns the address of the mapping, or MAP_FAILED when bo is NULL, the
+ * MEM_MAP ioctl fails or mmap() fails. Every call creates a new mapping;
+ * releasing it with munmap(addr, bo->size) is up to the caller.
+ */
+static void *
+map_bo(struct bo *bo)
+{
+   struct rknpu_mem_map req = {0};
+
+   if (bo == NULL)
+      return MAP_FAILED;
+
+   req.handle = bo->handle;
+
+   /* If the ioctl fails req.offset is still zero; mapping offset 0 would
+    * hand back memory that does not belong to this BO. */
+   if (ioctl(context.device_fd, DRM_IOCTL_RKNPU_MEM_MAP, &req) != 0)
+      return MAP_FAILED;
+
+   return mmap(NULL, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED,
+               context.device_fd, req.offset);
+}
+
+/**
+ * Find the recorded BO whose DMA range contains dma_address.
+ *
+ * dma_address  address to look up.
+ * offset       on a hit, receives the byte offset of dma_address from the
+ *              start of the BO; left untouched on a miss.
+ *
+ * Returns the matching entry of context.bos[], or NULL when no recorded BO
+ * covers the address. Each probed entry is traced on stderr.
+ */
+static struct bo *
+find_bo(uint64_t dma_address, unsigned *offset)
+{
+   for (unsigned j = 0; j < context.next_handle_id; j++) {
+      struct bo *candidate = &context.bos[j];
+
+      /* uint64_t is not "unsigned long" on every ABI, so convert explicitly
+       * to the type %llx expects. */
+      fprintf(stderr, "needle %llx hay %llx i %u\n",
+              (unsigned long long)dma_address,
+              (unsigned long long)candidate->dma_addr, j);
+
+      /* Compare the distance from the base instead of base + size, which
+       * could wrap around for a BO at the top of the address space. */
+      if (dma_address >= candidate->dma_addr &&
+          dma_address - candidate->dma_addr < candidate->size) {
+         *offset = (unsigned)(dma_address - candidate->dma_addr);
+         return candidate;
+      }
+   }
+
+   return NULL;
+}
+
+/**
+ * Write device memory starting at dma_address to the file "name".
+ *
+ * name         path of the output file; created, or truncated if present.
+ * dma_address  DMA address inside a BO recorded by handle_mem_create().
+ * size         number of bytes to write. 0, or a count that would run past
+ *              the end of the BO, means "up to the end of the BO".
+ *
+ * Nothing is written when the address is not inside a recorded BO, when
+ * the BO cannot be mapped, or when the file cannot be opened. The
+ * temporary mapping of the BO is released before returning.
+ */
+static void
+dump_buffer(const char *name, uint64_t dma_address, unsigned size)
+{
+   unsigned offset = 0;
+   struct bo *bo = find_bo(dma_address, &offset);
+
+   fprintf(stderr, "dump_buffer name %s dma 0x%llx size %u bo %p\n", name,
+           (unsigned long long)dma_address, size, (void *)bo);
+
+   /* Callers pass addresses scraped from command streams (possibly 0), so
+    * a miss is an expected outcome rather than a programming error. */
+   if (bo == NULL)
+      return;
+
+   /* find_bo() guarantees offset < bo->size, so the subtraction cannot
+    * wrap, unlike size + offset. */
+   if (size == 0 || size > bo->size - offset)
+      size = bo->size - offset;
+
+   void *map = map_bo(bo);
+   if (map == MAP_FAILED) {
+      fprintf(stderr, "dump_buffer: cannot map bo for %s\n", name);
+      return;
+   }
+
+   int fd = open(name, O_CREAT | O_RDWR | O_TRUNC,
+                 S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+   if (fd < 0) {
+      fprintf(stderr, "dump_buffer: cannot open %s\n", name);
+   } else {
+      const char *cursor = (const char *)map + offset;
+      unsigned remaining = size;
+
+      /* write() may accept fewer bytes than requested; continue until
+       * everything is out or it reports an error. */
+      while (remaining > 0) {
+         ssize_t written = write(fd, cursor, remaining);
+         if (written <= 0) {
+            fprintf(stderr, "dump_buffer: short write to %s\n", name);
+            break;
+         }
+         cursor += written;
+         remaining -= (unsigned)written;
+      }
+
+      close(fd);
+   }
+
+   munmap(map, bo->size);
+}
+
+/* Running number used in the names of the per-task dump files
+ * (regcmdN.bin, weightsN.bin, ...). ioctl() also reads it for outputN.bin. */
+static unsigned task_id = 0;
+
+/* Dump the BO contents found at "address" to "<kind><task_id>.bin".
+ * Does nothing when address is 0. */
+static void
+dump_task_buffer(const char *kind, uint32_t address)
+{
+   char name[PATH_MAX];
+
+   if (address == 0x0)
+      return;
+
+   snprintf(name, sizeof(name), "%s%u.bin", kind, task_id);
+   dump_buffer(name, address, 0);
+}
+
+/**
+ * Log a DRM_IOCTL_RKNPU_SUBMIT request and dump the buffers it references.
+ *
+ * The request is appended to the log as a C initializer. The task array is
+ * then located through args->task_obj_addr and every walked task is logged.
+ * For each task with a non-zero regcmd_addr the register command stream is
+ * written to regcmdN.bin and scanned for the input, weights, output, biases
+ * and element-wise buffer addresses. All of those except the output are
+ * dumped right away to inputN.bin, weightsN.bin, biasesN.bin and
+ * eltwiseN.bin.
+ *
+ * args            the submit request passed to ioctl().
+ * output_address  receives the first REG_DPU_DST_BASE_ADDR value found in
+ *                 the last task that had a command stream (0 if none).
+ *
+ * Returns 0, or -1 when the task BO is unknown, cannot be mapped or is too
+ * small for the requested tasks.
+ */
+static int
+handle_submit(struct rknpu_submit *args, uint32_t *output_address)
+{
+   int ret = 0;
+
+   L("struct rknpu_submit submit = {\n");
+   L(" .flags = %x,\n", args->flags);
+   L(" .timeout = %d,\n", args->timeout);
+   L(" .task_start = %d,\n", args->task_start);
+   L(" .task_number = %d,\n", args->task_number);
+   L(" .task_counter = %d,\n", args->task_counter);
+   L(" .priority = %d,\n", args->priority);
+   L(" .task_obj_addr = 0x%llx,\n", args->task_obj_addr);
+   L(" .regcfg_obj_addr = 0x%llx,\n", args->regcfg_obj_addr);
+   L(" .task_base_addr = 0x%llx,\n", args->task_base_addr);
+   L(" .user_data = 0x%llx,\n", args->user_data);
+   L(" .core_mask = %x,\n", args->core_mask);
+   L(" .fence_fd = %d,\n", args->fence_fd);
+   L(" .subcore_task = {\n");
+   for (int core = 0; core < 3; core++) {
+      L(" {\n");
+      L(" .task_start = %d,\n", args->subcore_task[core].task_start);
+      L(" .task_number = %d,\n", args->subcore_task[core].task_number);
+      L(" },\n");
+   }
+   L(" },\n");
+   L("};\n");
+
+   struct bo *task_bo = NULL;
+   for (unsigned i = 0; i < context.next_handle_id; i++) {
+      if (context.bos[i].obj_addr == args->task_obj_addr) {
+         task_bo = &context.bos[i];
+         break;
+      }
+   }
+
+   /* The task BO may have been created before this library started
+    * recording, or through a path it does not hook. */
+   if (task_bo == NULL) {
+      fprintf(stderr, "%s: no bo recorded for task_obj_addr 0x%llx\n",
+              __func__, (unsigned long long)args->task_obj_addr);
+      return -1;
+   }
+
+   struct rknpu_task *tasks = map_bo(task_bo);
+   if (tasks == MAP_FAILED) {
+      fprintf(stderr, "%s: cannot map the task bo\n", __func__);
+      return -1;
+   }
+
+   /* NOTE(review): only task_number / 3 tasks are walked; presumably this
+    * relates to the three subcore_task entries. Confirm. */
+   for (int i = args->task_start; i < args->task_start + args->task_number / 3;
+        i++) {
+      /* Stop instead of reading past the end of the task BO mapping. */
+      if (((uint64_t)(unsigned)i + 1) * sizeof(*tasks) > task_bo->size) {
+         fprintf(stderr, "%s: task %d lies outside the task bo\n", __func__,
+                 i);
+         ret = -1;
+         break;
+      }
+
+      L("tasks[%d].flags = 0x%x;\n", i, tasks[i].flags);
+      L("tasks[%d].op_idx = %d;\n", i, tasks[i].op_idx);
+      L("tasks[%d].enable_mask = 0x%x;\n", i, tasks[i].enable_mask);
+      L("tasks[%d].int_mask = 0x%x;\n", i, tasks[i].int_mask);
+      L("tasks[%d].int_clear = 0x%x;\n", i, tasks[i].int_clear);
+      L("tasks[%d].regcfg_amount = %d;\n", i, tasks[i].regcfg_amount);
+      L("tasks[%d].regcfg_offset = 0x%x;\n", i, tasks[i].regcfg_offset);
+      L("tasks[%d].regcmd_addr = 0x%llx;\n", i, tasks[i].regcmd_addr);
+
+      if (tasks[i].regcmd_addr == 0x0)
+         continue;
+
+      char name[PATH_MAX];
+      uint64_t regcmd_dma = tasks[i].regcmd_addr + tasks[i].regcfg_offset;
+      unsigned regcmd_count =
+         tasks[i].regcfg_amount + RKNPU_PC_DATA_EXTRA_AMOUNT;
+      unsigned size = regcmd_count * sizeof(uint64_t);
+
+      snprintf(name, sizeof(name), "regcmd%u.bin", task_id);
+      dump_buffer(name, regcmd_dma, size);
+
+      uint32_t input_address = 0x0;
+      *output_address = 0x0;
+      uint32_t weights_address = 0x0;
+      uint32_t biases_address = 0x0;
+      uint32_t eltwise_address = 0x0;
+
+      unsigned offset = 0;
+      struct bo *bo = find_bo(regcmd_dma, &offset);
+      void *map = bo != NULL ? map_bo(bo) : MAP_FAILED;
+
+      if (map == MAP_FAILED) {
+         fprintf(stderr, "%s: cannot read the regcmd of task %d\n", __func__,
+                 i);
+      } else {
+         const uint64_t *regcmd =
+            (const uint64_t *)((const char *)map + offset);
+
+         /* Never scan beyond the BO, whatever regcfg_amount claims. */
+         unsigned available = (bo->size - offset) / sizeof(uint64_t);
+         if (regcmd_count > available)
+            regcmd_count = available;
+
+         for (unsigned j = 0; j < regcmd_count; j++) {
+            /* The low 16 bits select the register; bits 16..47 carry the
+             * 32-bit value written to it. */
+            uint32_t value = (regcmd[j] & 0xffffffff0000) >> 16;
+
+            switch (regcmd[j] & 0xffff) {
+            case REG_CNA_FEATURE_DATA_ADDR:
+               input_address = value;
+               break;
+            case REG_CNA_DCOMP_ADDR0:
+               weights_address = value;
+               break;
+            case REG_DPU_DST_BASE_ADDR:
+               /* Only the first destination of the task is kept. */
+               if (*output_address == 0x0)
+                  *output_address = value;
+               break;
+            case REG_DPU_RDMA_RDMA_BS_BASE_ADDR:
+               biases_address = value;
+               break;
+            case REG_DPU_RDMA_RDMA_EW_BASE_ADDR:
+               eltwise_address = value;
+               break;
+            default:
+               break;
+            }
+         }
+
+         munmap(map, bo->size);
+      }
+
+      fprintf(stderr, "weights_address %x\n", weights_address);
+      fprintf(stderr, "input_address %x\n", input_address);
+      fprintf(stderr, "output_address %x\n", *output_address);
+      fprintf(stderr, "biases_address %x\n", biases_address);
+      fprintf(stderr, "eltwise_address %x\n", eltwise_address);
+
+      dump_task_buffer("weights", weights_address);
+      dump_task_buffer("biases", biases_address);
+      dump_task_buffer("eltwise", eltwise_address);
+      dump_task_buffer("input", input_address);
+
+      task_id++;
+   }
+
+   munmap(tasks, task_bo->size);
+
+   return ret;
+}
+
+/* Append the fields of a MEM_SYNC request to the log, formatted as a C
+ * initializer. */
+static void
+handle_mem_sync(struct rknpu_mem_sync *args)
+{
+   const struct rknpu_mem_sync *request = args;
+
+   L("struct rknpu_mem_sync sync = {\n");
+   L(" .flags = 0x%x,\n", request->flags);
+   L(" .reserved = 0x%x,\n", request->reserved);
+   L(" .obj_addr = 0x%llx,\n", request->obj_addr);
+   L(" .offset = 0x%llx,\n", request->offset);
+   L(" .size = %llu,\n", request->size);
+   L("};\n");
+}
+
+/**
+ * Record the buffer object described by a completed MEM_CREATE request, so
+ * that find_bo() and handle_submit() can resolve addresses to it later.
+ *
+ * args  the request structure; handle, size, obj_addr and dma_addr are
+ *       copied into the next free slot of context.bos[].
+ *
+ * Returns 0 when the BO was recorded, -1 when the table already holds
+ * MAX_BOS entries. In that case the BO is not tracked and a message is
+ * printed on stderr; an explicit check is used rather than assert() so the
+ * bound holds in NDEBUG builds and the traced process is never aborted.
+ */
+static int
+handle_mem_create(struct rknpu_mem_create *args)
+{
+#if 0
+ L("struct rknpu_mem_create create = {\n");
+ L(" .dma_addr = 0x%llx,\n", args->dma_addr);
+ L(" .flags = 0x%x,\n", args->flags);
+ L(" .handle = %u,\n", args->handle);
+ L(" .obj_addr = 0x%llx,\n", args->obj_addr);
+ L(" .size = %llu,\n", args->size);
+ L("};\n");
+#endif
+
+   if (context.next_handle_id >= MAX_BOS) {
+      fprintf(stderr, "%s: bo table full (%d entries), dma_addr %llx ignored\n",
+              __func__, MAX_BOS, (unsigned long long)args->dma_addr);
+      return -1;
+   }
+
+   struct bo *slot = &context.bos[context.next_handle_id];
+
+   slot->handle = args->handle;
+   slot->size = args->size;
+   slot->obj_addr = args->obj_addr;
+   slot->dma_addr = args->dma_addr;
+
+   fprintf(stderr, "%s: dma_addr %llx\n", __func__,
+           (unsigned long long)args->dma_addr);
+   context.next_handle_id++;
+
+   return 0;
+}
+
+/* Log an ACTION request: one line naming the action selected by
+ * args->flags together with args->value, or an "unhandled action" line
+ * for any other flag. */
+static void
+handle_action(struct rknpu_action *args)
+{
+   switch (args->flags) {
+   case RKNPU_GET_HW_VERSION:
+      L("%s: RKNPU_GET_HW_VERSION %x\n", __func__, args->value);
+      return;
+   case RKNPU_GET_DRV_VERSION:
+      L("%s: RKNPU_GET_DRV_VERSION %x\n", __func__, args->value);
+      return;
+   case RKNPU_POWER_ON:
+      L("%s: RKNPU_POWER_ON %x\n", __func__, args->value);
+      return;
+   case RKNPU_GET_IOMMU_EN:
+      L("%s: RKNPU_GET_IOMMU_EN %x\n", __func__, args->value);
+      return;
+   case RKNPU_SET_PROC_NICE:
+      L("%s: RKNPU_SET_PROC_NICE %x\n", __func__, args->value);
+      return;
+   case RKNPU_GET_FREQ:
+      L("%s: RKNPU_GET_FREQ %x\n", __func__, args->value);
+      return;
+   default:
+      L("%s: unhandled action %d %x\n", __func__, args->flags, args->value);
+      return;
+   }
+}
+
+typedef int (*real_ioctl_t)(int fd, unsigned long request, ...);
+
+/**
+ * Interposed ioctl(): observes RKNPU requests and forwards every call to
+ * the next ioctl() in symbol lookup order (dlsym with RTLD_NEXT).
+ *
+ * Before forwarding, SUBMIT requests go through handle_submit(), which logs
+ * the request and dumps its input buffers. After forwarding:
+ *  - a successful SUBMIT with a known output address has that buffer dumped
+ *    to outputN.bin;
+ *  - a successful MEM_CREATE is recorded with handle_mem_create() and its
+ *    fd is remembered as the device fd used for later mappings.
+ *
+ * Returns the result of the real ioctl(), or -1 when the real ioctl()
+ * cannot be resolved.
+ */
+int
+ioctl(int fd, unsigned long request, ...)
+{
+   int ret;
+   uint32_t output_address = 0;
+
+   va_list ap;
+   va_start(ap, request);
+   void *ptr_ = va_arg(ap, void *);
+   va_end(ap);
+
+   real_ioctl_t real_ioctl = (real_ioctl_t)dlsym(RTLD_NEXT, "ioctl");
+   if (real_ioctl == NULL) {
+      fprintf(stderr, "intercept: cannot resolve the real ioctl\n");
+      return -1;
+   }
+
+   switch (request) {
+   case DRM_IOCTL_RKNPU_SUBMIT:
+      handle_submit(ptr_, &output_address);
+      break;
+   case DRM_IOCTL_RKNPU_MEM_SYNC:
+      // handle_mem_sync(ptr_);
+      break;
+   case DRM_IOCTL_RKNPU_ACTION:
+      // handle_action(ptr_);
+      break;
+   default:
+      break;
+   }
+
+   ret = real_ioctl(fd, request, ptr_);
+
+   switch (request) {
+   case DRM_IOCTL_RKNPU_SUBMIT:
+      /* Without a destination address there is nothing to look up, and
+       * after a failed submission the buffer holds no result. */
+      if (ret >= 0 && output_address != 0) {
+         char name[PATH_MAX];
+
+         /* NOTE(review): handle_submit() advances task_id after each task
+          * it dumps, so this number is one past the matching regcmdN.bin.
+          * Confirm that this numbering is intended. */
+         snprintf(name, sizeof(name), "output%d.bin", task_id);
+         dump_buffer(name, output_address, 0);
+      }
+      break;
+   case DRM_IOCTL_RKNPU_MEM_CREATE:
+   case IOCTL_RKNPU_MEM_CREATE:
+   /* NOTE(review): raw request number; presumably another encoding of
+    * MEM_CREATE with a different argument size. Confirm. */
+   case 0xc0286442:
+      /* The driver fills in handle and addresses only on success. */
+      if (ret >= 0) {
+         handle_mem_create(ptr_);
+         context.device_fd = fd;
+      }
+      break;
+   default:
+      break;
+   }
+
+   return ret;
+}
+
+/* Intended to be called from GDB when the underlying memory is not directly
+ * accessible to it.
+ *
+ * Prints the memory at ptr to stderr as 32-bit hexadecimal words, four per
+ * line. Exactly bytes / 4 words are read; a trailing partial word is not
+ * printed and a last line may hold fewer than four words. */
+void dump_mem(uint32_t *ptr, unsigned bytes);
+
+void
+dump_mem(uint32_t *ptr, unsigned bytes)
+{
+   const unsigned words = bytes / sizeof(uint32_t);
+
+   for (unsigned i = 0; i < words; i++) {
+      /* End the line after every fourth word and after the final word. */
+      const char *separator = (i % 4 == 3 || i + 1 == words) ? "\n" : " ";
+
+      fprintf(stderr, "%08x%s", ptr[i], separator);
+   }
+}
+
+#ifdef GETENV
+typedef char *(*real_getenv_t)(const char *name);
+
+/* Interposed getenv(), compiled only when GETENV is defined: prints the
+ * name of every variable looked up to stderr, then returns whatever the
+ * next getenv() in symbol lookup order returns. */
+char *
+getenv(const char *name)
+{
+   real_getenv_t next_getenv = (real_getenv_t)dlsym(RTLD_NEXT, "getenv");
+
+   fprintf(stderr, "getenv %s\n", name);
+
+   return next_getenv(name);
+}
+
+#endif
diff --git a/src/gallium/drivers/rocket/meson.build b/src/gallium/drivers/rocket/meson.build
new file mode 100644
index 00000000000..f327154e328
--- /dev/null
+++ b/src/gallium/drivers/rocket/meson.build
@@ -0,0 +1,38 @@
+# Copyright 2019 Google, Inc
+# SPDX-License-Identifier: MIT
+
+# Generate rkt_registers.h by running gen_header.py in 'c-defines' mode on
+# registers.xml; the script's stdout becomes the header (capture : true).
+rocket_registers = custom_target(
+ 'rkt_registers.h',
+ input : ['gen_header.py', 'registers.xml'],
+ output : 'rkt_registers.h',
+ command : [prog_python, '@INPUT0@', '--rnn', '.', '--xml', '@INPUT1@', 'c-defines'],
+ capture : true,
+)
+
+# Driver sources compiled into librocket.
+files_rocket = files(
+ 'rkt_coefs.c',
+ 'rkt_device.c',
+ 'rkt_ml.c',
+ 'rkt_regcmd.c',
+ 'rkt_task.c',
+)
+
+# Static driver library; depends on the generated register header.
+librocket = static_library(
+ 'rocket',
+ [files_rocket, rocket_registers],
+ include_directories : [inc_gallium_aux, inc_gallium, inc_include, inc_src],
+ gnu_symbol_visibility : 'hidden',
+ dependencies : [idep_mesautil, dep_libdrm],
+)
+
+# What targets link against to get the driver: defines GALLIUM_ROCKET and
+# links the winsys and driver libraries.
+driver_rocket = declare_dependency(
+ compile_args : '-DGALLIUM_ROCKET',
+ link_with : [librocketwinsys, librocket]
+)
+
+# Shared library built from intercept.c, which defines its own ioctl() and
+# forwards to the next one via dlsym(RTLD_NEXT). Built with -g -O0.
+# NOTE(review): intercept.c calls dlsym() but no dl dependency is listed
+# here; confirm this links on C libraries that keep dlsym in libdl.
+shared_library('intercept',
+ [files('intercept.c'), rocket_registers],
+ include_directories : [inc_include],
+ dependencies : [dep_libdrm],
+ c_args: ['-Wno-error=missing-prototypes', '-g', '-O0']
+)
diff --git a/src/gallium/drivers/rocket/registers.xml b/src/gallium/drivers/rocket/registers.xml
new file mode 100644
index 00000000000..8410edab5f4
--- /dev/null
+++ b/src/gallium/drivers/rocket/registers.xml
@@ -0,0 +1,1179 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<database xmlns="http://nouveau.freedesktop.org/"
+xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd">
+
+<copyright year="2024">
+
+<author name="Tomeu Vizoso" email="tomeu@tomeuvizoso.net"><nick name="tomeu"/>
+Initial Author.
+</author>
+
+<license>
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+</license>
+
+</copyright>
+
+<enum name="target">
+ <value name="PC" value="0x100"/>
+ <value name="CNA" value="0x200"/>
+ <value name="CORE" value="0x800"/>
+ <value name="DPU" value="0x1000"/>
+ <value name="DPU_RDMA" value="0x2000"/>
+ <value name="PPU" value="0x4000"/>
+ <value name="PPU_RDMA" value="0x8000"/>
+ <value name="DDMA" value="0x10000"/>
+ <value name="SDMA" value="0x20000"/>
+ <value name="GLOBAL" value="0x40000"/>
+</enum>
+
+<domain name="PC" width="32">
+ <reg32 offset="0x0000" name="VERSION">
+ <bitfield name="VERSION" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x0004" name="VERSION_NUM">
+ <bitfield name="VERSION_NUM" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x0008" name="OPERATION_ENABLE">
+ <bitfield name="RESERVED_0" low="1" high="31" type="uint"/>
+ <bitfield name="OP_EN" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x0010" name="BASE_ADDRESS">
+ <bitfield name="PC_SOURCE_ADDR" low="4" high="31" type="uint"/>
+ <bitfield name="RESERVED_0" low="1" high="3" type="uint"/>
+ <bitfield name="PC_SEL" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x0014" name="REGISTER_AMOUNTS">
+ <bitfield name="RESERVED_0" low="16" high="31" type="uint"/>
+ <bitfield name="PC_DATA_AMOUNT" low="0" high="15" type="uint"/>
+ </reg32>
+ <reg32 offset="0x0020" name="INTERRUPT_MASK">
+ <bitfield name="RESERVED_0" low="14" high="31" type="uint"/>
+ <bitfield name="DMA_WRITE_ERROR" pos="13" type="boolean"/>
+ <bitfield name="DMA_READ_ERROR" pos="12" type="boolean"/>
+ <bitfield name="PPU_1" pos="11" type="boolean"/>
+ <bitfield name="PPU_0" pos="10" type="boolean"/>
+ <bitfield name="DPU_1" pos="9" type="boolean"/>
+ <bitfield name="DPU_0" pos="8" type="boolean"/>
+ <bitfield name="CORE_1" pos="7" type="boolean"/>
+ <bitfield name="CORE_0" pos="6" type="boolean"/>
+ <bitfield name="CNA_CSC_1" pos="5" type="boolean"/>
+ <bitfield name="CNA_CSC_0" pos="4" type="boolean"/>
+ <bitfield name="CNA_WEIGHT_1" pos="3" type="boolean"/>
+ <bitfield name="CNA_WEIGHT_0" pos="2" type="boolean"/>
+ <bitfield name="CNA_FEATURE_1" pos="1" type="boolean"/>
+ <bitfield name="CNA_FEATURE_0" pos="0" type="boolean"/>
+ </reg32>
+ <reg32 offset="0x0024" name="INTERRUPT_CLEAR">
+ <bitfield name="RESERVED_0" low="14" high="31" type="uint"/>
+ <bitfield name="DMA_WRITE_ERROR" pos="13" type="boolean"/>
+ <bitfield name="DMA_READ_ERROR" pos="12" type="boolean"/>
+ <bitfield name="PPU_1" pos="11" type="boolean"/>
+ <bitfield name="PPU_0" pos="10" type="boolean"/>
+ <bitfield name="DPU_1" pos="9" type="boolean"/>
+ <bitfield name="DPU_0" pos="8" type="boolean"/>
+ <bitfield name="CORE_1" pos="7" type="boolean"/>
+ <bitfield name="CORE_0" pos="6" type="boolean"/>
+ <bitfield name="CNA_CSC_1" pos="5" type="boolean"/>
+ <bitfield name="CNA_CSC_0" pos="4" type="boolean"/>
+ <bitfield name="CNA_WEIGHT_1" pos="3" type="boolean"/>
+ <bitfield name="CNA_WEIGHT_0" pos="2" type="boolean"/>
+ <bitfield name="CNA_FEATURE_1" pos="1" type="boolean"/>
+ <bitfield name="CNA_FEATURE_0" pos="0" type="boolean"/>
+ </reg32>
+ <reg32 offset="0x0028" name="INTERRUPT_STATUS">
+ <bitfield name="RESERVED_0" low="14" high="31" type="uint"/>
+ <bitfield name="DMA_WRITE_ERROR" pos="13" type="boolean"/>
+ <bitfield name="DMA_READ_ERROR" pos="12" type="boolean"/>
+ <bitfield name="PPU_1" pos="11" type="boolean"/>
+ <bitfield name="PPU_0" pos="10" type="boolean"/>
+ <bitfield name="DPU_1" pos="9" type="boolean"/>
+ <bitfield name="DPU_0" pos="8" type="boolean"/>
+ <bitfield name="CORE_1" pos="7" type="boolean"/>
+ <bitfield name="CORE_0" pos="6" type="boolean"/>
+ <bitfield name="CNA_CSC_1" pos="5" type="boolean"/>
+ <bitfield name="CNA_CSC_0" pos="4" type="boolean"/>
+ <bitfield name="CNA_WEIGHT_1" pos="3" type="boolean"/>
+ <bitfield name="CNA_WEIGHT_0" pos="2" type="boolean"/>
+ <bitfield name="CNA_FEATURE_1" pos="1" type="boolean"/>
+ <bitfield name="CNA_FEATURE_0" pos="0" type="boolean"/>
+ </reg32>
+ <reg32 offset="0x002C" name="INTERRUPT_RAW_STATUS">
+ <bitfield name="RESERVED_0" low="14" high="31" type="uint"/>
+ <bitfield name="DMA_WRITE_ERROR" pos="13" type="boolean"/>
+ <bitfield name="DMA_READ_ERROR" pos="12" type="boolean"/>
+ <bitfield name="PPU_1" pos="11" type="boolean"/>
+ <bitfield name="PPU_0" pos="10" type="boolean"/>
+ <bitfield name="DPU_1" pos="9" type="boolean"/>
+ <bitfield name="DPU_0" pos="8" type="boolean"/>
+ <bitfield name="CORE_1" pos="7" type="boolean"/>
+ <bitfield name="CORE_0" pos="6" type="boolean"/>
+ <bitfield name="CNA_CSC_1" pos="5" type="boolean"/>
+ <bitfield name="CNA_CSC_0" pos="4" type="boolean"/>
+ <bitfield name="CNA_WEIGHT_1" pos="3" type="boolean"/>
+ <bitfield name="CNA_WEIGHT_0" pos="2" type="boolean"/>
+ <bitfield name="CNA_FEATURE_1" pos="1" type="boolean"/>
+ <bitfield name="CNA_FEATURE_0" pos="0" type="boolean"/>
+ </reg32>
+ <reg32 offset="0x0030" name="TASK_CON">
+ <bitfield name="RESERVED_0" low="14" high="31" type="uint"/>
+ <bitfield name="TASK_COUNT_CLEAR" pos="13" type="uint"/>
+ <bitfield name="TASK_PP_EN" pos="12" type="uint"/>
+ <bitfield name="TASK_NUMBER" low="0" high="11" type="uint"/>
+ </reg32>
+ <reg32 offset="0x0034" name="TASK_DMA_BASE_ADDR">
+ <bitfield name="DMA_BASE_ADDR" low="4" high="31" type="uint"/>
+ <bitfield name="RESERVED_0" low="0" high="3" type="uint"/>
+ </reg32>
+ <reg32 offset="0x003C" name="TASK_STATUS">
+ <bitfield name="RESERVED_0" low="28" high="31" type="uint"/>
+ <bitfield name="TASK_STATUS" low="0" high="27" type="uint"/>
+ </reg32>
+</domain>
+<domain name="CNA" width="32">
+ <reg32 offset="0x1000" name="S_STATUS">
+ <bitfield name="RESERVED_0" low="18" high="31" type="uint"/>
+ <bitfield name="STATUS_1" low="16" high="17" type="uint"/>
+ <bitfield name="RESERVED_1" low="2" high="15" type="uint"/>
+ <bitfield name="STATUS_0" low="0" high="1" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1004" name="S_POINTER">
+ <bitfield name="RESERVED_0" low="17" high="31" type="uint"/>
+ <bitfield name="EXECUTER" pos="16" type="uint"/>
+ <bitfield name="RESERVED_1" low="6" high="15" type="uint"/>
+ <bitfield name="EXECUTER_PP_CLEAR" pos="5" type="uint"/>
+ <bitfield name="POINTER_PP_CLEAR" pos="4" type="uint"/>
+ <bitfield name="POINTER_PP_MODE" pos="3" type="uint"/>
+ <bitfield name="EXECUTER_PP_EN" pos="2" type="uint"/>
+ <bitfield name="POINTER_PP_EN" pos="1" type="uint"/>
+ <bitfield name="POINTER" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1008" name="OPERATION_ENABLE">
+ <bitfield name="RESERVED_0" low="1" high="31" type="uint"/>
+ <bitfield name="OP_EN" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x100C" name="CONV_CON1">
+ <bitfield name="RESERVED_0" pos="31" type="uint"/>
+ <bitfield name="NONALIGN_DMA" pos="30" type="uint"/>
+ <bitfield name="GROUP_LINE_OFF" pos="29" type="uint"/>
+ <bitfield name="RESERVED_1" low="17" high="28" type="uint"/>
+ <bitfield name="DECONV" pos="16" type="uint"/>
+ <bitfield name="ARGB_IN" low="12" high="15" type="uint"/>
+ <bitfield name="RESERVED_2" low="10" high="11" type="uint"/>
+ <bitfield name="PROC_PRECISION" low="7" high="9" type="uint"/>
+ <bitfield name="IN_PRECISION" low="4" high="6" type="uint"/>
+ <bitfield name="CONV_MODE" low="0" high="3" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1010" name="CONV_CON2">
+ <bitfield name="RESERVED_0" low="24" high="31" type="uint"/>
+ <bitfield name="KERNEL_GROUP" low="16" high="23" type="uint"/>
+ <bitfield name="RESERVED_1" low="14" high="15" type="uint"/>
+ <bitfield name="FEATURE_GRAINS" low="4" high="13" type="uint"/>
+ <bitfield name="RESERVED_2" pos="3" type="uint"/>
+ <bitfield name="CSC_WO_EN" pos="2" type="uint"/>
+ <bitfield name="CSC_DO_EN" pos="1" type="uint"/>
+ <bitfield name="CMD_FIFO_SRST" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1014" name="CONV_CON3">
+ <bitfield name="RESERVED_0" pos="31" type="uint"/>
+ <bitfield name="NN_MODE" low="28" high="30" type="uint"/>
+ <bitfield name="RESERVED_1" low="26" high="27" type="uint"/>
+ <bitfield name="ATROUS_Y_DILATION" low="21" high="25" type="uint"/>
+ <bitfield name="ATROUS_X_DILATION" low="16" high="20" type="uint"/>
+ <bitfield name="RESERVED_2" low="14" high="15" type="uint"/>
+ <bitfield name="DECONV_Y_STRIDE" low="11" high="13" type="uint"/>
+ <bitfield name="DECONV_X_STRIDE" low="8" high="10" type="uint"/>
+ <bitfield name="RESERVED_3" low="6" high="7" type="uint"/>
+ <bitfield name="CONV_Y_STRIDE" low="3" high="5" type="uint"/>
+ <bitfield name="CONV_X_STRIDE" low="0" high="2" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1020" name="DATA_SIZE0">
+ <bitfield name="RESERVED_0" low="27" high="31" type="uint"/>
+ <bitfield name="DATAIN_WIDTH" low="16" high="26" type="uint"/>
+ <bitfield name="RESERVED_1" low="11" high="15" type="uint"/>
+ <bitfield name="DATAIN_HEIGHT" low="0" high="10" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1024" name="DATA_SIZE1">
+ <bitfield name="RESERVED_0" low="30" high="31" type="uint"/>
+ <bitfield name="DATAIN_CHANNEL_REAL" low="16" high="29" type="uint"/>
+ <bitfield name="DATAIN_CHANNEL" low="0" high="15" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1028" name="DATA_SIZE2">
+ <bitfield name="RESERVED_0" low="11" high="31" type="uint"/>
+ <bitfield name="DATAOUT_WIDTH" low="0" high="10" type="uint"/>
+ </reg32>
+ <reg32 offset="0x102C" name="DATA_SIZE3">
+ <bitfield name="RESERVED_0" low="24" high="31" type="uint"/>
+ <bitfield name="SURF_MODE" low="22" high="23" type="uint"/>
+ <bitfield name="DATAOUT_ATOMICS" low="0" high="21" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1030" name="WEIGHT_SIZE0">
+ <bitfield name="WEIGHT_BYTES" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1034" name="WEIGHT_SIZE1">
+ <bitfield name="RESERVED_0" low="19" high="31" type="uint"/>
+ <bitfield name="WEIGHT_BYTES_PER_KERNEL" low="0" high="18" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1038" name="WEIGHT_SIZE2">
+ <bitfield name="RESERVED_0" low="29" high="31" type="uint"/>
+ <bitfield name="WEIGHT_WIDTH" low="24" high="28" type="uint"/>
+ <bitfield name="RESERVED_1" low="21" high="23" type="uint"/>
+ <bitfield name="WEIGHT_HEIGHT" low="16" high="20" type="uint"/>
+ <bitfield name="RESERVED_2" low="14" high="15" type="uint"/>
+ <bitfield name="WEIGHT_KERNELS" low="0" high="13" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1040" name="CBUF_CON0">
+ <bitfield name="RESERVED_0" low="14" high="31" type="uint"/>
+ <bitfield name="WEIGHT_REUSE" pos="13" type="uint"/>
+ <bitfield name="DATA_REUSE" pos="12" type="uint"/>
+ <bitfield name="RESERVED_1" pos="11" type="uint"/>
+ <bitfield name="FC_DATA_BANK" low="8" high="10" type="uint"/>
+ <bitfield name="WEIGHT_BANK" low="4" high="7" type="uint"/>
+ <bitfield name="DATA_BANK" low="0" high="3" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1044" name="CBUF_CON1">
+ <bitfield name="RESERVED_0" low="14" high="31" type="uint"/>
+ <bitfield name="DATA_ENTRIES" low="0" high="13" type="uint"/>
+ </reg32>
+ <reg32 offset="0x104C" name="CVT_CON0">
+ <bitfield name="RESERVED_0" low="28" high="31" type="uint"/>
+ <bitfield name="CVT_TRUNCATE_3" low="22" high="27" type="uint"/>
+ <bitfield name="CVT_TRUNCATE_2" low="16" high="21" type="uint"/>
+ <bitfield name="CVT_TRUNCATE_1" low="10" high="15" type="uint"/>
+ <bitfield name="CVT_TRUNCATE_0" low="4" high="9" type="uint"/>
+ <bitfield name="DATA_SIGN" pos="3" type="uint"/>
+ <bitfield name="ROUND_TYPE" pos="2" type="uint"/>
+ <bitfield name="CVT_TYPE" pos="1" type="uint"/>
+ <bitfield name="CVT_BYPASS" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1050" name="CVT_CON1">
+ <bitfield name="CVT_SCALE0" low="16" high="31" type="uint"/>
+ <bitfield name="CVT_OFFSET0" low="0" high="15" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1054" name="CVT_CON2">
+ <bitfield name="CVT_SCALE1" low="16" high="31" type="uint"/>
+ <bitfield name="CVT_OFFSET1" low="0" high="15" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1058" name="CVT_CON3">
+ <bitfield name="CVT_SCALE2" low="16" high="31" type="uint"/>
+ <bitfield name="CVT_OFFSET2" low="0" high="15" type="uint"/>
+ </reg32>
+ <reg32 offset="0x105C" name="CVT_CON4">
+ <bitfield name="CVT_SCALE3" low="16" high="31" type="uint"/>
+ <bitfield name="CVT_OFFSET3" low="0" high="15" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1060" name="FC_CON0">
+ <bitfield name="FC_SKIP_DATA" low="16" high="31" type="uint"/>
+ <bitfield name="RESERVED_0" low="1" high="15" type="uint"/>
+ <bitfield name="FC_SKIP_EN" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1064" name="FC_CON1">
+ <bitfield name="RESERVED_0" low="17" high="31" type="uint"/>
+ <bitfield name="DATA_OFFSET" low="0" high="16" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1068" name="PAD_CON0">
+ <bitfield name="RESERVED_0" low="8" high="31" type="uint"/>
+ <bitfield name="PAD_LEFT" low="4" high="7" type="uint"/>
+ <bitfield name="PAD_TOP" low="0" high="3" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1070" name="FEATURE_DATA_ADDR">
+ <bitfield name="FEATURE_BASE_ADDR" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1074" name="FC_CON2">
+ <bitfield name="RESERVED_0" low="17" high="31" type="uint"/>
+ <bitfield name="WEIGHT_OFFSET" low="0" high="16" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1078" name="DMA_CON0">
+ <bitfield name="OV4K_BYPASS" pos="31" type="uint"/>
+ <bitfield name="RESERVED_0" low="20" high="30" type="uint"/>
+ <bitfield name="WEIGHT_BURST_LEN" low="16" high="19" type="uint"/>
+ <bitfield name="RESERVED_1" low="4" high="15" type="uint"/>
+ <bitfield name="DATA_BURST_LEN" low="0" high="3" type="uint"/>
+ </reg32>
+ <reg32 offset="0x107C" name="DMA_CON1">
+ <bitfield name="RESERVED_0" low="28" high="31" type="uint"/>
+ <bitfield name="LINE_STRIDE" low="0" high="27" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1080" name="DMA_CON2">
+ <bitfield name="RESERVED_0" low="28" high="31" type="uint"/>
+ <bitfield name="SURF_STRIDE" low="0" high="27" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1084" name="FC_DATA_SIZE0">
+ <bitfield name="RESERVED_0" low="30" high="31" type="uint"/>
+ <bitfield name="DMA_WIDTH" low="16" high="29" type="uint"/>
+ <bitfield name="RESERVED_1" low="11" high="15" type="uint"/>
+ <bitfield name="DMA_HEIGHT" low="0" high="10" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1088" name="FC_DATA_SIZE1">
+ <bitfield name="RESERVED_0" low="16" high="31" type="uint"/>
+ <bitfield name="DMA_CHANNEL" low="0" high="15" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1090" name="CLK_GATE">
+ <bitfield name="RESERVED_0" low="5" high="31" type="uint"/>
+ <bitfield name="CBUF_CS_DISABLE_CLKGATE" pos="4" type="uint"/>
+ <bitfield name="RESERVED_1" pos="3" type="uint"/>
+ <bitfield name="CSC_DISABLE_CLKGATE" pos="2" type="uint"/>
+ <bitfield name="CNA_WEIGHT_DISABLE_CLKGATE" pos="1" type="uint"/>
+ <bitfield name="CNA_FEATURE_DISABLE_CLKGATE" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1100" name="DCOMP_CTRL">
+ <bitfield name="RESERVED_0" low="4" high="31" type="uint"/>
+ <bitfield name="WT_DEC_BYPASS" pos="3" type="uint"/>
+ <bitfield name="DECOMP_CONTROL" low="0" high="2" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1104" name="DCOMP_REGNUM">
+ <bitfield name="DCOMP_REGNUM" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1110" name="DCOMP_ADDR0">
+ <bitfield name="DECOMPRESS_ADDR0" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1140" name="DCOMP_AMOUNT0">
+ <bitfield name="DCOMP_AMOUNT0" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1144" name="DCOMP_AMOUNT1">
+ <bitfield name="DCOMP_AMOUNT1" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1148" name="DCOMP_AMOUNT2">
+ <bitfield name="DCOMP_AMOUNT2" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x114C" name="DCOMP_AMOUNT3">
+ <bitfield name="DCOMP_AMOUNT3" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1150" name="DCOMP_AMOUNT4">
+ <bitfield name="DCOMP_AMOUNT4" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1154" name="DCOMP_AMOUNT5">
+ <bitfield name="DCOMP_AMOUNT5" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1158" name="DCOMP_AMOUNT6">
+ <bitfield name="DCOMP_AMOUNT6" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x115C" name="DCOMP_AMOUNT7">
+ <bitfield name="DCOMP_AMOUNT7" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1160" name="DCOMP_AMOUNT8">
+ <bitfield name="DCOMP_AMOUNT8" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1164" name="DCOMP_AMOUNT9">
+ <bitfield name="DCOMP_AMOUNT9" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1168" name="DCOMP_AMOUNT10">
+ <bitfield name="DCOMP_AMOUNT10" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x116C" name="DCOMP_AMOUNT11">
+ <bitfield name="DCOMP_AMOUNT11" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1170" name="DCOMP_AMOUNT12">
+ <bitfield name="DCOMP_AMOUNT12" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1174" name="DCOMP_AMOUNT13">
+ <bitfield name="DCOMP_AMOUNT13" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1178" name="DCOMP_AMOUNT14">
+ <bitfield name="DCOMP_AMOUNT14" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x117C" name="DCOMP_AMOUNT15">
+ <bitfield name="DCOMP_AMOUNT15" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1180" name="CVT_CON5">
+ <bitfield name="PER_CHANNEL_CVT_EN" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x1184" name="PAD_CON1">
+ <bitfield name="PAD_VALUE" low="0" high="31" type="uint"/>
+ </reg32>
+</domain>
+<domain name="CORE" width="32">
+ <reg32 offset="0x3000" name="S_STATUS">
+ <bitfield name="RESERVED_0" low="18" high="31" type="uint"/>
+ <bitfield name="STATUS_1" low="16" high="17" type="uint"/>
+ <bitfield name="RESERVED_1" low="2" high="15" type="uint"/>
+ <bitfield name="STATUS_0" low="0" high="1" type="uint"/>
+ </reg32>
+ <reg32 offset="0x3004" name="S_POINTER">
+ <bitfield name="RESERVED_0" low="17" high="31" type="uint"/>
+ <bitfield name="EXECUTER" pos="16" type="uint"/>
+ <bitfield name="RESERVED_1" low="6" high="15" type="uint"/>
+ <bitfield name="EXECUTER_PP_CLEAR" pos="5" type="uint"/>
+ <bitfield name="POINTER_PP_CLEAR" pos="4" type="uint"/>
+ <bitfield name="POINTER_PP_MODE" pos="3" type="uint"/>
+ <bitfield name="EXECUTER_PP_EN" pos="2" type="uint"/>
+ <bitfield name="POINTER_PP_EN" pos="1" type="uint"/>
+ <bitfield name="POINTER" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x3008" name="OPERATION_ENABLE">
+ <bitfield name="RESERVED_0" low="1" high="31" type="uint"/>
+ <bitfield name="OP_EN" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x300C" name="MAC_GATING">
+ <bitfield name="RESERVED_0" low="27" high="31" type="uint"/>
+ <bitfield name="SLCG_OP_EN" low="0" high="26" type="uint"/>
+ </reg32>
+ <reg32 offset="0x3010" name="MISC_CFG">
+ <bitfield name="RESERVED_0" low="20" high="31" type="uint"/>
+ <bitfield name="SOFT_GATING" low="14" high="19" type="uint"/>
+ <bitfield name="RESERVED_1" low="11" high="13" type="uint"/>
+ <bitfield name="PROC_PRECISION" low="8" high="10" type="uint"/>
+ <bitfield name="RESERVED_2" low="2" high="7" type="uint"/>
+ <bitfield name="DW_EN" pos="1" type="uint"/>
+ <bitfield name="QD_EN" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x3014" name="DATAOUT_SIZE_0">
+ <bitfield name="DATAOUT_HEIGHT" low="16" high="31" type="uint"/>
+ <bitfield name="DATAOUT_WIDTH" low="0" high="15" type="uint"/>
+ </reg32>
+ <reg32 offset="0x3018" name="DATAOUT_SIZE_1">
+ <bitfield name="RESERVED_0" low="16" high="31" type="uint"/>
+ <bitfield name="DATAOUT_CHANNEL" low="0" high="15" type="uint"/>
+ </reg32>
+ <reg32 offset="0x301C" name="CLIP_TRUNCATE">
+ <bitfield name="RESERVED_0" low="7" high="31" type="uint"/>
+ <bitfield name="ROUND_TYPE" pos="6" type="uint"/>
+ <bitfield name="RESERVED_1" pos="5" type="uint"/>
+ <bitfield name="CLIP_TRUNCATE" low="0" high="4" type="uint"/>
+ </reg32>
+</domain>
+<domain name="DPU" width="32">
+ <reg32 offset="0x4000" name="S_STATUS">
+ <bitfield name="RESERVED_0" low="18" high="31" type="uint"/>
+ <bitfield name="STATUS_1" low="16" high="17" type="uint"/>
+ <bitfield name="RESERVED_1" low="2" high="15" type="uint"/>
+ <bitfield name="STATUS_0" low="0" high="1" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4004" name="S_POINTER">
+ <bitfield name="RESERVED_0" low="17" high="31" type="uint"/>
+ <bitfield name="EXECUTER" pos="16" type="uint"/>
+ <bitfield name="RESERVED_1" low="6" high="15" type="uint"/>
+ <bitfield name="EXECUTER_PP_CLEAR" pos="5" type="uint"/>
+ <bitfield name="POINTER_PP_CLEAR" pos="4" type="uint"/>
+ <bitfield name="POINTER_PP_MODE" pos="3" type="uint"/>
+ <bitfield name="EXECUTER_PP_EN" pos="2" type="uint"/>
+ <bitfield name="POINTER_PP_EN" pos="1" type="uint"/>
+ <bitfield name="POINTER" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4008" name="OPERATION_ENABLE">
+ <bitfield name="RESERVED_0" low="1" high="31" type="uint"/>
+ <bitfield name="OP_EN" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x400C" name="FEATURE_MODE_CFG">
+ <bitfield name="COMB_USE" pos="31" type="uint"/>
+ <bitfield name="TP_EN" pos="30" type="uint"/>
+ <bitfield name="RGP_TYPE" low="26" high="29" type="uint"/>
+ <bitfield name="NONALIGN" pos="25" type="uint"/>
+ <bitfield name="SURF_LEN" low="9" high="24" type="uint"/>
+ <bitfield name="BURST_LEN" low="5" high="8" type="uint"/>
+ <bitfield name="CONV_MODE" low="3" high="4" type="uint"/>
+ <bitfield name="OUTPUT_MODE" low="1" high="2" type="uint"/>
+ <bitfield name="FLYING_MODE" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4010" name="DATA_FORMAT">
+ <bitfield name="OUT_PRECISION" low="29" high="31" type="uint"/>
+ <bitfield name="IN_PRECISION" low="26" high="28" type="uint"/>
+ <bitfield name="EW_TRUNCATE_NEG" low="16" high="25" type="uint"/>
+ <bitfield name="BN_MUL_SHIFT_VALUE_NEG" low="10" high="15" type="uint"/>
+ <bitfield name="BS_MUL_SHIFT_VALUE_NEG" low="4" high="9" type="uint"/>
+ <bitfield name="MC_SURF_OUT" pos="3" type="uint"/>
+ <bitfield name="PROC_PRECISION" low="0" high="2" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4014" name="OFFSET_PEND">
+ <bitfield name="RESERVED_0" low="16" high="31" type="uint"/>
+ <bitfield name="OFFSET_PEND" low="0" high="15" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4020" name="DST_BASE_ADDR">
+ <bitfield name="DST_BASE_ADDR" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4024" name="DST_SURF_STRIDE">
+ <bitfield name="DST_SURF_STRIDE" low="4" high="31" type="uint"/>
+ <bitfield name="RESERVED_0" low="0" high="3" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4030" name="DATA_CUBE_WIDTH">
+ <bitfield name="RESERVED_0" low="13" high="31" type="uint"/>
+ <bitfield name="WIDTH" low="0" high="12" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4034" name="DATA_CUBE_HEIGHT">
+ <bitfield name="RESERVED_0" low="25" high="31" type="uint"/>
+ <bitfield name="MINMAX_CTL" low="22" high="24" type="uint"/>
+ <bitfield name="RESERVED_1" low="13" high="21" type="uint"/>
+ <bitfield name="HEIGHT" low="0" high="12" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4038" name="DATA_CUBE_NOTCH_ADDR">
+ <bitfield name="RESERVED_0" low="29" high="31" type="uint"/>
+ <bitfield name="NOTCH_ADDR_1" low="16" high="28" type="uint"/>
+ <bitfield name="RESERVED_1" low="13" high="15" type="uint"/>
+ <bitfield name="NOTCH_ADDR_0" low="0" high="12" type="uint"/>
+ </reg32>
+ <reg32 offset="0x403C" name="DATA_CUBE_CHANNEL">
+ <bitfield name="RESERVED_0" low="29" high="31" type="uint"/>
+ <bitfield name="ORIG_CHANNEL" low="16" high="28" type="uint"/>
+ <bitfield name="RESERVED_1" low="13" high="15" type="uint"/>
+ <bitfield name="CHANNEL" low="0" high="12" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4040" name="BS_CFG">
+ <bitfield name="RESERVED_0" low="20" high="31" type="uint"/>
+ <bitfield name="BS_ALU_ALGO" low="16" high="19" type="uint"/>
+ <bitfield name="RESERVED_1" low="9" high="15" type="uint"/>
+ <bitfield name="BS_ALU_SRC" pos="8" type="uint"/>
+ <bitfield name="BS_RELUX_EN" pos="7" type="uint"/>
+ <bitfield name="BS_RELU_BYPASS" pos="6" type="uint"/>
+ <bitfield name="BS_MUL_PRELU" pos="5" type="uint"/>
+ <bitfield name="BS_MUL_BYPASS" pos="4" type="uint"/>
+ <bitfield name="RESERVED_2" low="2" high="3" type="uint"/>
+ <bitfield name="BS_ALU_BYPASS" pos="1" type="uint"/>
+ <bitfield name="BS_BYPASS" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4044" name="BS_ALU_CFG">
+ <bitfield name="BS_ALU_OPERAND" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4048" name="BS_MUL_CFG">
+ <bitfield name="BS_MUL_OPERAND" low="16" high="31" type="uint"/>
+ <bitfield name="RESERVED_0" low="14" high="15" type="uint"/>
+ <bitfield name="BS_MUL_SHIFT_VALUE" low="8" high="13" type="uint"/>
+ <bitfield name="RESERVED_1" low="2" high="7" type="uint"/>
+ <bitfield name="BS_TRUNCATE_SRC" pos="1" type="uint"/>
+ <bitfield name="BS_MUL_SRC" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x404C" name="BS_RELUX_CMP_VALUE">
+ <bitfield name="BS_RELUX_CMP_DAT" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4050" name="BS_OW_CFG">
+ <bitfield name="RGP_CNTER" low="28" high="31" type="uint"/>
+ <bitfield name="TP_ORG_EN" pos="27" type="uint"/>
+ <bitfield name="RESERVED_0" low="11" high="26" type="uint"/>
+ <bitfield name="SIZE_E_2" low="8" high="10" type="uint"/>
+ <bitfield name="SIZE_E_1" low="5" high="7" type="uint"/>
+ <bitfield name="SIZE_E_0" low="2" high="4" type="uint"/>
+ <bitfield name="OD_BYPASS" pos="1" type="uint"/>
+ <bitfield name="OW_SRC" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4054" name="BS_OW_OP">
+ <bitfield name="RESERVED_0" low="16" high="31" type="uint"/>
+ <bitfield name="OW_OP" low="0" high="15" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4058" name="WDMA_SIZE_0">
+ <bitfield name="RESERVED_0" low="28" high="31" type="uint"/>
+ <bitfield name="TP_PRECISION" pos="27" type="uint"/>
+ <bitfield name="SIZE_C_WDMA" low="16" high="26" type="uint"/>
+ <bitfield name="RESERVED_1" low="13" high="15" type="uint"/>
+ <bitfield name="CHANNEL_WDMA" low="0" high="12" type="uint"/>
+ </reg32>
+ <reg32 offset="0x405C" name="WDMA_SIZE_1">
+ <bitfield name="RESERVED_0" low="29" high="31" type="uint"/>
+ <bitfield name="HEIGHT_WDMA" low="16" high="28" type="uint"/>
+ <bitfield name="RESERVED_1" low="13" high="15" type="uint"/>
+ <bitfield name="WIDTH_WDMA" low="0" high="12" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4060" name="BN_CFG">
+ <bitfield name="RESERVED_0" low="20" high="31" type="uint"/>
+ <bitfield name="BN_ALU_ALGO" low="16" high="19" type="uint"/>
+ <bitfield name="RESERVED_1" low="9" high="15" type="uint"/>
+ <bitfield name="BN_ALU_SRC" pos="8" type="uint"/>
+ <bitfield name="BN_RELUX_EN" pos="7" type="uint"/>
+ <bitfield name="BN_RELU_BYPASS" pos="6" type="uint"/>
+ <bitfield name="BN_MUL_PRELU" pos="5" type="uint"/>
+ <bitfield name="BN_MUL_BYPASS" pos="4" type="uint"/>
+ <bitfield name="RESERVED_2" low="2" high="3" type="uint"/>
+ <bitfield name="BN_ALU_BYPASS" pos="1" type="uint"/>
+ <bitfield name="BN_BYPASS" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4064" name="BN_ALU_CFG">
+ <bitfield name="BN_ALU_OPERAND" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4068" name="BN_MUL_CFG">
+ <bitfield name="BN_MUL_OPERAND" low="16" high="31" type="uint"/>
+ <bitfield name="RESERVED_0" low="14" high="15" type="uint"/>
+ <bitfield name="BN_MUL_SHIFT_VALUE" low="8" high="13" type="uint"/>
+ <bitfield name="RESERVED_1" low="2" high="7" type="uint"/>
+ <bitfield name="BN_TRUNCATE_SRC" pos="1" type="uint"/>
+ <bitfield name="BN_MUL_SRC" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x406C" name="BN_RELUX_CMP_VALUE">
+ <bitfield name="BN_RELUX_CMP_DAT" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4070" name="EW_CFG">
+ <bitfield name="EW_CVT_TYPE" pos="31" type="uint"/>
+ <bitfield name="EW_CVT_ROUND" pos="30" type="uint"/>
+ <bitfield name="EW_DATA_MODE" low="28" high="29" type="uint"/>
+ <bitfield name="RESERVED_0" low="24" high="27" type="uint"/>
+ <bitfield name="EDATA_SIZE" low="22" high="23" type="uint"/>
+ <bitfield name="EW_EQUAL_EN" pos="21" type="uint"/>
+ <bitfield name="EW_BINARY_EN" pos="20" type="uint"/>
+ <bitfield name="EW_ALU_ALGO" low="16" high="19" type="uint"/>
+ <bitfield name="RESERVED_1" low="11" high="15" type="uint"/>
+ <bitfield name="EW_RELUX_EN" pos="10" type="uint"/>
+ <bitfield name="EW_RELU_BYPASS" pos="9" type="uint"/>
+ <bitfield name="EW_OP_CVT_BYPASS" pos="8" type="uint"/>
+ <bitfield name="EW_LUT_BYPASS" pos="7" type="uint"/>
+ <bitfield name="EW_OP_SRC" pos="6" type="uint"/>
+ <bitfield name="EW_MUL_PRELU" pos="5" type="uint"/>
+ <bitfield name="RESERVED_2" low="3" high="4" type="uint"/>
+ <bitfield name="EW_OP_TYPE" pos="2" type="uint"/>
+ <bitfield name="EW_OP_BYPASS" pos="1" type="uint"/>
+ <bitfield name="EW_BYPASS" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4074" name="EW_CVT_OFFSET_VALUE">
+ <bitfield name="EW_OP_CVT_OFFSET" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4078" name="EW_CVT_SCALE_VALUE">
+ <bitfield name="EW_TRUNCATE" low="22" high="31" type="uint"/>
+ <bitfield name="EW_OP_CVT_SHIFT" low="16" high="21" type="uint"/>
+ <bitfield name="EW_OP_CVT_SCALE" low="0" high="15" type="uint"/>
+ </reg32>
+ <reg32 offset="0x407C" name="EW_RELUX_CMP_VALUE">
+ <bitfield name="EW_RELUX_CMP_DAT" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4080" name="OUT_CVT_OFFSET">
+ <bitfield name="OUT_CVT_OFFSET" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4084" name="OUT_CVT_SCALE">
+ <bitfield name="RESERVED_0" low="17" high="31" type="uint"/>
+ <bitfield name="FP32TOFP16_EN" pos="16" type="uint"/>
+ <bitfield name="OUT_CVT_SCALE" low="0" high="15" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4088" name="OUT_CVT_SHIFT">
+ <bitfield name="CVT_TYPE" pos="31" type="uint"/>
+ <bitfield name="CVT_ROUND" pos="30" type="uint"/>
+ <bitfield name="RESERVED_0" low="20" high="29" type="uint"/>
+ <bitfield name="MINUS_EXP" low="12" high="19" type="uint"/>
+ <bitfield name="OUT_CVT_SHIFT" low="0" high="11" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4090" name="EW_OP_VALUE_0">
+ <bitfield name="EW_OPERAND_0" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4094" name="EW_OP_VALUE_1">
+ <bitfield name="EW_OPERAND_1" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4098" name="EW_OP_VALUE_2">
+ <bitfield name="EW_OPERAND_2" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x409C" name="EW_OP_VALUE_3">
+ <bitfield name="EW_OPERAND_3" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x40A0" name="EW_OP_VALUE_4">
+ <bitfield name="EW_OPERAND_4" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x40A4" name="EW_OP_VALUE_5">
+ <bitfield name="EW_OPERAND_5" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x40A8" name="EW_OP_VALUE_6">
+ <bitfield name="EW_OPERAND_6" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x40AC" name="EW_OP_VALUE_7">
+ <bitfield name="EW_OPERAND_7" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x40C0" name="SURFACE_ADD">
+ <bitfield name="SURF_ADD" low="4" high="31" type="uint"/>
+ <bitfield name="RESERVED_0" low="0" high="3" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4100" name="LUT_ACCESS_CFG">
+ <bitfield name="RESERVED_0" low="18" high="31" type="uint"/>
+ <bitfield name="LUT_ACCESS_TYPE" pos="17" type="uint"/>
+ <bitfield name="LUT_TABLE_ID" pos="16" type="uint"/>
+ <bitfield name="RESERVED_1" low="10" high="15" type="uint"/>
+ <bitfield name="LUT_ADDR" low="0" high="9" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4104" name="LUT_ACCESS_DATA">
+ <bitfield name="RESERVED_0" low="16" high="31" type="uint"/>
+ <bitfield name="LUT_ACCESS_DATA" low="0" high="15" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4108" name="LUT_CFG">
+ <bitfield name="RESERVED_0" low="8" high="31" type="uint"/>
+ <bitfield name="LUT_CAL_SEL" pos="7" type="uint"/>
+ <bitfield name="LUT_HYBRID_PRIORITY" pos="6" type="uint"/>
+ <bitfield name="LUT_OFLOW_PRIORITY" pos="5" type="uint"/>
+ <bitfield name="LUT_UFLOW_PRIORITY" pos="4" type="uint"/>
+ <bitfield name="LUT_LO_LE_MUX" low="2" high="3" type="uint"/>
+ <bitfield name="LUT_EXPAND_EN" pos="1" type="uint"/>
+ <bitfield name="LUT_ROAD_SEL" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x410C" name="LUT_INFO">
+ <bitfield name="RESERVED_0" low="24" high="31" type="uint"/>
+ <bitfield name="LUT_LO_INDEX_SELECT" low="16" high="23" type="uint"/>
+ <bitfield name="LUT_LE_INDEX_SELECT" low="8" high="15" type="uint"/>
+ <bitfield name="RESERVED_1" low="0" high="7" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4110" name="LUT_LE_START">
+ <bitfield name="LUT_LE_START" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4114" name="LUT_LE_END">
+ <bitfield name="LUT_LE_END" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4118" name="LUT_LO_START">
+ <bitfield name="LUT_LO_START" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x411C" name="LUT_LO_END">
+ <bitfield name="LUT_LO_END" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4120" name="LUT_LE_SLOPE_SCALE">
+ <bitfield name="LUT_LE_SLOPE_OFLOW_SCALE" low="16" high="31" type="uint"/>
+ <bitfield name="LUT_LE_SLOPE_UFLOW_SCALE" low="0" high="15" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4124" name="LUT_LE_SLOPE_SHIFT">
+ <bitfield name="RESERVED_0" low="10" high="31" type="uint"/>
+ <bitfield name="LUT_LE_SLOPE_OFLOW_SHIFT" low="5" high="9" type="uint"/>
+ <bitfield name="LUT_LE_SLOPE_UFLOW_SHIFT" low="0" high="4" type="uint"/>
+ </reg32>
+ <reg32 offset="0x4128" name="LUT_LO_SLOPE_SCALE">
+ <bitfield name="LUT_LO_SLOPE_OFLOW_SCALE" low="16" high="31" type="uint"/>
+ <bitfield name="LUT_LO_SLOPE_UFLOW_SCALE" low="0" high="15" type="uint"/>
+ </reg32>
+ <reg32 offset="0x412C" name="LUT_LO_SLOPE_SHIFT">
+ <bitfield name="RESERVED_0" low="10" high="31" type="uint"/>
+ <bitfield name="LUT_LO_SLOPE_OFLOW_SHIFT" low="5" high="9" type="uint"/>
+ <bitfield name="LUT_LO_SLOPE_UFLOW_SHIFT" low="0" high="4" type="uint"/>
+ </reg32>
+</domain>
+<domain name="DPU_RDMA" width="32">
+ <reg32 offset="0x5000" name="RDMA_S_STATUS">
+ <bitfield name="RESERVED_0" low="18" high="31" type="uint"/>
+ <bitfield name="STATUS_1" low="16" high="17" type="uint"/>
+ <bitfield name="RESERVED_1" low="2" high="15" type="uint"/>
+ <bitfield name="STATUS_0" low="0" high="1" type="uint"/>
+ </reg32>
+ <reg32 offset="0x5004" name="RDMA_S_POINTER">
+ <bitfield name="RESERVED_0" low="17" high="31" type="uint"/>
+ <bitfield name="EXECUTER" pos="16" type="uint"/>
+ <bitfield name="RESERVED_1" low="6" high="15" type="uint"/>
+ <bitfield name="EXECUTER_PP_CLEAR" pos="5" type="uint"/>
+ <bitfield name="POINTER_PP_CLEAR" pos="4" type="uint"/>
+ <bitfield name="POINTER_PP_MODE" pos="3" type="uint"/>
+ <bitfield name="EXECUTER_PP_EN" pos="2" type="uint"/>
+ <bitfield name="POINTER_PP_EN" pos="1" type="uint"/>
+ <bitfield name="POINTER" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x5008" name="RDMA_OPERATION_ENABLE">
+ <bitfield name="RESERVED_0" low="1" high="31" type="uint"/>
+ <bitfield name="OP_EN" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x500C" name="RDMA_DATA_CUBE_WIDTH">
+ <bitfield name="RESERVED_0" low="13" high="31" type="uint"/>
+ <bitfield name="WIDTH" low="0" high="12" type="uint"/>
+ </reg32>
+ <reg32 offset="0x5010" name="RDMA_DATA_CUBE_HEIGHT">
+ <bitfield name="RESERVED_0" low="29" high="31" type="uint"/>
+ <bitfield name="EW_LINE_NOTCH_ADDR" low="16" high="28" type="uint"/>
+ <bitfield name="RESERVED_1" low="13" high="15" type="uint"/>
+ <bitfield name="HEIGHT" low="0" high="12" type="uint"/>
+ </reg32>
+ <reg32 offset="0x5014" name="RDMA_DATA_CUBE_CHANNEL">
+ <bitfield name="RESERVED_0" low="13" high="31" type="uint"/>
+ <bitfield name="CHANNEL" low="0" high="12" type="uint"/>
+ </reg32>
+ <reg32 offset="0x5018" name="RDMA_SRC_BASE_ADDR">
+ <bitfield name="SRC_BASE_ADDR" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x501C" name="RDMA_BRDMA_CFG">
+ <bitfield name="RESERVED_0" low="5" high="31" type="uint"/>
+ <bitfield name="BRDMA_DATA_USE" low="1" high="4" type="uint"/>
+ <bitfield name="RESERVED_1" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x5020" name="RDMA_BS_BASE_ADDR">
+ <bitfield name="BS_BASE_ADDR" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x5028" name="RDMA_NRDMA_CFG">
+ <bitfield name="RESERVED_0" low="5" high="31" type="uint"/>
+ <bitfield name="NRDMA_DATA_USE" low="1" high="4" type="uint"/>
+ <bitfield name="RESERVED_1" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x502C" name="RDMA_BN_BASE_ADDR">
+ <bitfield name="BN_BASE_ADDR" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x5034" name="RDMA_ERDMA_CFG">
+ <bitfield name="ERDMA_DATA_MODE" low="30" high="31" type="uint"/>
+ <bitfield name="ERDMA_SURF_MODE" pos="29" type="uint"/>
+ <bitfield name="ERDMA_NONALIGN" pos="28" type="uint"/>
+ <bitfield name="RESERVED_0" low="4" high="27" type="uint"/>
+ <bitfield name="ERDMA_DATA_SIZE" low="2" high="3" type="uint"/>
+ <bitfield name="OV4K_BYPASS" pos="1" type="uint"/>
+ <bitfield name="ERDMA_DISABLE" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x5038" name="RDMA_EW_BASE_ADDR">
+ <bitfield name="EW_BASE_ADDR" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x5040" name="RDMA_EW_SURF_STRIDE">
+ <bitfield name="EW_SURF_STRIDE" low="4" high="31" type="uint"/>
+ <bitfield name="RESERVED_0" low="0" high="3" type="uint"/>
+ </reg32>
+ <reg32 offset="0x5044" name="RDMA_FEATURE_MODE_CFG">
+ <bitfield name="RESERVED_0" low="18" high="31" type="uint"/>
+ <bitfield name="IN_PRECISION" low="15" high="17" type="uint"/>
+ <bitfield name="BURST_LEN" low="11" high="14" type="uint"/>
+ <bitfield name="COMB_USE" low="8" high="10" type="uint"/>
+ <bitfield name="PROC_PRECISION" low="5" high="7" type="uint"/>
+ <bitfield name="MRDMA_DISABLE" pos="4" type="uint"/>
+ <bitfield name="MRDMA_FP16TOFP32_EN" pos="3" type="uint"/>
+ <bitfield name="CONV_MODE" low="1" high="2" type="uint"/>
+ <bitfield name="FLYING_MODE" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x5048" name="RDMA_SRC_DMA_CFG">
+ <bitfield name="LINE_NOTCH_ADDR" low="19" high="31" type="uint"/>
+ <bitfield name="RESERVED_0" low="14" high="18" type="uint"/>
+ <bitfield name="POOLING_METHOD" pos="13" type="uint"/>
+ <bitfield name="UNPOOLING_EN" pos="12" type="uint"/>
+ <bitfield name="KERNEL_STRIDE_HEIGHT" low="9" high="11" type="uint"/>
+ <bitfield name="KERNEL_STRIDE_WIDTH" low="6" high="8" type="uint"/>
+ <bitfield name="KERNEL_HEIGHT" low="3" high="5" type="uint"/>
+ <bitfield name="KERNEL_WIDTH" low="0" high="2" type="uint"/>
+ </reg32>
+ <reg32 offset="0x504C" name="RDMA_SURF_NOTCH">
+ <bitfield name="SURF_NOTCH_ADDR" low="4" high="31" type="uint"/>
+ <bitfield name="RESERVED_0" low="0" high="3" type="uint"/>
+ </reg32>
+ <reg32 offset="0x5064" name="RDMA_PAD_CFG">
+ <bitfield name="PAD_VALUE" low="16" high="31" type="uint"/>
+ <bitfield name="RESERVED_0" low="7" high="15" type="uint"/>
+ <bitfield name="PAD_TOP" low="4" high="6" type="uint"/>
+ <bitfield name="RESERVED_1" pos="3" type="uint"/>
+ <bitfield name="PAD_LEFT" low="0" high="2" type="uint"/>
+ </reg32>
+ <reg32 offset="0x5068" name="RDMA_WEIGHT">
+ <bitfield name="E_WEIGHT" low="24" high="31" type="uint"/>
+ <bitfield name="N_WEIGHT" low="16" high="23" type="uint"/>
+ <bitfield name="B_WEIGHT" low="8" high="15" type="uint"/>
+ <bitfield name="M_WEIGHT" low="0" high="7" type="uint"/>
+ </reg32>
+ <reg32 offset="0x506C" name="RDMA_EW_SURF_NOTCH">
+ <bitfield name="EW_SURF_NOTCH" low="4" high="31" type="uint"/>
+ <bitfield name="RESERVED_0" low="0" high="3" type="uint"/>
+ </reg32>
+</domain>
+<domain name="PPU" width="32">
+ <reg32 offset="0x6000" name="S_STATUS">
+ <bitfield name="RESERVED_0" low="18" high="31" type="uint"/>
+ <bitfield name="STATUS_1" low="16" high="17" type="uint"/>
+ <bitfield name="RESERVED_1" low="2" high="15" type="uint"/>
+ <bitfield name="STATUS_0" low="0" high="1" type="uint"/>
+ </reg32>
+ <reg32 offset="0x6004" name="S_POINTER">
+ <bitfield name="RESERVED_0" low="17" high="31" type="uint"/>
+ <bitfield name="EXECUTER" pos="16" type="uint"/>
+ <bitfield name="RESERVED_1" low="6" high="15" type="uint"/>
+ <bitfield name="EXECUTER_PP_CLEAR" pos="5" type="uint"/>
+ <bitfield name="POINTER_PP_CLEAR" pos="4" type="uint"/>
+ <bitfield name="POINTER_PP_MODE" pos="3" type="uint"/>
+ <bitfield name="EXECUTER_PP_EN" pos="2" type="uint"/>
+ <bitfield name="POINTER_PP_EN" pos="1" type="uint"/>
+ <bitfield name="POINTER" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x6008" name="OPERATION_ENABLE">
+ <bitfield name="RESERVED_0" low="1" high="31" type="uint"/>
+ <bitfield name="OP_EN" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x600C" name="DATA_CUBE_IN_WIDTH">
+ <bitfield name="RESERVED_0" low="13" high="31" type="uint"/>
+ <bitfield name="CUBE_IN_WIDTH" low="0" high="12" type="uint"/>
+ </reg32>
+ <reg32 offset="0x6010" name="DATA_CUBE_IN_HEIGHT">
+ <bitfield name="RESERVED_0" low="13" high="31" type="uint"/>
+ <bitfield name="CUBE_IN_HEIGHT" low="0" high="12" type="uint"/>
+ </reg32>
+ <reg32 offset="0x6014" name="DATA_CUBE_IN_CHANNEL">
+ <bitfield name="RESERVED_0" low="13" high="31" type="uint"/>
+ <bitfield name="CUBE_IN_CHANNEL" low="0" high="12" type="uint"/>
+ </reg32>
+ <reg32 offset="0x6018" name="DATA_CUBE_OUT_WIDTH">
+ <bitfield name="RESERVED_0" low="13" high="31" type="uint"/>
+ <bitfield name="CUBE_OUT_WIDTH" low="0" high="12" type="uint"/>
+ </reg32>
+ <reg32 offset="0x601C" name="DATA_CUBE_OUT_HEIGHT">
+ <bitfield name="RESERVED_0" low="13" high="31" type="uint"/>
+ <bitfield name="CUBE_OUT_HEIGHT" low="0" high="12" type="uint"/>
+ </reg32>
+ <reg32 offset="0x6020" name="DATA_CUBE_OUT_CHANNEL">
+ <bitfield name="RESERVED_0" low="13" high="31" type="uint"/>
+ <bitfield name="CUBE_OUT_CHANNEL" low="0" high="12" type="uint"/>
+ </reg32>
+ <reg32 offset="0x6024" name="OPERATION_MODE_CFG">
+ <bitfield name="RESERVED_0" pos="31" type="uint"/>
+ <bitfield name="INDEX_EN" pos="30" type="uint"/>
+ <bitfield name="RESERVED_1" pos="29" type="uint"/>
+ <bitfield name="NOTCH_ADDR" low="16" high="28" type="uint"/>
+ <bitfield name="RESERVED_2" low="8" high="15" type="uint"/>
+ <bitfield name="USE_CNT" low="5" high="7" type="uint"/>
+ <bitfield name="FLYING_MODE" pos="4" type="uint"/>
+ <bitfield name="RESERVED_3" low="2" high="3" type="uint"/>
+ <bitfield name="POOLING_METHOD" low="0" high="1" type="uint"/>
+ </reg32>
+ <reg32 offset="0x6034" name="POOLING_KERNEL_CFG">
+ <bitfield name="RESERVED_0" low="24" high="31" type="uint"/>
+ <bitfield name="KERNEL_STRIDE_HEIGHT" low="20" high="23" type="uint"/>
+ <bitfield name="KERNEL_STRIDE_WIDTH" low="16" high="19" type="uint"/>
+ <bitfield name="RESERVED_1" low="12" high="15" type="uint"/>
+ <bitfield name="KERNEL_HEIGHT" low="8" high="11" type="uint"/>
+ <bitfield name="RESERVED_2" low="4" high="7" type="uint"/>
+ <bitfield name="KERNEL_WIDTH" low="0" high="3" type="uint"/>
+ </reg32>
+ <reg32 offset="0x6038" name="RECIP_KERNEL_WIDTH">
+ <bitfield name="RESERVED_0" low="17" high="31" type="uint"/>
+ <bitfield name="RECIP_KERNEL_WIDTH" low="0" high="16" type="uint"/>
+ </reg32>
+ <reg32 offset="0x603C" name="RECIP_KERNEL_HEIGHT">
+ <bitfield name="RESERVED_0" low="17" high="31" type="uint"/>
+ <bitfield name="RECIP_KERNEL_HEIGHT" low="0" high="16" type="uint"/>
+ </reg32>
+ <reg32 offset="0x6040" name="POOLING_PADDING_CFG">
+ <bitfield name="RESERVED_0" low="15" high="31" type="uint"/>
+ <bitfield name="PAD_BOTTOM" low="12" high="14" type="uint"/>
+ <bitfield name="RESERVED_1" pos="11" type="uint"/>
+ <bitfield name="PAD_RIGHT" low="8" high="10" type="uint"/>
+ <bitfield name="RESERVED_2" pos="7" type="uint"/>
+ <bitfield name="PAD_TOP" low="4" high="6" type="uint"/>
+ <bitfield name="RESERVED_3" pos="3" type="uint"/>
+ <bitfield name="PAD_LEFT" low="0" high="2" type="uint"/>
+ </reg32>
+ <reg32 offset="0x6044" name="PADDING_VALUE_1_CFG">
+ <bitfield name="PAD_VALUE_0" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x6048" name="PADDING_VALUE_2_CFG">
+ <bitfield name="RESERVED_0" low="3" high="31" type="uint"/>
+ <bitfield name="PAD_VALUE_1" low="0" high="2" type="uint"/>
+ </reg32>
+ <reg32 offset="0x6070" name="DST_BASE_ADDR">
+ <bitfield name="DST_BASE_ADDR" low="4" high="31" type="uint"/>
+ <bitfield name="RESERVED_0" low="0" high="3" type="uint"/>
+ </reg32>
+ <reg32 offset="0x607C" name="DST_SURF_STRIDE">
+ <bitfield name="DST_SURF_STRIDE" low="4" high="31" type="uint"/>
+ <bitfield name="RESERVED_0" low="0" high="3" type="uint"/>
+ </reg32>
+ <reg32 offset="0x6084" name="DATA_FORMAT">
+ <bitfield name="INDEX_ADD" low="4" high="31" type="uint"/>
+ <bitfield name="DPU_FLYIN" pos="3" type="uint"/>
+ <bitfield name="PROC_PRECISION" low="0" high="2" type="uint"/>
+ </reg32>
+ <reg32 offset="0x60DC" name="MISC_CTRL">
+ <bitfield name="SURF_LEN" low="16" high="31" type="uint"/>
+ <bitfield name="RESERVED_0" low="9" high="15" type="uint"/>
+ <bitfield name="MC_SURF_OUT" pos="8" type="uint"/>
+ <bitfield name="NONALIGN" pos="7" type="uint"/>
+ <bitfield name="RESERVED_1" low="4" high="6" type="uint"/>
+ <bitfield name="BURST_LEN" low="0" high="3" type="uint"/>
+ </reg32>
+</domain>
+<domain name="PPU_RDMA" width="32">
+ <reg32 offset="0x7000" name="RDMA_S_STATUS">
+ <bitfield name="RESERVED_0" low="18" high="31" type="uint"/>
+ <bitfield name="STATUS_1" low="16" high="17" type="uint"/>
+ <bitfield name="RESERVED_1" low="2" high="15" type="uint"/>
+ <bitfield name="STATUS_0" low="0" high="1" type="uint"/>
+ </reg32>
+ <reg32 offset="0x7004" name="RDMA_S_POINTER">
+ <bitfield name="RESERVED_0" low="17" high="31" type="uint"/>
+ <bitfield name="EXECUTER" pos="16" type="uint"/>
+ <bitfield name="RESERVED_1" low="6" high="15" type="uint"/>
+ <bitfield name="EXECUTER_PP_CLEAR" pos="5" type="uint"/>
+ <bitfield name="POINTER_PP_CLEAR" pos="4" type="uint"/>
+ <bitfield name="POINTER_PP_MODE" pos="3" type="uint"/>
+ <bitfield name="EXECUTER_PP_EN" pos="2" type="uint"/>
+ <bitfield name="POINTER_PP_EN" pos="1" type="uint"/>
+ <bitfield name="POINTER" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x7008" name="RDMA_OPERATION_ENABLE">
+ <bitfield name="RESERVED_0" low="1" high="31" type="uint"/>
+ <bitfield name="OP_EN" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x700C" name="RDMA_CUBE_IN_WIDTH">
+ <bitfield name="RESERVED_0" low="13" high="31" type="uint"/>
+ <bitfield name="CUBE_IN_WIDTH" low="0" high="12" type="uint"/>
+ </reg32>
+ <reg32 offset="0x7010" name="RDMA_CUBE_IN_HEIGHT">
+ <bitfield name="RESERVED_0" low="13" high="31" type="uint"/>
+ <bitfield name="CUBE_IN_HEIGHT" low="0" high="12" type="uint"/>
+ </reg32>
+ <reg32 offset="0x7014" name="RDMA_CUBE_IN_CHANNEL">
+ <bitfield name="RESERVED_0" low="13" high="31" type="uint"/>
+ <bitfield name="CUBE_IN_CHANNEL" low="0" high="12" type="uint"/>
+ </reg32>
+ <reg32 offset="0x701C" name="RDMA_SRC_BASE_ADDR">
+ <bitfield name="SRC_BASE_ADDR" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x7024" name="RDMA_SRC_LINE_STRIDE">
+ <bitfield name="SRC_LINE_STRIDE" low="4" high="31" type="uint"/>
+ <bitfield name="RESERVED_0" low="0" high="3" type="uint"/>
+ </reg32>
+ <reg32 offset="0x7028" name="RDMA_SRC_SURF_STRIDE">
+ <bitfield name="SRC_SURF_STRIDE" low="4" high="31" type="uint"/>
+ <bitfield name="RESERVED_0" low="0" high="3" type="uint"/>
+ </reg32>
+ <reg32 offset="0x7030" name="RDMA_DATA_FORMAT">
+ <bitfield name="RESERVED_0" low="2" high="31" type="uint"/>
+ <bitfield name="IN_PRECISION" low="0" high="1" type="uint"/>
+ </reg32>
+</domain>
+<domain name="DDMA" width="32">
+ <reg32 offset="0x8000" name="CFG_OUTSTANDING">
+ <bitfield name="RESERVED_0" low="16" high="31" type="uint"/>
+ <bitfield name="WR_OS_CNT" low="8" high="15" type="uint"/>
+ <bitfield name="RD_OS_CNT" low="0" high="7" type="uint"/>
+ </reg32>
+ <reg32 offset="0x8004" name="RD_WEIGHT_0">
+ <bitfield name="RD_WEIGHT_PDP" low="24" high="31" type="uint"/>
+ <bitfield name="RD_WEIGHT_DPU" low="16" high="23" type="uint"/>
+ <bitfield name="RD_WEIGHT_KERNEL" low="8" high="15" type="uint"/>
+ <bitfield name="RD_WEIGHT_FEATURE" low="0" high="7" type="uint"/>
+ </reg32>
+ <reg32 offset="0x8008" name="WR_WEIGHT_0">
+ <bitfield name="RESERVED_0" low="16" high="31" type="uint"/>
+ <bitfield name="WR_WEIGHT_PDP" low="8" high="15" type="uint"/>
+ <bitfield name="WR_WEIGHT_DPU" low="0" high="7" type="uint"/>
+ </reg32>
+ <reg32 offset="0x800C" name="CFG_ID_ERROR">
+ <bitfield name="RESERVED_0" low="10" high="31" type="uint"/>
+ <bitfield name="WR_RESP_ID" low="6" high="9" type="uint"/>
+ <bitfield name="RESERVED_1" pos="5" type="uint"/>
+ <bitfield name="RD_RESP_ID" low="0" high="4" type="uint"/>
+ </reg32>
+ <reg32 offset="0x8010" name="RD_WEIGHT_1">
+ <bitfield name="RESERVED_0" low="8" high="31" type="uint"/>
+ <bitfield name="RD_WEIGHT_PC" low="0" high="7" type="uint"/>
+ </reg32>
+ <reg32 offset="0x8014" name="CFG_DMA_FIFO_CLR">
+ <bitfield name="RESERVED_0" low="1" high="31" type="uint"/>
+ <bitfield name="DMA_FIFO_CLR" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x8018" name="CFG_DMA_ARB">
+ <bitfield name="RESERVED_0" low="10" high="31" type="uint"/>
+ <bitfield name="WR_ARBIT_MODEL" pos="9" type="uint"/>
+ <bitfield name="RD_ARBIT_MODEL" pos="8" type="uint"/>
+ <bitfield name="RESERVED_1" pos="7" type="uint"/>
+ <bitfield name="WR_FIX_ARB" low="4" high="6" type="uint"/>
+ <bitfield name="RESERVED_2" pos="3" type="uint"/>
+ <bitfield name="RD_FIX_ARB" low="0" high="2" type="uint"/>
+ </reg32>
+ <reg32 offset="0x8020" name="CFG_DMA_RD_QOS">
+ <bitfield name="RESERVED_0" low="10" high="31" type="uint"/>
+ <bitfield name="RD_PC_QOS" low="8" high="9" type="uint"/>
+ <bitfield name="RD_PPU_QOS" low="6" high="7" type="uint"/>
+ <bitfield name="RD_DPU_QOS" low="4" high="5" type="uint"/>
+ <bitfield name="RD_KERNEL_QOS" low="2" high="3" type="uint"/>
+ <bitfield name="RD_FEATURE_QOS" low="0" high="1" type="uint"/>
+ </reg32>
+ <reg32 offset="0x8024" name="CFG_DMA_RD_CFG">
+ <bitfield name="RESERVED_0" low="13" high="31" type="uint"/>
+ <bitfield name="RD_ARLOCK" pos="12" type="uint"/>
+ <bitfield name="RD_ARCACHE" low="8" high="11" type="uint"/>
+ <bitfield name="RD_ARPROT" low="5" high="7" type="uint"/>
+ <bitfield name="RD_ARBURST" low="3" high="4" type="uint"/>
+ <bitfield name="RD_ARSIZE" low="0" high="2" type="uint"/>
+ </reg32>
+ <reg32 offset="0x8028" name="CFG_DMA_WR_CFG">
+ <bitfield name="RESERVED_0" low="13" high="31" type="uint"/>
+ <bitfield name="WR_AWLOCK" pos="12" type="uint"/>
+ <bitfield name="WR_AWCACHE" low="8" high="11" type="uint"/>
+ <bitfield name="WR_AWPROT" low="5" high="7" type="uint"/>
+ <bitfield name="WR_AWBURST" low="3" high="4" type="uint"/>
+ <bitfield name="WR_AWSIZE" low="0" high="2" type="uint"/>
+ </reg32>
+ <reg32 offset="0x802C" name="CFG_DMA_WSTRB">
+ <bitfield name="WR_WSTRB" low="0" high="31" type="uint"/>
+ </reg32>
+ <reg32 offset="0x8030" name="CFG_STATUS">
+ <bitfield name="RESERVED_0" low="9" high="31" type="uint"/>
+ <bitfield name="IDEL" pos="8" type="uint"/>
+ <bitfield name="RESERVED_1" low="0" high="7" type="uint"/>
+ </reg32>
+</domain>
+<!-- SDMA domain: 32-bit registers at offsets 0x9000-0x9030 (0x901C is not defined).
+     Covers outstanding-request counts, read/write arbitration weights, response
+     ID error reporting, FIFO clear, arbitration mode, read QoS, read/write
+     channel configuration, write strobe and a status register. -->
+<domain name="SDMA" width="32">
+ <reg32 offset="0x9000" name="CFG_OUTSTANDING">
+ <bitfield name="RESERVED_0" low="16" high="31" type="uint"/>
+ <bitfield name="WR_OS_CNT" low="8" high="15" type="uint"/>
+ <bitfield name="RD_OS_CNT" low="0" high="7" type="uint"/>
+ </reg32>
+ <!-- Four 8-bit read weights, one per byte of the register. -->
+ <reg32 offset="0x9004" name="RD_WEIGHT_0">
+ <bitfield name="RD_WEIGHT_PDP" low="24" high="31" type="uint"/>
+ <bitfield name="RD_WEIGHT_DPU" low="16" high="23" type="uint"/>
+ <bitfield name="RD_WEIGHT_KERNEL" low="8" high="15" type="uint"/>
+ <bitfield name="RD_WEIGHT_FEATURE" low="0" high="7" type="uint"/>
+ </reg32>
+ <reg32 offset="0x9008" name="WR_WEIGHT_0">
+ <bitfield name="RESERVED_0" low="16" high="31" type="uint"/>
+ <bitfield name="WR_WEIGHT_PDP" low="8" high="15" type="uint"/>
+ <bitfield name="WR_WEIGHT_DPU" low="0" high="7" type="uint"/>
+ </reg32>
+ <reg32 offset="0x900C" name="CFG_ID_ERROR">
+ <bitfield name="RESERVED_0" low="10" high="31" type="uint"/>
+ <bitfield name="WR_RESP_ID" low="6" high="9" type="uint"/>
+ <bitfield name="RESERVED_1" pos="5" type="uint"/>
+ <bitfield name="RD_RESP_ID" low="0" high="4" type="uint"/>
+ </reg32>
+ <!-- Fifth read weight (PC); the other four live in RD_WEIGHT_0. -->
+ <reg32 offset="0x9010" name="RD_WEIGHT_1">
+ <bitfield name="RESERVED_0" low="8" high="31" type="uint"/>
+ <bitfield name="RD_WEIGHT_PC" low="0" high="7" type="uint"/>
+ </reg32>
+ <reg32 offset="0x9014" name="CFG_DMA_FIFO_CLR">
+ <bitfield name="RESERVED_0" low="1" high="31" type="uint"/>
+ <bitfield name="DMA_FIFO_CLR" pos="0" type="uint"/>
+ </reg32>
+ <reg32 offset="0x9018" name="CFG_DMA_ARB">
+ <bitfield name="RESERVED_0" low="10" high="31" type="uint"/>
+ <bitfield name="WR_ARBIT_MODEL" pos="9" type="uint"/>
+ <bitfield name="RD_ARBIT_MODEL" pos="8" type="uint"/>
+ <bitfield name="RESERVED_1" pos="7" type="uint"/>
+ <bitfield name="WR_FIX_ARB" low="4" high="6" type="uint"/>
+ <bitfield name="RESERVED_2" pos="3" type="uint"/>
+ <bitfield name="RD_FIX_ARB" low="0" high="2" type="uint"/>
+ </reg32>
+ <!-- Five 2-bit RD_*_QOS fields packed into bits 0-9. -->
+ <reg32 offset="0x9020" name="CFG_DMA_RD_QOS">
+ <bitfield name="RESERVED_0" low="10" high="31" type="uint"/>
+ <bitfield name="RD_PC_QOS" low="8" high="9" type="uint"/>
+ <bitfield name="RD_PPU_QOS" low="6" high="7" type="uint"/>
+ <bitfield name="RD_DPU_QOS" low="4" high="5" type="uint"/>
+ <bitfield name="RD_KERNEL_QOS" low="2" high="3" type="uint"/>
+ <bitfield name="RD_FEATURE_QOS" low="0" high="1" type="uint"/>
+ </reg32>
+ <reg32 offset="0x9024" name="CFG_DMA_RD_CFG">
+ <bitfield name="RESERVED_0" low="13" high="31" type="uint"/>
+ <bitfield name="RD_ARLOCK" pos="12" type="uint"/>
+ <bitfield name="RD_ARCACHE" low="8" high="11" type="uint"/>
+ <bitfield name="RD_ARPROT" low="5" high="7" type="uint"/>
+ <bitfield name="RD_ARBURST" low="3" high="4" type="uint"/>
+ <bitfield name="RD_ARSIZE" low="0" high="2" type="uint"/>
+ </reg32>
+ <!-- Same bit layout as CFG_DMA_RD_CFG, for the write side. -->
+ <reg32 offset="0x9028" name="CFG_DMA_WR_CFG">
+ <bitfield name="RESERVED_0" low="13" high="31" type="uint"/>
+ <bitfield name="WR_AWLOCK" pos="12" type="uint"/>
+ <bitfield name="WR_AWCACHE" low="8" high="11" type="uint"/>
+ <bitfield name="WR_AWPROT" low="5" high="7" type="uint"/>
+ <bitfield name="WR_AWBURST" low="3" high="4" type="uint"/>
+ <bitfield name="WR_AWSIZE" low="0" high="2" type="uint"/>
+ </reg32>
+ <reg32 offset="0x902C" name="CFG_DMA_WSTRB">
+ <bitfield name="WR_WSTRB" low="0" high="31" type="uint"/>
+ </reg32>
+ <!-- NOTE(review): "IDEL" is presumably a misspelling of "IDLE"; confirm before
+      renaming, since names here may feed generated code. -->
+ <reg32 offset="0x9030" name="CFG_STATUS">
+ <bitfield name="RESERVED_0" low="9" high="31" type="uint"/>
+ <bitfield name="IDEL" pos="8" type="uint"/>
+ <bitfield name="RESERVED_1" low="0" high="7" type="uint"/>
+ </reg32>
+</domain>
+<!-- GLOBAL domain: a single register at 0xF008 holding one *_OP_EN bit per
+     unit (CNA, CORE, DPU, DPU_RDMA, PPU, PPU_RDMA). Bit 1 and bits 7-31 are reserved. -->
+<domain name="GLOBAL" width="32">
+ <reg32 offset="0xF008" name="OPERATION_ENABLE">
+ <bitfield name="RESERVED_0" low="7" high="31" type="uint"/>
+ <bitfield name="PPU_RDMA_OP_EN" pos="6" type="uint"/>
+ <bitfield name="PPU_OP_EN" pos="5" type="uint"/>
+ <bitfield name="DPU_RDMA_OP_EN" pos="4" type="uint"/>
+ <bitfield name="DPU_OP_EN" pos="3" type="uint"/>
+ <bitfield name="CORE_OP_EN" pos="2" type="uint"/>
+ <bitfield name="RESERVED_1" pos="1" type="uint"/>
+ <bitfield name="CNA_OP_EN" pos="0" type="uint"/>
+ </reg32>
+</domain>
+
+</database>
diff --git a/src/gallium/drivers/rocket/rkt_coefs.c b/src/gallium/drivers/rocket/rkt_coefs.c
new file mode 100644
index 00000000000..82258e70aaa
--- /dev/null
+++ b/src/gallium/drivers/rocket/rkt_coefs.c
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "util/u_inlines.h"
+
+#include "rkt_coefs.h"
+#include "rkt_ml.h"
+
+/*
+ * Build the weight buffer for a convolution.
+ *
+ * Maps the operation's weight tensor for reading, creates a new buffer and
+ * fills it with the weights in the order produced by the nested
+ * (oc1, ic1, x, y, oc2, ic2) loops below, subtracting 0x80 from every value
+ * written. Positions past the real channel counts are filled with padding
+ * values (or skipped, see the comments in the loop).
+ *
+ * Returns the buffer created with pipe_buffer_create(). Neither the buffer
+ * creation nor the mappings are checked for failure.
+ */
+struct pipe_resource *
+rkt_fill_weights(struct rkt_ml_subgraph *subgraph,
+ const struct pipe_ml_operation *poperation)
+{
+ struct pipe_context *pcontext = subgraph->base.context;
+ unsigned weights_width = poperation->conv.weight_tensor->dims[1];
+ unsigned weights_height = poperation->conv.weight_tensor->dims[2];
+ unsigned input_channels = poperation->input_tensors[0]->dims[3];
+ unsigned input_channels_real = poperation->input_tensors[0]->dims[3];
+ unsigned output_channels = poperation->output_tensors[0]->dims[3];
+ unsigned output_channels_real = poperation->output_tensors[0]->dims[3];
+ unsigned weights_size;
+ uint8_t zero_point = poperation->conv.weight_tensor->zero_point;
+ struct pipe_transfer *transfer_in, *transfer_out;
+ void *map =
+ pipe_buffer_map(pcontext, poperation->conv.weight_tensor->resource,
+ PIPE_MAP_READ, &transfer_in);
+ /* The array type is fixed here from the tensor's real channel count, before
+ * input_channels is padded below. */
+ uint8_t(*weights_in)[weights_width][weights_height][input_channels] = map;
+ struct pipe_resource *rsc;
+ uint8_t *weights_out;
+
+ /* From here on the *_real variables keep the tensor's dimensions while
+ * input_channels/output_channels hold the padded values. */
+ input_channels = MAX2(input_channels, FEATURE_ATOMIC_SIZE);
+
+ output_channels = ALIGN(output_channels, 2);
+ if (rkt_is_depthwise(poperation))
+ output_channels = 1;
+
+ weights_size = weights_width * weights_height * output_channels *
+ ALIGN(input_channels, WEIGHT_ATOMIC_SIZE) * 2;
+
+ rsc =
+ pipe_buffer_create(pcontext->screen, 0, PIPE_USAGE_DEFAULT, weights_size);
+ weights_out = pipe_buffer_map(pcontext, rsc, PIPE_MAP_WRITE, &transfer_out);
+
+ /* Input channels are processed in groups; depthwise uses groups twice as
+ * large. */
+ unsigned input_channel_groups = WEIGHT_ATOMIC_SIZE;
+ if (rkt_is_depthwise(poperation))
+ input_channel_groups *= 2;
+
+ unsigned input_channels_1 =
+ DIV_ROUND_UP(input_channels, input_channel_groups);
+ unsigned input_channels_2 = MIN2(input_channels, input_channel_groups);
+
+ /* n is the write cursor into weights_out; it only advances when a value is
+ * actually emitted. */
+ unsigned n = 0;
+ for (int oc1 = 0; oc1 < DIV_ROUND_UP(output_channels, WEIGHT_ATOMIC_SIZE);
+ oc1++) {
+ for (int ic1 = 0; ic1 < input_channels_1; ic1++) {
+ for (int x = 0; x < weights_width; x++) {
+ for (int y = 0; y < weights_height; y++) {
+ for (int oc2 = 0; oc2 < MIN2(output_channels, WEIGHT_ATOMIC_SIZE);
+ oc2++) {
+ for (int ic2 = 0; ic2 < input_channels_2; ic2++) {
+ unsigned oc = oc1 * WEIGHT_ATOMIC_SIZE + oc2;
+ unsigned ic = ic1 * input_channel_groups + ic2;
+ /* With more than two real output channels, nothing is emitted
+ * for output channels at or past the real count rounded up to
+ * a multiple of 2. */
+ if (output_channels_real > 2 &&
+ oc >= ALIGN(output_channels_real, 2))
+ continue;
+
+ /* Padding output channels are written as 0. */
+ if (oc >= output_channels_real)
+ weights_out[n++] = 0x0;
+ else if (ic >= input_channels_real) {
+ /* Padding input channels get the zero point, but only when
+ * the condition below holds; otherwise nothing is written
+ * and n does not advance. */
+ if (ic2 < 16 || (input_channels_real % 32) > 16)
+ weights_out[n++] =
+ zero_point - 0x80; /* TODO: Why is the blob converting to
+ signed? It should be unsigned. */
+ } else
+ weights_out[n++] = weights_in[oc][x][y][ic] -
+ 0x80; /* TODO: Why is the blob converting to
+ signed? It should be unsigned. */
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /* Optionally dump the whole buffer; the static counter numbers the dumps
+ * across calls. */
+ if (DBG_ENABLED(ROCKET_DBG_DUMP_BOS)) {
+ static int task = 0;
+ rkt_dump_buffer(weights_out, "weights", 0, task++, 0, weights_size);
+ }
+
+ pipe_buffer_unmap(pcontext, transfer_out);
+
+ pipe_buffer_unmap(pcontext, transfer_in);
+
+ return rsc;
+}
+
+/*
+ * Compute the bias correction term for output channel `oc`.
+ *
+ * Sums (weight - weight_zero_point) over the kernel window -- and, for
+ * non-depthwise convolutions, over every input channel -- then scales the sum
+ * by (input_zero_point - 0x80). `map` is the CPU mapping of the weight tensor.
+ *
+ * All arithmetic is carried out in unsigned (wrapping) and the result is
+ * converted to int32_t on return.
+ */
+static int32_t
+calculate_bias_correction(struct rkt_ml_subgraph *subgraph,
+                          const struct pipe_ml_operation *poperation,
+                          unsigned oc, void *map)
+{
+   const unsigned in_channels = poperation->input_tensors[0]->dims[3];
+   const unsigned in_zero_point = poperation->input_tensors[0]->zero_point;
+   const unsigned kernel_w = poperation->conv.weight_tensor->dims[1];
+   const unsigned kernel_h = poperation->conv.weight_tensor->dims[2];
+   const unsigned w_zero_point = poperation->conv.weight_tensor->zero_point;
+   uint8_t(*kernel)[kernel_w][kernel_h][in_channels] = map;
+   const bool depthwise = rkt_is_depthwise(poperation);
+
+   /* The (input_zero_point - 0x80) factor is the same for every term, so it is
+    * applied once to the accumulated sum. */
+   const unsigned scale = in_zero_point - 0x80;
+   unsigned sum = 0;
+
+   for (unsigned x = 0; x < kernel_w; x++) {
+      for (unsigned y = 0; y < kernel_h; y++) {
+         if (depthwise) {
+            /* Depthwise: one filter, indexed by channel. */
+            sum += kernel[0][x][y][oc] - w_zero_point;
+            continue;
+         }
+
+         for (unsigned ic = 0; ic < in_channels; ic++)
+            sum += kernel[oc][x][y][ic] - w_zero_point;
+      }
+   }
+
+   return (int32_t)(sum * scale);
+}
+
+/*
+ * Build the bias buffer for a convolution.
+ *
+ * Maps the bias and weight tensors for reading, creates a buffer holding one
+ * 32-bit value per output channel and stores, for each channel, the input
+ * bias minus the correction from calculate_bias_correction(), divided by
+ * 2^(*truncate_bits).
+ *
+ * *truncate_bits is an output: it starts at 1 for a fixed list of weight
+ * scales and at 0 otherwise, and is increased (restarting the whole pass so
+ * that every channel uses the same shift) if a channel would need more than
+ * 32 bits.
+ *
+ * Returns the buffer created with pipe_buffer_create().
+ */
+struct pipe_resource *
+rkt_fill_biases(struct rkt_ml_subgraph *subgraph,
+                const struct pipe_ml_operation *poperation,
+                unsigned *truncate_bits)
+{
+   /* Raw bit patterns (as returned by fui()) of the weight scales for which
+    * one bit of truncation is requested. */
+   static const uint32_t truncating_scales[] = {
+      0x3a88323f, 0x3c0060de, 0x3c06022d, 0x3c1642e3, 0x3c1e3f51,
+      0x3c5c8aa8, 0x3c615e93, 0x3c7326a2, 0x3c783013, 0x3d1748e6,
+      0x3d282992, 0x3d2e87ae, 0x3d77f5f6, 0x3a9a5956, 0x3caebc56,
+   };
+   struct pipe_context *pcontext = subgraph->base.context;
+   unsigned output_channels = poperation->output_tensors[0]->dims[3];
+   unsigned weights_size = poperation->conv.weight_tensor->dims[1];
+   struct pipe_transfer *transfer_in, *transfer_out, *transfer_weights;
+   int32_t *biases_in =
+      pipe_buffer_map(pcontext, poperation->conv.bias_tensor->resource,
+                      PIPE_MAP_READ, &transfer_in);
+   void *weights =
+      pipe_buffer_map(pcontext, poperation->conv.weight_tensor->resource,
+                      PIPE_MAP_READ, &transfer_weights);
+   struct pipe_resource *rsc;
+   uint32_t *biases;
+
+   rsc = pipe_buffer_create(pcontext->screen, 0, PIPE_USAGE_DEFAULT,
+                            output_channels * sizeof(uint32_t));
+   biases = pipe_buffer_map(pcontext, rsc, PIPE_MAP_WRITE, &transfer_out);
+
+   // DBG("weight_scale %x\n",
+   // fui(poperation->conv.weight_tensor->scale));
+   /* TODO: Figure out when exactly we need to truncate */
+   /* From
+    * http://nvdla.org/hw/v1/ias/unit_description.html#convolution-accumulator :
+    *
+    * The final result of accumulator in CACC is 48bits for INT16 and 34bits for
+    * INT8. The bit width between CACC and SDP is 32. For precisions INT8 and
+    * INT16, there is a round and saturation operation before sending the result
+    * to SDP. The precision of rounding is configured by field CLIP_TRUNCATE in
+    * register D_CLIP_CFG. For FP16, the value is just converted from FP48 to
+    * FP32.
+    */
+   const uint32_t scale_bits = fui(poperation->conv.weight_tensor->scale);
+   *truncate_bits = 0;
+   for (size_t i = 0;
+        i < sizeof(truncating_scales) / sizeof(truncating_scales[0]); i++) {
+      if (scale_bits == truncating_scales[i]) {
+         *truncate_bits = 1;
+         break;
+      }
+   }
+
+   /* Each pass fills every channel with the current shift. A pass is
+    * abandoned and restarted as soon as one channel asks for a larger shift,
+    * so all stored biases always share the same truncation. With zero output
+    * channels the loop body runs once and exits. */
+   bool retry;
+   do {
+      retry = false;
+
+      for (unsigned oc = 0; oc < output_channels; oc++) {
+         int32_t corr =
+            calculate_bias_correction(subgraph, poperation, oc, weights);
+         biases[oc] = (biases_in[oc] - corr) / (1 << *truncate_bits);
+
+         int64_t max_val =
+            (biases_in[oc] - corr + 255 * 255 * weights_size * weights_size) /
+            (1 << *truncate_bits);
+
+         /* Magnitude of the low 32 bits, computed in 64 bits so that
+          * INT32_MIN is representable. */
+         int64_t magnitude = (int32_t)max_val;
+         if (magnitude < 0)
+            magnitude = -magnitude;
+
+         /* log(0) is -infinity; treat a zero magnitude as needing 1 bit. */
+         unsigned num_bits = 1;
+         if (magnitude > 0)
+            num_bits = ceil(log((double)magnitude) / log(2)) + 1;
+
+         /* TODO: This doesn't actually work, num_bits doesn't go above 19, and the
+          * blob sometimes truncates way below */
+         if (num_bits > 32) {
+            (*truncate_bits)++;
+            retry = true;
+            break;
+         }
+      }
+   } while (retry);
+
+   /* Optionally dump the result; the static counter numbers the dumps across
+    * calls. */
+   if (DBG_ENABLED(ROCKET_DBG_DUMP_BOS)) {
+      static int task = 0;
+      rkt_dump_buffer((uint8_t *)biases, "biases", 0, task++, 0,
+                      output_channels * sizeof(uint32_t));
+   }
+
+   pipe_buffer_unmap(pcontext, transfer_out);
+
+   pipe_buffer_unmap(pcontext, transfer_weights);
+
+   pipe_buffer_unmap(pcontext, transfer_in);
+
+   return rsc;
+}
diff --git a/src/gallium/drivers/rocket/rkt_coefs.h b/src/gallium/drivers/rocket/rkt_coefs.h
new file mode 100644
index 00000000000..d670cecfe3d
--- /dev/null
+++ b/src/gallium/drivers/rocket/rkt_coefs.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef RKT_COEFS_H
+#define RKT_COEFS_H
+
+#include "rkt_ml.h"
+
+/*
+ * Create and return a buffer holding the weights of the convolution described
+ * by `poperation`, rearranged for the hardware. The buffer is created with
+ * pipe_buffer_create() on the subgraph's context.
+ */
+struct pipe_resource *
+rkt_fill_weights(struct rkt_ml_subgraph *subgraph,
+ const struct pipe_ml_operation *poperation);
+
+/*
+ * Create and return a buffer holding one 32-bit bias per output channel of the
+ * convolution described by `poperation`. `truncate_bits` is an output
+ * parameter: it receives the number of bits by which the biases were divided
+ * (as a power of two).
+ */
+struct pipe_resource *
+rkt_fill_biases(struct rkt_ml_subgraph *subgraph,
+ const struct pipe_ml_operation *poperation,
+ unsigned *truncate_bits);
+
+#endif /* RKT_COEFS_H */ \ No newline at end of file
diff --git a/src/gallium/drivers/rocket/rkt_device.c b/src/gallium/drivers/rocket/rkt_device.c
new file mode 100644
index 00000000000..9c2da6a2cd2
--- /dev/null
+++ b/src/gallium/drivers/rocket/rkt_device.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "rkt_device.h"
+#include "rkt_ml.h"
+
+#include "drm-uapi/rocket_accel.h"
+
+#include <xf86drm.h>
+#include "util/os_mman.h"
+#include "util/ralloc.h"
+#include "util/u_inlines.h"
+#include "util/u_surface.h"
+#include "util/u_transfer.h"
+
+/* Flag names accepted in the ROCKET_DEBUG environment variable, with the
+ * rkt_dbg bit each one sets and its help text. */
+static const struct debug_named_value rocket_debug_options[] = {
+ {"dbg_msgs", ROCKET_DBG_MSGS, "Print debug messages"},
+ {"dump_bos", ROCKET_DBG_DUMP_BOS, "Dump buffers for analysis"},
+ {"zero_bos", ROCKET_DBG_ZERO, "Zero buffers for debugging"},
+ DEBUG_NAMED_VALUE_END};
+
+/* Generates debug_get_option_rocket_debug(), which parses ROCKET_DEBUG
+ * (default 0). */
+DEBUG_GET_ONCE_FLAGS_OPTION(rocket_debug, "ROCKET_DEBUG", rocket_debug_options, 0)
+/* Active debug flags; assigned in rkt_screen_create() and tested through
+ * DBG_ENABLED(). */
+int rocket_debug = 0;
+
+/* Screen destroy hook: tears down the renderonly object, if one is attached,
+ * and then frees the screen itself. */
+static void
+rkt_destroy_screen(struct pipe_screen *pscreen)
+{
+   struct rkt_screen *rkt = rkt_screen(pscreen);
+   struct renderonly *ro = rkt->ro;
+
+   if (ro != NULL)
+      ro->destroy(ro);
+
+   ralloc_free(rkt);
+}
+
+/* Context destroy hook: the context owns nothing besides its own allocation. */
+static void
+rkt_destroy_context(struct pipe_context *pctx)
+{
+   ralloc_free(rkt_context(pctx));
+}
+
+/*
+ * Map a buffer resource for CPU access.
+ *
+ * Issues DRM_IOCTL_ROCKET_PREP_BO on the buffer (with an effectively infinite
+ * timeout) and then mmaps the whole buffer from the DRM fd. On success
+ * *out_transfer receives a newly allocated transfer that holds a reference on
+ * the resource, and the returned pointer is the mapping advanced by box->x.
+ *
+ * Returns NULL if the transfer cannot be allocated, the ioctl fails or the
+ * mmap fails; in that case no transfer is handed out.
+ *
+ * Only level 0 of PIPE_BUFFER resources with a one-dimensional box is
+ * supported (asserted).
+ */
+static void *
+rkt_buffer_map(struct pipe_context *pctx,
+               struct pipe_resource *prsc, unsigned level,
+               unsigned usage, const struct pipe_box *box,
+               struct pipe_transfer **out_transfer)
+{
+   struct rkt_screen *screen = rkt_screen(pctx->screen);
+   struct rkt_resource *rsc = rkt_resource(prsc);
+   struct drm_rocket_prep_bo arg = {0};
+   uint8_t *map;
+
+   assert(level == 0);
+   assert(prsc->target == PIPE_BUFFER);
+   assert(box->y == 0);
+   assert(box->z == 0);
+   assert(box->height == 1);
+   assert(box->depth == 1);
+
+   struct pipe_transfer *transfer = rzalloc(NULL, struct pipe_transfer);
+   if (!transfer)
+      return NULL;
+
+   transfer->level = level;
+   transfer->usage = usage;
+   transfer->box = *box;
+
+   pipe_resource_reference(&transfer->resource, prsc);
+
+   arg.handle = rsc->handle;
+   arg.timeout_ns = INT64_MAX;
+
+   /* Runtime checks instead of asserts: in release builds a failure here
+    * would otherwise hand the caller a bogus pointer. */
+   if (drmIoctl(screen->fd, DRM_IOCTL_ROCKET_PREP_BO, &arg) == -1)
+      goto fail;
+
+   /* The whole buffer is mapped regardless of the box; the box offset is
+    * applied to the returned pointer. */
+   map = os_mmap(NULL, prsc->width0, PROT_READ | PROT_WRITE, MAP_SHARED,
+                 screen->fd, rsc->fake_offset);
+   if (map == MAP_FAILED)
+      goto fail;
+
+   *out_transfer = transfer;
+
+   return map + box->x;
+
+fail:
+   pipe_resource_reference(&transfer->resource, NULL);
+   ralloc_free(transfer);
+   return NULL;
+}
+
+/*
+ * Finish a transfer created by rkt_buffer_map().
+ *
+ * If the transfer was mapped for writing, issues DRM_IOCTL_ROCKET_FINI_BO on
+ * the buffer. Then drops the transfer's reference on the resource and frees
+ * the transfer.
+ *
+ * NOTE(review): the CPU mapping itself is not unmapped here; the transfer
+ * does not record the mapped address, so the mapping stays alive.
+ */
+static void
+rkt_buffer_unmap(struct pipe_context *pctx,
+                 struct pipe_transfer *transfer)
+{
+   struct rkt_screen *screen = rkt_screen(pctx->screen);
+   struct rkt_resource *rsrc = rkt_resource(transfer->resource);
+
+   /* usage is a bitmask: test the WRITE bit rather than comparing for
+    * equality, so that mappings which combine WRITE with other flags are
+    * finished as well. */
+   if (transfer->usage & PIPE_MAP_WRITE) {
+      struct drm_rocket_fini_bo arg = {0};
+      int ret;
+
+      arg.handle = rsrc->handle;
+
+      ret = drmIoctl(screen->fd, DRM_IOCTL_ROCKET_FINI_BO, &arg);
+      assert(ret >= 0);
+      (void)ret;
+   }
+
+   pipe_resource_reference(&transfer->resource, NULL);
+   ralloc_free(transfer);
+}
+
+/*
+ * Context creation hook: allocates a zeroed rkt_context and installs the
+ * buffer and ML entry points. `flags` is not used.
+ *
+ * Returns NULL if the allocation fails.
+ */
+static struct pipe_context *
+rkt_create_context(struct pipe_screen *screen,
+                   void *priv, unsigned flags)
+{
+   struct rkt_context *ctx = rzalloc(NULL, struct rkt_context);
+
+   /* Check before forming a pointer to a member of *ctx. */
+   if (!ctx)
+      return NULL;
+
+   struct pipe_context *pctx = &ctx->base;
+
+   pctx->screen = screen;
+   pctx->priv = priv;
+
+   pctx->destroy = rkt_destroy_context;
+
+   /* Buffer access: driver-specific map/unmap, generic helpers for the rest. */
+   pctx->buffer_map = rkt_buffer_map;
+   pctx->buffer_unmap = rkt_buffer_unmap;
+   pctx->resource_copy_region = util_resource_copy_region;
+   pctx->buffer_subdata = u_default_buffer_subdata;
+   pctx->clear_buffer = u_default_clear_buffer;
+
+   /* Machine-learning entry points. */
+   pctx->ml_operation_supported = rkt_ml_operation_supported;
+   pctx->ml_subgraph_create = rkt_ml_subgraph_create;
+   pctx->ml_subgraph_invoke = rkt_ml_subgraph_invoke;
+   pctx->ml_subgraph_read_output = rkt_ml_subgraph_read_outputs;
+   pctx->ml_subgraph_destroy = rkt_ml_subgraph_destroy;
+
+   return pctx;
+}
+
+/*
+ * Resource creation hook: allocates a buffer object of templat->width0 bytes
+ * through DRM_IOCTL_ROCKET_CREATE_BO and wraps it in a rkt_resource with a
+ * reference count of 1.
+ *
+ * Only PIPE_BUFFER templates with height, depth and array size of 1 are
+ * supported (asserted). Returns NULL if the allocation or the ioctl fails.
+ *
+ * When the ROCKET_DBG_ZERO debug flag is set, the new buffer is mapped,
+ * cleared to zero and unmapped again.
+ */
+static struct pipe_resource *
+rkt_resource_create(struct pipe_screen *pscreen,
+                    const struct pipe_resource *templat)
+{
+   struct rkt_screen *screen = rkt_screen(pscreen);
+   struct drm_rocket_create_bo arg = {0};
+   struct rkt_resource *rsc;
+   int ret;
+
+   assert(templat->target == PIPE_BUFFER);
+   assert(templat->height0 == 1);
+   assert(templat->depth0 == 1);
+   assert(templat->array_size == 1);
+
+   rsc = rzalloc(NULL, struct rkt_resource);
+   if (!rsc)
+      return NULL;
+
+   /* Copies every template field; only the screen and refcount are ours. */
+   rsc->base = *templat;
+   rsc->base.screen = pscreen;
+   pipe_reference_init(&rsc->base.reference, 1);
+
+   rsc->bo_size = templat->width0;
+
+   arg.size = templat->width0;
+
+   ret = drmIoctl(screen->fd, DRM_IOCTL_ROCKET_CREATE_BO, &arg);
+   if (ret < 0)
+      goto free_rsc;
+
+   rsc->handle = arg.handle;
+   rsc->phys_addr = arg.dma_address;
+   rsc->fake_offset = arg.offset;
+
+   if (DBG_ENABLED(ROCKET_DBG_ZERO)) {
+      void *map = os_mmap(NULL, arg.size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                          screen->fd, rsc->fake_offset);
+
+      /* Zeroing is a debugging aid only: skip it if the mapping fails, and
+       * release the temporary mapping once done. */
+      if (map != MAP_FAILED) {
+         memset(map, 0, arg.size);
+         os_munmap(map, arg.size);
+      } else {
+         DBG("Failed to map buffer for zeroing\n");
+      }
+   }
+
+   return &rsc->base;
+
+free_rsc:
+   ralloc_free(rsc);
+   return NULL;
+}
+
+/* Resource destroy hook: closes the buffer's GEM handle on the screen's DRM
+ * fd and frees the resource wrapper. */
+static void
+rkt_resource_destroy(struct pipe_screen *pscreen,
+                     struct pipe_resource *prsc)
+{
+   struct rkt_resource *rsc = rkt_resource(prsc);
+   struct drm_gem_close close_args = {0};
+   int ret;
+
+   close_args.handle = rsc->handle;
+
+   ret = drmIoctl(rkt_screen(pscreen)->fd, DRM_IOCTL_GEM_CLOSE, &close_args);
+   assert(ret >= 0);
+
+   ralloc_free(rsc);
+}
+
+/* Returns the DRM file descriptor the screen was created with. */
+static int
+rkt_screen_get_fd(struct pipe_screen *pscreen)
+{
+   struct rkt_screen *screen = rkt_screen(pscreen);
+
+   return screen->fd;
+}
+
+/*
+ * Create a screen on top of the DRM file descriptor `fd`.
+ *
+ * Reads the ROCKET_DEBUG flags into rocket_debug, records the fd and installs
+ * the screen hooks. Returns NULL if the allocation fails.
+ *
+ * `config` and `ro` are not used here.
+ * NOTE(review): rkt_destroy_screen() destroys screen->ro when it is set, but
+ * nothing in this function stores `ro` there -- confirm whether that is
+ * intended.
+ */
+struct pipe_screen *
+rkt_screen_create(int fd,
+                  const struct pipe_screen_config *config,
+                  struct renderonly *ro)
+{
+   struct rkt_screen *rkt = rzalloc(NULL, struct rkt_screen);
+
+   if (rkt == NULL)
+      return NULL;
+
+   rocket_debug = debug_get_option_rocket_debug();
+
+   rkt->fd = fd;
+
+   struct pipe_screen *pscreen = &rkt->pscreen;
+
+   pscreen->destroy = rkt_destroy_screen;
+   pscreen->get_screen_fd = rkt_screen_get_fd;
+   pscreen->context_create = rkt_create_context;
+   pscreen->resource_create = rkt_resource_create;
+   pscreen->resource_destroy = rkt_resource_destroy;
+
+   return pscreen;
+} \ No newline at end of file
diff --git a/src/gallium/drivers/rocket/rkt_device.h b/src/gallium/drivers/rocket/rkt_device.h
new file mode 100644
index 00000000000..0425a4260d9
--- /dev/null
+++ b/src/gallium/drivers/rocket/rkt_device.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "renderonly/renderonly.h"
+#include "util/log.h"
+
+#ifndef RKT_SCREEN_H
+#define RKT_SCREEN_H
+
+/* Debug flags, stored as a bitmask in rocket_debug. */
+enum rkt_dbg {
+ /* Enables the output of the DBG() macro. */
+ ROCKET_DBG_MSGS = BITFIELD_BIT(0),
+ /* Dump buffers to files for analysis. */
+ ROCKET_DBG_DUMP_BOS = BITFIELD_BIT(1),
+ /* Zero buffers for debugging. */
+ ROCKET_DBG_ZERO = BITFIELD_BIT(2),
+};
+
+extern int rocket_debug;
+
+/* True when any bit of `flag` is set in rocket_debug; hinted as unlikely. */
+#define DBG_ENABLED(flag) unlikely(rocket_debug &(flag))
+
+/* printf-style debug message, prefixed with the calling function and line.
+ * Emitted through mesa_logd() only when ROCKET_DBG_MSGS is enabled. */
+#define DBG(fmt, ...) \
+ do { \
+ if (DBG_ENABLED(ROCKET_DBG_MSGS)) \
+ mesa_logd("%s:%d: " fmt, __func__, __LINE__, \
+ ##__VA_ARGS__); \
+ } while (0)
+
+/* Driver screen. `pscreen` must stay the first member: rkt_screen() casts a
+ * pipe_screen pointer straight to this type. */
+struct rkt_screen {
+ struct pipe_screen pscreen;
+
+ /* DRM file descriptor used for all ioctls and mmaps. */
+ int fd;
+ /* Optional renderonly object; destroyed together with the screen if set. */
+ struct renderonly *ro;
+};
+
+/* Downcast from the generic screen to the driver screen; valid because the
+ * pipe_screen is the first member of struct rkt_screen. */
+static inline struct rkt_screen *
+rkt_screen(struct pipe_screen *p)
+{
+   struct rkt_screen *screen = (struct rkt_screen *)p;
+
+   return screen;
+}
+
+/* Driver context; currently carries no state beyond the base pipe_context.
+ * `base` must stay the first member: rkt_context() casts to this type. */
+struct rkt_context {
+ struct pipe_context base;
+};
+
+/* Downcast from the generic context to the driver context; valid because the
+ * pipe_context is the first member of struct rkt_context. */
+static inline struct rkt_context *
+rkt_context(struct pipe_context *pctx)
+{
+   struct rkt_context *ctx = (struct rkt_context *)pctx;
+
+   return ctx;
+}
+
+/* Driver buffer resource. `base` must stay the first member: rkt_resource()
+ * casts a pipe_resource pointer straight to this type. */
+struct rkt_resource {
+ struct pipe_resource base;
+
+ /* GEM handle returned by the create-BO ioctl. */
+ uint32_t handle;
+ /* DMA address returned by the create-BO ioctl. */
+ uint64_t phys_addr;
+ /* NOTE(review): not written by the resource creation code; purpose unclear. */
+ uint64_t obj_addr;
+ /* Offset to pass to mmap() on the DRM fd to map this buffer. */
+ uint64_t fake_offset;
+ /* Requested size in bytes (the template's width0). */
+ uint64_t bo_size;
+};
+
+/* Downcast from the generic resource to the driver resource; valid because
+ * the pipe_resource is the first member of struct rkt_resource. */
+static inline struct rkt_resource *
+rkt_resource(struct pipe_resource *p)
+{
+   struct rkt_resource *rsc = (struct rkt_resource *)p;
+
+   return rsc;
+}
+
+/* Create a screen for the device behind the DRM file descriptor `fd`.
+ * Returns NULL on allocation failure. */
+struct pipe_screen *rkt_screen_create(int fd,
+ const struct pipe_screen_config *config,
+ struct renderonly *ro);
+
+#endif /* RKT_SCREEN_H */
diff --git a/src/gallium/drivers/rocket/rkt_ml.c b/src/gallium/drivers/rocket/rkt_ml.c
new file mode 100644
index 00000000000..129f76f7a43
--- /dev/null
+++ b/src/gallium/drivers/rocket/rkt_ml.c
@@ -0,0 +1,631 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "pipe/p_state.h"
+#include "util/macros.h"
+#include "util/u_dynarray.h"
+#include "util/u_inlines.h"
+
+#include <xf86drm.h>
+
+#include "drm-uapi/rocket_accel.h"
+
+#include "rkt_coefs.h"
+#include "rkt_ml.h"
+#include "rkt_regcmd.h"
+#include "rkt_task.h"
+
+/*
+ * Debug helper: write `size` bytes starting at `ptr + offset` to the file
+ * "mesa-<name>-<operation_nr>-<suboperation_nr>.bin" in the current working
+ * directory (both numbers zero-padded to three digits).
+ *
+ * Failures to open, write or close the file are reported through DBG() and
+ * otherwise ignored, since dumping is a best-effort debugging aid.
+ */
+void
+rkt_dump_buffer(const uint8_t *ptr, char *name, int operation_nr,
+                int suboperation_nr, int offset, unsigned size)
+{
+   char path[255];
+
+   snprintf(path, sizeof(path), "mesa-%s-%03u-%03u.bin", name,
+            (unsigned)operation_nr, (unsigned)suboperation_nr);
+
+   FILE *f = fopen(path, "wb");
+   if (!f) {
+      DBG("Error opening %s: %s\n", path, strerror(errno));
+      return;
+   }
+
+   if (fwrite(ptr + offset, 1, size, f) != size)
+      DBG("Error in writing to file: %s\n", strerror(errno));
+
+   /* fclose() flushes; a failure here means data may not have hit the file. */
+   if (fclose(f) != 0)
+      DBG("Error closing %s: %s\n", path, strerror(errno));
+}
+
+/*
+ * Make sure the tensor slot `idx` of the subgraph holds a buffer of `size`
+ * bytes. An already populated slot is left alone (its size is asserted to
+ * match); an empty one gets a freshly created buffer.
+ */
+static void
+create_tensor(struct rkt_ml_subgraph *subgraph, unsigned idx,
+              unsigned size)
+{
+   struct pipe_resource **slots = util_dynarray_begin(&subgraph->tensors);
+
+   assert(idx < util_dynarray_num_elements(&subgraph->tensors,
+                                           struct pipe_resource *));
+
+   if (slots[idx] != NULL) {
+      assert(size == pipe_buffer_size(slots[idx]));
+      return;
+   }
+
+   slots[idx] = pipe_buffer_create(subgraph->base.context->screen, 0,
+                                   PIPE_USAGE_DEFAULT, size);
+}
+
+/* Returns the resource stored in tensor slot `idx` of the subgraph, cast to
+ * the driver type. The result is NULL for a slot that has no buffer yet. */
+struct rkt_resource *
+rkt_get_tensor(struct rkt_ml_subgraph *subgraph,
+               unsigned idx)
+{
+   struct pipe_resource **slot =
+      util_dynarray_element(&subgraph->tensors, struct pipe_resource *, idx);
+
+   return rkt_resource(*slot);
+}
+
+/* A convolution is handled as depthwise only when it is flagged as such and
+ * both its input and output have more than one channel. */
+bool
+rkt_is_depthwise(const struct pipe_ml_operation *poperation)
+{
+   if (!poperation->conv.depthwise)
+      return false;
+
+   const unsigned in_channels = poperation->input_tensors[0]->dims[3];
+   const unsigned out_channels = poperation->output_tensors[0]->dims[3];
+
+   return in_channels > 1 && out_channels > 1;
+}
+
+/* Size in bytes of the output buffer of `operation`: width * height times the
+ * channel count rounded up to whole FEATURE_ATOMIC_SIZE groups, doubled. */
+static unsigned
+calc_raw_output_size(struct rkt_operation *operation)
+{
+   unsigned channel_groups =
+      DIV_ROUND_UP(operation->output_channels, FEATURE_ATOMIC_SIZE) * 2;
+   unsigned group_size = FEATURE_ATOMIC_SIZE;
+   unsigned size = operation->output_width * operation->output_height *
+                   channel_groups * group_size;
+
+   return size;
+}
+
+/*
+ * Generate the register command streams for every task of `operation` and
+ * pack them into a single buffer, operation->regcmd.
+ *
+ * Each task's stream is produced by rkt_fill_regcmd() as an array of 64-bit
+ * entries and is placed at a 64-byte-aligned offset in the buffer. Every task
+ * except the last is patched to point at the stream of the following task.
+ * The address and entry count of each stream are recorded in the task.
+ */
+static void
+compile_operation(struct rkt_ml_subgraph *subgraph,
+ struct rkt_operation *operation)
+{
+ struct pipe_context *pcontext = subgraph->base.context;
+ unsigned regcfg_total_size = 0;
+ struct util_dynarray *regcfgs;
+ struct pipe_transfer *transfer = NULL;
+ unsigned num_tasks =
+ util_dynarray_num_elements(&operation->tasks, struct split_task);
+
+ /* One temporary dynarray per task. NOTE(review): the allocation result is
+ * not checked. */
+ regcfgs = calloc(num_tasks, sizeof(struct util_dynarray));
+
+ /* First pass: generate all streams and add up their aligned sizes. */
+ for (int i = 0; i < num_tasks; i++) {
+ util_dynarray_init(&regcfgs[i], NULL);
+ rkt_fill_regcmd(subgraph, operation, &regcfgs[i], i);
+
+ unsigned size =
+ util_dynarray_num_elements(&regcfgs[i], uint64_t) * sizeof(uint64_t);
+ regcfg_total_size += ALIGN(size, 64);
+ }
+
+ operation->regcmd = pipe_buffer_create(pcontext->screen, 0,
+ PIPE_USAGE_DEFAULT, regcfg_total_size);
+ uint8_t *regcmd =
+ pipe_buffer_map(pcontext, operation->regcmd, PIPE_MAP_WRITE, &transfer);
+
+ /* Second pass: patch, copy and record each stream. */
+ unsigned regcmd_offset = 0;
+ for (int i = 0; i < num_tasks; i++) {
+ /* Number of 64-bit entries in this task's stream. */
+ unsigned size = util_dynarray_num_elements(&regcfgs[i], uint64_t);
+ struct split_task *task =
+ util_dynarray_element(&operation->tasks, struct split_task, i);
+
+ if (i < num_tasks - 1) {
+ /* Patch next address and amount of regs to fetch, positions are relative
+ * to end */
+ unsigned reg_count = util_dynarray_num_elements(&regcfgs[i], uint64_t);
+ uint64_t *next_address_reg =
+ util_dynarray_element(&regcfgs[i], uint64_t, reg_count - 4);
+ uint64_t *reg_count_reg =
+ util_dynarray_element(&regcfgs[i], uint64_t, reg_count - 3);
+
+ /* Device address at which the next task's stream will be placed. */
+ uint64_t addr = rkt_resource(operation->regcmd)->phys_addr +
+ regcmd_offset + ALIGN(size * sizeof(uint64_t), 64);
+ *next_address_reg |= addr << 16;
+
+ /* Fetch count derived from the next stream's length: its last 4
+ * entries are excluded, the rest is halved and rounded up to an even
+ * number. */
+ unsigned regs_to_fetch =
+ util_dynarray_num_elements(&regcfgs[i + 1], uint64_t);
+ regs_to_fetch -= 4;
+ regs_to_fetch = ALIGN(regs_to_fetch / 2, 2);
+ *reg_count_reg |= regs_to_fetch << 16;
+ }
+
+ memcpy(regcmd + regcmd_offset, util_dynarray_begin(&regcfgs[i]),
+ size * sizeof(uint64_t));
+ util_dynarray_fini(&regcfgs[i]);
+
+ task->regcfg_amount = size;
+ task->regcfg_addr =
+ rkt_resource(operation->regcmd)->phys_addr + regcmd_offset;
+
+ /* NOTE(review): the dump covers 4 entries more than this task's stream;
+ * the reason is not evident from this function. */
+ if (DBG_ENABLED(ROCKET_DBG_DUMP_BOS))
+ rkt_dump_buffer(regcmd, "regcmd", 0, i, regcmd_offset,
+ (size + 4) * sizeof(uint64_t));
+
+ regcmd_offset += ALIGN(size * sizeof(uint64_t), 64);
+ }
+
+ pipe_buffer_unmap(pcontext, transfer);
+
+ /* NOTE(review): every regcfgs[i] was already finalized in the loop above,
+ * so this second pass looks redundant (presumably harmless). */
+ for (int i = 0; i < num_tasks; i++) {
+ util_dynarray_fini(&regcfgs[i]);
+ }
+
+ free(regcfgs);
+}
+
+/*
+ * Translate a convolution from the frontend description into `operation`:
+ * copies the convolution parameters and the geometry and quantization of the
+ * input, output and weight tensors, then builds the weight and bias buffers.
+ * Also initializes the operation's (empty) task list.
+ */
+static void
+lower_convolution(struct rkt_ml_subgraph *subgraph,
+                  const struct pipe_ml_operation *poperation,
+                  struct rkt_operation *operation)
+{
+   const struct pipe_tensor *in = poperation->input_tensors[0];
+   const struct pipe_tensor *out = poperation->output_tensors[0];
+   const struct pipe_tensor *kernel = poperation->conv.weight_tensor;
+
+   util_dynarray_init(&operation->tasks, NULL);
+
+   /* Convolution parameters */
+   operation->depthwise = rkt_is_depthwise(poperation);
+   operation->padding_same = poperation->conv.padding_same;
+   operation->stride = poperation->conv.stride_x;
+
+   /* Input tensor */
+   operation->input_index = in->index;
+   operation->input_width = in->dims[1];
+   operation->input_height = in->dims[2];
+   operation->input_channels = in->dims[3];
+   operation->input_zero_point = in->zero_point;
+   operation->input_scale = in->scale;
+
+   /* Output tensor */
+   operation->output_index = out->index;
+   operation->output_width = out->dims[1];
+   operation->output_height = out->dims[2];
+   operation->output_channels = out->dims[3];
+   operation->output_zero_point = out->zero_point;
+   operation->output_scale = out->scale;
+
+   /* Weight tensor */
+   operation->weights_width = kernel->dims[1];
+   operation->weights_height = kernel->dims[2];
+   operation->weights_zero_point = kernel->zero_point;
+   operation->weights_scale = kernel->scale;
+
+   /* Coefficient buffers; the bias pass also decides the truncation. */
+   operation->weights = rkt_fill_weights(subgraph, poperation);
+   operation->biases =
+      rkt_fill_biases(subgraph, poperation, &operation->truncate_bits);
+}
+
+/* Returns the first operation, in list order, whose input is tensor
+ * `tensor_index`, or NULL if no operation reads it. */
+static struct rkt_operation *
+find_first_consumer(struct rkt_ml_subgraph *subgraph, unsigned tensor_index)
+{
+   struct rkt_operation *ops = util_dynarray_begin(&subgraph->operations);
+   unsigned num_ops =
+      util_dynarray_num_elements(&subgraph->operations, struct rkt_operation);
+
+   for (unsigned i = 0; i < num_ops; i++) {
+      if (ops[i].input_index == tensor_index)
+         return &ops[i];
+   }
+
+   return NULL;
+}
+
+/* Returns the first operation, in list order, whose output is tensor
+ * `tensor_index`, or NULL if no operation writes it. */
+static struct rkt_operation *
+find_producer(struct rkt_ml_subgraph *subgraph,
+              unsigned tensor_index)
+{
+   struct rkt_operation *ops = util_dynarray_begin(&subgraph->operations);
+   unsigned num_ops =
+      util_dynarray_num_elements(&subgraph->operations, struct rkt_operation);
+
+   for (unsigned i = 0; i < num_ops; i++) {
+      if (ops[i].output_index == tensor_index)
+         return &ops[i];
+   }
+
+   return NULL;
+}
+
+/*
+ * Returns the number of tensor slots needed for the given operations: one
+ * more than the highest tensor index referenced by any of them (first input,
+ * first output, plus weights and bias for convolutions or the second input
+ * for additions).
+ */
+static unsigned
+count_tensors(const struct pipe_ml_operation *poperations,
+              unsigned count)
+{
+   unsigned highest = 0;
+
+   for (unsigned i = 0; i < count; i++) {
+      const struct pipe_ml_operation *op = &poperations[i];
+
+      highest = MAX2(highest, op->input_tensors[0]->index);
+      highest = MAX2(highest, op->output_tensors[0]->index);
+
+      switch (op->type) {
+      case PIPE_ML_OPERATION_TYPE_CONVOLUTION:
+         highest = MAX2(highest, op->conv.weight_tensor->index);
+         highest = MAX2(highest, op->conv.bias_tensor->index);
+         break;
+      case PIPE_ML_OPERATION_TYPE_ADD:
+         highest = MAX2(highest, op->input_tensors[1]->index);
+         break;
+      default:
+         DBG("poperation->type %d\n", op->type);
+         unreachable("Unsupported ML operation type");
+      }
+   }
+
+   /* Indices are zero-based, so the slot count is the highest index plus 1. */
+   return highest + 1;
+}
+
+/* A tensor is supported only when it carries neither a per-axis scale array
+ * nor a per-axis zero-point array. */
+static bool
+tensor_quantization_supported(struct pipe_tensor *tensor)
+{
+   /*
+    * Per-axis quantization not supported, for details see:
+    * https://ai.google.dev/edge/litert/models/quantization_spec#per-axis_vs_per-tensor
+    */
+   if (tensor->scales != NULL)
+      return false;
+
+   return tensor->zero_points == NULL;
+}
+
+/*
+ * Tells the frontend whether `operation` can run on this driver.
+ *
+ * Convolutions are accepted when none of their tensors uses per-axis
+ * quantization and both dilation factors are 1. Additions are accepted when
+ * neither input tensor has a resource attached. Everything else is rejected.
+ */
+bool
+rkt_ml_operation_supported(struct pipe_context *pcontext,
+                           const struct pipe_ml_operation *operation)
+{
+   if (operation->type == PIPE_ML_OPERATION_TYPE_ADD) {
+      return operation->input_tensors[0]->resource == NULL &&
+             operation->input_tensors[1]->resource == NULL;
+   }
+
+   if (operation->type != PIPE_ML_OPERATION_TYPE_CONVOLUTION)
+      return false;
+
+   // Dilation and per-axis quantization not yet implemented
+   struct pipe_tensor *conv_tensors[] = {
+      operation->input_tensors[0],
+      operation->conv.weight_tensor,
+      operation->conv.bias_tensor,
+      operation->output_tensors[0],
+   };
+
+   for (unsigned i = 0; i < sizeof(conv_tensors) / sizeof(conv_tensors[0]);
+        i++) {
+      if (!tensor_quantization_supported(conv_tensors[i]))
+         return false;
+   }
+
+   return operation->conv.dilation_width_factor == 1 &&
+          operation->conv.dilation_height_factor == 1;
+}
+
+/*
+ * Build a subgraph from `count` frontend operations.
+ *
+ * Steps:
+ *  1. Lower: every convolution becomes one rkt_operation; every addition is
+ *     fused into the convolution that produces its first input.
+ *  2. Create the input and output tensor buffers of every operation.
+ *  3. Compile: split each operation into tasks and generate its register
+ *     command buffer.
+ *
+ * Returns the new subgraph, or NULL if the subgraph or its tensor array
+ * cannot be allocated.
+ */
+struct pipe_ml_subgraph *
+rkt_ml_subgraph_create(struct pipe_context *pcontext,
+                       const struct pipe_ml_operation *poperations,
+                       unsigned count)
+{
+   struct rkt_ml_subgraph *subgraph;
+   unsigned tensor_count;
+
+   subgraph = calloc(1, sizeof(*subgraph));
+   if (!subgraph)
+      return NULL;
+
+   subgraph->base.context = pcontext;
+
+   tensor_count = count_tensors(poperations, count);
+   util_dynarray_init(&subgraph->tensors, NULL);
+   util_dynarray_init(&subgraph->operations, NULL);
+   if (!util_dynarray_resize(&subgraph->tensors, struct pipe_resource *,
+                             tensor_count)) {
+      /* Nothing else has been created yet: release the subgraph itself. */
+      util_dynarray_fini(&subgraph->tensors);
+      free(subgraph);
+      return NULL;
+   }
+   /* All slots start out empty; buffers are created on demand below. */
+   memset(util_dynarray_begin(&subgraph->tensors), 0, subgraph->tensors.size);
+
+   /* Lower */
+   for (unsigned i = 0; i < count; i++) {
+      struct rkt_operation operation = {0};
+      operation.add_tensor = -1;
+
+      switch (poperations[i].type) {
+      case PIPE_ML_OPERATION_TYPE_CONVOLUTION:
+         lower_convolution(subgraph, &poperations[i], &operation);
+         util_dynarray_append(&subgraph->operations, struct rkt_operation,
+                              operation);
+         break;
+      case PIPE_ML_OPERATION_TYPE_ADD: {
+         /* Fuse tensor addition into convolution*/
+         struct rkt_operation *input_op_1 =
+            find_producer(subgraph, poperations[i].input_tensors[1]->index);
+         struct rkt_operation *input_op_2 =
+            find_producer(subgraph, poperations[i].input_tensors[0]->index);
+
+         /* NOTE(review): the first assert contradicts the NULL branch just
+          * below, which is only reachable in builds without asserts. */
+         assert(input_op_1);
+         assert(input_op_2);
+
+         if (input_op_1 == NULL) {
+            /* Graph input */
+            input_op_2->add_tensor = poperations[i].input_tensors[1]->index;
+         } else {
+            input_op_1->addition_input = true;
+            input_op_2->add_tensor = input_op_1->output_index;
+         }
+
+         /* The fused convolution now writes the addition's output tensor. */
+         input_op_2->output_index = poperations[i].output_tensors[0]->index;
+         input_op_2->addition_offset =
+            0x80 - poperations[i].input_tensors[1]->zero_point;
+         input_op_2->addition_scale = poperations[i].input_tensors[1]->scale;
+
+         break;
+      }
+      default:
+         DBG("poperation->type %d\n", poperations[i].type);
+         unreachable("Unsupported ML operation type");
+      }
+   }
+
+   /* Create input tensors */
+   util_dynarray_foreach (&subgraph->operations, struct rkt_operation,
+                          operation) {
+      unsigned input_channels_1 =
+         DIV_ROUND_UP(operation->input_channels, FEATURE_ATOMIC_SIZE) * 2;
+      unsigned input_channels_2 = FEATURE_ATOMIC_SIZE;
+      unsigned input_size = operation->input_width * operation->input_height *
+                            input_channels_1 * input_channels_2;
+
+      create_tensor(subgraph, operation->input_index, input_size);
+   }
+
+   /* Create output tensors */
+   util_dynarray_foreach (&subgraph->operations, struct rkt_operation,
+                          operation) {
+      /* Already created as the input of another operation. */
+      struct rkt_resource *res =
+         rkt_get_tensor(subgraph, operation->output_index);
+      if (res != NULL)
+         continue;
+
+      create_tensor(subgraph, operation->output_index,
+                    calc_raw_output_size(operation));
+   }
+
+   /* Compile */
+   util_dynarray_foreach (&subgraph->operations, struct rkt_operation,
+                          operation) {
+      rkt_split_tasks(subgraph, operation);
+      compile_operation(subgraph, operation);
+   }
+
+   return &subgraph->base;
+}
+
+/*
+ * Appends one job to `jobs`. The job receives its own heap copies of the BO
+ * handle arrays and takes ownership of `tasks`, so that each job can be
+ * released independently once the submission is done.
+ *
+ * Returns false if memory runs out; `tasks` is freed in that case.
+ */
+static bool
+rkt_append_job(struct util_dynarray *jobs, const uint32_t *in_handles,
+               unsigned num_inputs, uint32_t out_handle,
+               struct drm_rocket_task *tasks, unsigned task_count)
+{
+   uint32_t *in_bo_handles = calloc(num_inputs, sizeof(*in_bo_handles));
+   uint32_t *out_bo_handles = malloc(sizeof(*out_bo_handles));
+
+   if (in_bo_handles == NULL || out_bo_handles == NULL) {
+      free(in_bo_handles);
+      free(out_bo_handles);
+      free(tasks);
+      return false;
+   }
+
+   for (unsigned i = 0; i < num_inputs; i++)
+      in_bo_handles[i] = in_handles[i];
+   out_bo_handles[0] = out_handle;
+
+   struct drm_rocket_job job = {0};
+   job.task_struct_size = sizeof(struct drm_rocket_task);
+   job.in_bo_handles = (uint64_t)(uintptr_t)in_bo_handles;
+   job.in_bo_handle_count = num_inputs;
+   job.out_bo_handles = (uint64_t)(uintptr_t)out_bo_handles;
+   job.out_bo_handle_count = 1;
+   job.tasks = (uint64_t)(uintptr_t)tasks;
+   job.task_count = task_count;
+   util_dynarray_append(jobs, struct drm_rocket_job, job);
+
+   return true;
+}
+
+/*
+ * Appends the job(s) needed to run `operation`: a single job holding all tasks
+ * when the weights can be reused, otherwise one job per task.
+ *
+ * Returns false if memory runs out. Jobs appended so far remain in `jobs` and
+ * must still be released by the caller.
+ */
+static bool
+rkt_append_operation_jobs(struct rkt_ml_subgraph *subgraph,
+                          struct rkt_operation *operation,
+                          struct util_dynarray *jobs)
+{
+   uint32_t in_handles[2] = {0};
+   unsigned num_inputs = operation->add_tensor != -1 ? 2 : 1;
+   uint32_t out_handle;
+
+   in_handles[0] = rkt_get_tensor(subgraph, operation->input_index)->handle;
+
+   if (operation->add_tensor != -1)
+      in_handles[1] = rkt_get_tensor(subgraph, operation->add_tensor)->handle;
+
+   out_handle = rkt_get_tensor(subgraph, operation->output_index)->handle;
+
+   if (operation->reuse_weights_cbuf) {
+      /* Submit all tasks to the same core, so weights can be reused */
+      unsigned num_tasks =
+         util_dynarray_num_elements(&operation->tasks, struct split_task);
+      struct drm_rocket_task *tasks = calloc(num_tasks, sizeof(*tasks));
+      unsigned task_count = 0;
+
+      /* calloc(0, ...) may return NULL without being an error. */
+      if (tasks == NULL && num_tasks > 0)
+         return false;
+
+      util_dynarray_foreach (&operation->tasks, struct split_task, task) {
+         tasks[task_count].regcmd = task->regcfg_addr;
+         tasks[task_count].regcmd_count = task->regcfg_amount;
+         task_count++;
+      }
+
+      return rkt_append_job(jobs, in_handles, num_inputs, out_handle, tasks,
+                            task_count);
+   }
+
+   /* Spread tasks among cores, for parallelism */
+   util_dynarray_foreach (&operation->tasks, struct split_task, task) {
+      struct drm_rocket_task *ktask = calloc(1, sizeof(*ktask));
+      if (ktask == NULL)
+         return false;
+
+      ktask->regcmd = task->regcfg_addr;
+      ktask->regcmd_count = task->regcfg_amount;
+
+      if (!rkt_append_job(jobs, in_handles, num_inputs, out_handle, ktask, 1))
+         return false;
+   }
+
+   return true;
+}
+
+/**
+ * Uploads the input data into the subgraph's input tensors, converting it to
+ * the layout the hardware reads, and submits all operations to the kernel.
+ *
+ * \param pcontext      context used for buffer mapping and copies
+ * \param psubgraph     subgraph created by rkt_ml_subgraph_create()
+ * \param inputs_count  number of entries in input_idxs / inputs
+ * \param input_idxs    tensor index of each input
+ * \param inputs        CPU pointer to the data of each input
+ * \param is_signed     not used by this implementation
+ */
+void
+rkt_ml_subgraph_invoke(struct pipe_context *pcontext,
+                       struct pipe_ml_subgraph *psubgraph,
+                       unsigned inputs_count, unsigned input_idxs[],
+                       void *inputs[], bool is_signed[])
+{
+   struct rkt_screen *screen = rkt_screen(pcontext->screen);
+   struct rkt_ml_subgraph *subgraph = (struct rkt_ml_subgraph *)(psubgraph);
+
+   DBG("Processing input\n");
+
+   for (unsigned i = 0; i < inputs_count; i++) {
+      struct rkt_operation *operation =
+         find_first_consumer(subgraph, input_idxs[i]);
+      struct pipe_resource *input =
+         &rkt_get_tensor(subgraph, input_idxs[i])->base;
+      unsigned input_channels = operation->input_channels;
+      unsigned output_channels = operation->output_channels;
+
+      struct rkt_resource *input_tensor =
+         rkt_get_tensor(subgraph, operation->input_index);
+      if (output_channels == 1 && input_channels == 1 &&
+          !operation->addition_input && (operation->add_tensor == -1)) {
+         /*
+          * NOTE(review): this path copies between two tensor resources and
+          * never reads inputs[i]; confirm that the caller has already placed
+          * the data in the tensor at input_idxs[i].
+          */
+         pipe_buffer_copy(pcontext, &input_tensor->base, input, 0, 0,
+                          pipe_buffer_size(input));
+      } else {
+         unsigned input_width = operation->input_width;
+         unsigned input_height = operation->input_height;
+         unsigned zero_point = operation->input_zero_point;
+         struct pipe_transfer *transfer_out;
+         uint8_t(*input_in)[input_height][input_channels] = inputs[i];
+         uint8_t *map = pipe_buffer_map(pcontext, &input_tensor->base,
+                                        PIPE_MAP_WRITE, &transfer_out);
+
+         DBG("Converting data\n");
+
+         /*
+          * From the NVDLA docs: "For int8, one element of data refers to an 8-bit
+          * signed integer." But only when transposing do we seem to need to
+          * convert to signed. The DMA unit seems to be able to convert from
+          * unsigned to signed though.
+          */
+         if (input_channels == 1) {
+            /* Copy as-is, padding each line with the zero point. */
+            unsigned n = 0;
+            for (unsigned x = 0; x < input_width; x++) {
+               for (unsigned y = 0; y < MAX2(input_height, FEATURE_ATOMIC_SIZE);
+                    y++) {
+                  if (y < input_height)
+                     map[n++] = input_in[x][y][0];
+                  else
+                     map[n++] = zero_point;
+               }
+            }
+         } else {
+            /*
+             * Regroup channels into blocks of FEATURE_ATOMIC_SIZE, padding the
+             * last block with the zero point, and rebase to signed.
+             */
+            unsigned n = 0;
+            for (unsigned u = 0;
+                 u < DIV_ROUND_UP(input_channels, FEATURE_ATOMIC_SIZE); u++) {
+               for (unsigned x = 0; x < input_width; x++) {
+                  for (unsigned y = 0; y < input_height; y++) {
+                     for (unsigned c = 0; c < FEATURE_ATOMIC_SIZE; c++) {
+                        unsigned input_channel = c + u * FEATURE_ATOMIC_SIZE;
+                        if (input_channel < input_channels)
+                           map[n++] = input_in[x][y][input_channel] - 0x80;
+                        else
+                           map[n++] = zero_point - 0x80;
+                     }
+                  }
+               }
+            }
+         }
+
+         if (DBG_ENABLED(ROCKET_DBG_DUMP_BOS))
+            rkt_dump_buffer(map, "input", 0, 0, 0,
+                            rkt_get_tensor(subgraph, input_idxs[i])->bo_size);
+
+         DBG("Converted data\n");
+
+         pipe_buffer_unmap(pcontext, transfer_out);
+      }
+   }
+   DBG("Processed input\n");
+
+   DBG("Submitting graph\n");
+
+   struct util_dynarray jobs = {0};
+   util_dynarray_init(&jobs, NULL);
+
+   bool jobs_built = true;
+   util_dynarray_foreach (&subgraph->operations, struct rkt_operation,
+                          operation) {
+      if (!rkt_append_operation_jobs(subgraph, operation, &jobs)) {
+         jobs_built = false;
+         break;
+      }
+   }
+
+   if (jobs_built) {
+      struct drm_rocket_submit submit = {0};
+      submit.job_struct_size = sizeof(struct drm_rocket_job);
+      submit.jobs = (uint64_t)(uintptr_t)util_dynarray_begin(&jobs);
+      submit.job_count =
+         util_dynarray_num_elements(&jobs, struct drm_rocket_job);
+
+      int ret = drmIoctl(screen->fd, DRM_IOCTL_ROCKET_SUBMIT, &submit);
+      if (ret != 0)
+         DBG("Graph submission failed: %d\n", ret);
+      assert(ret == 0);
+
+      DBG("Submitted graph\n");
+   } else {
+      DBG("Out of memory while building jobs, nothing submitted\n");
+   }
+
+   /* Every job owns its handle arrays and its task array. */
+   util_dynarray_foreach (&jobs, struct drm_rocket_job, job) {
+      free((void *)(uintptr_t)job->in_bo_handles);
+      free((void *)(uintptr_t)job->out_bo_handles);
+      free((void *)(uintptr_t)job->tasks);
+   }
+   util_dynarray_fini(&jobs);
+}
+
+/**
+ * Copies the requested output tensors into caller memory.
+ *
+ * The raw tensor is read as [channel group][y][x][FEATURE_ATOMIC_SIZE] and
+ * written out as [y][x][channel], adding 0x80 to every value.
+ */
+void
+rkt_ml_subgraph_read_outputs(struct pipe_context *pcontext,
+                             struct pipe_ml_subgraph *psubgraph,
+                             unsigned outputs_count,
+                             unsigned output_idxs[], void *outputs[],
+                             bool is_signed[])
+{
+   struct rkt_ml_subgraph *subgraph = (struct rkt_ml_subgraph *)(psubgraph);
+
+   DBG("Processing output\n");
+
+   for (int i = 0; i < outputs_count; i++) {
+      struct rkt_operation *producer = find_producer(subgraph, output_idxs[i]);
+      struct rkt_resource *tensor = rkt_get_tensor(subgraph, output_idxs[i]);
+      struct pipe_transfer *xfer = NULL;
+
+      DBG("Before pipe_buffer_map\n");
+      uint8_t *raw =
+         pipe_buffer_map(pcontext, &tensor->base, PIPE_MAP_READ, &xfer);
+      DBG("After pipe_buffer_map\n");
+
+      DBG("Converting data\n");
+
+      /* Typed views over the mapped tensor and the caller's buffer. */
+      uint8_t(*src)[producer->output_height][producer->output_width]
+                   [FEATURE_ATOMIC_SIZE] = (void *)raw;
+      uint8_t(*dst)[producer->output_width][producer->output_channels] =
+         (void *)outputs[i];
+
+      if (DBG_ENABLED(ROCKET_DBG_DUMP_BOS))
+         rkt_dump_buffer(raw, "output", 0, 0, 0, tensor->bo_size);
+
+      for (int oc = 0; oc < producer->output_channels; oc++) {
+         /* Position of this channel inside the grouped raw layout. */
+         unsigned lane = oc % FEATURE_ATOMIC_SIZE;
+         unsigned group = oc / FEATURE_ATOMIC_SIZE;
+
+         for (int x = 0; x < producer->output_width; x++) {
+            for (int y = 0; y < producer->output_height; y++)
+               dst[y][x][oc] = src[group][y][x][lane] + 0x80;
+         }
+      }
+
+      DBG("Converted data\n");
+
+      pipe_buffer_unmap(pcontext, xfer);
+   }
+
+   DBG("Processed output\n");
+}
+
+/* Releases the task list and the buffer references held by one operation. */
+static void
+free_operation(struct rkt_operation *operation)
+{
+   struct pipe_resource **owned[] = {
+      &operation->regcmd,
+      &operation->weights,
+      &operation->biases,
+   };
+
+   util_dynarray_fini(&operation->tasks);
+
+   for (unsigned i = 0; i < sizeof(owned) / sizeof(owned[0]); i++)
+      pipe_resource_reference(owned[i], NULL);
+}
+
+/**
+ * Releases everything owned by the subgraph: the per-operation buffers and
+ * task lists, the tensors, and the subgraph structure itself.
+ */
+void
+rkt_ml_subgraph_destroy(struct pipe_context *context,
+                        struct pipe_ml_subgraph *psubgraph)
+{
+   struct rkt_ml_subgraph *subgraph = (struct rkt_ml_subgraph *)(psubgraph);
+
+   util_dynarray_foreach (&subgraph->operations, struct rkt_operation, operation)
+      free_operation(operation);
+   util_dynarray_fini(&subgraph->operations);
+
+   /*
+    * `tensor` points at an array slot and is therefore never NULL, so no check
+    * is needed. Slots that were never filled hold NULL, which
+    * pipe_resource_reference() accepts.
+    */
+   util_dynarray_foreach (&subgraph->tensors, struct pipe_resource *, tensor)
+      pipe_resource_reference(tensor, NULL);
+   util_dynarray_fini(&subgraph->tensors);
+
+   free(subgraph);
+}
diff --git a/src/gallium/drivers/rocket/rkt_ml.h b/src/gallium/drivers/rocket/rkt_ml.h
new file mode 100644
index 00000000000..04dea3d1475
--- /dev/null
+++ b/src/gallium/drivers/rocket/rkt_ml.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef RKT_ML_H
+#define RKT_ML_H
+
+#include <util/u_dynarray.h>
+
+#include "rkt_device.h"
+
+// http://nvdla.org/hw/v1/ias/unit_description.html#convolution-buffer
+#define CBUF_BANK_SIZE 32768 /* bytes in one convolution buffer bank */
+#define CBUF_BANKS 12
+#define CBUF_ENTRIES_PER_BANK 256
+#define CBUF_ENTRY_SIZE (CBUF_BANK_SIZE / CBUF_ENTRIES_PER_BANK) /* 32768 / 256 = 128 bytes per entry */
+#define FEATURE_ATOMIC_SIZE 16 /* feature data is laid out in groups of this many channels */
+#define WEIGHT_ATOMIC_SIZE 32
+#define ATOMIC_K_SIZE 16
+
+struct split_task { /* One hardware task of an operation; rkt_operation.tasks holds these */
+ unsigned num;
+
+ unsigned top_slice;
+ unsigned bottom_slice;
+ unsigned num_overlap_slices;
+ unsigned num_retain_slices;
+ unsigned convolutions;
+
+ unsigned pad_top;
+ unsigned pad_bottom;
+ unsigned pad_left;
+ unsigned pad_right;
+
+ unsigned stride_x;
+ unsigned stride_y;
+
+ unsigned input_width;
+ unsigned input_height;
+ unsigned input_channels;
+ unsigned input_channels_real; /* a value of 1 selects the single-channel register setup */
+ unsigned input_zero_point;
+ float input_scale;
+ unsigned input_data_entries;
+ int input_line_stride;
+ int input_surface_stride;
+ unsigned input_offset; /* added to the input tensor's address for this task */
+
+ unsigned output_width;
+ unsigned output_height;
+ unsigned output_channels;
+ unsigned output_channels_real;
+ unsigned output_zero_point;
+ float output_scale;
+ int output_surface_stride;
+ unsigned output_offset; /* added to the output tensor's address for this task */
+
+ unsigned weights_width;
+ unsigned weights_height;
+ unsigned weights_kernels;
+ unsigned weights_zero_point;
+ float weights_scale;
+
+ unsigned input_banks; /* programmed as the CBUF data bank count */
+ unsigned weights_banks; /* programmed as the CBUF weight bank count */
+
+ unsigned atomic_count;
+ unsigned surfaces_per_row;
+
+ unsigned regcfg_amount; /* handed to the kernel as drm_rocket_task.regcmd_count */
+ uint32_t regcfg_addr; /* handed to the kernel as drm_rocket_task.regcmd */
+};
+
+struct rkt_operation { /* One lowered convolution, optionally with an addition fused into it */
+ struct pipe_resource *regcmd; /* reference dropped in free_operation() */
+ struct pipe_resource *weights; /* reference dropped in free_operation() */
+ struct pipe_resource *biases; /* reference dropped in free_operation() */
+
+ bool depthwise;
+ bool reuse_weights_cbuf; /* when set, all tasks are submitted in a single job */
+ unsigned truncate_bits;
+ bool padding_same;
+ unsigned stride;
+
+ bool addition_input; /* set when this operation's output is the second input of a fused addition */
+ int addition_offset; /* 0x80 minus the zero point of the addition's second input */
+ float addition_scale; /* scale of the addition's second input */
+
+ unsigned input_index; /* tensor index, as passed to rkt_get_tensor() */
+ unsigned input_width;
+ unsigned input_height;
+ unsigned input_channels;
+ uint8_t input_zero_point;
+ float input_scale;
+
+ unsigned output_index; /* tensor index; replaced by the addition's output when one is fused */
+ unsigned output_width;
+ unsigned output_height;
+ unsigned output_channels;
+ uint8_t output_zero_point;
+ float output_scale;
+
+ unsigned weights_width;
+ unsigned weights_height;
+ uint8_t weights_zero_point;
+ float weights_scale;
+
+ int add_tensor; /* tensor index of the fused addition's other operand, or -1 if none */
+
+ struct util_dynarray tasks; /* struct split_task */
+};
+
+struct rkt_ml_subgraph { /* Driver-side subgraph; base is the first member so the two pointer types can be cast to each other */
+ struct pipe_ml_subgraph base;
+
+ struct util_dynarray operations; /* struct rkt_operation, one per lowered convolution */
+ struct util_dynarray tensors; /* struct pipe_resource *, NULL until the tensor is created */
+};
+
+bool
+rkt_ml_operation_supported(struct pipe_context *pcontext, const struct pipe_ml_operation *operation);
+
+struct pipe_ml_subgraph *
+rkt_ml_subgraph_create(struct pipe_context *pcontext,
+ const struct pipe_ml_operation *poperations,
+ unsigned count); /* lowers and compiles the operations; may return NULL */
+
+void rkt_ml_subgraph_invoke(struct pipe_context *pcontext,
+ struct pipe_ml_subgraph *psubgraph,
+ unsigned inputs_count, unsigned input_idxs[],
+ void *inputs[], bool is_signed[]); /* uploads the inputs and submits the graph */
+
+void rkt_ml_subgraph_read_outputs(struct pipe_context *pcontext,
+ struct pipe_ml_subgraph *psubgraph,
+ unsigned outputs_count,
+ unsigned output_idxs[], void *outputs[],
+ bool is_signed[]); /* converts the output tensors into outputs[] */
+
+void rkt_ml_subgraph_destroy(struct pipe_context *context,
+ struct pipe_ml_subgraph *psubgraph); /* frees the subgraph and everything it owns */
+
+struct rkt_resource *rkt_get_tensor(struct rkt_ml_subgraph *subgraph,
+ unsigned idx);
+
+bool rkt_is_depthwise(const struct pipe_ml_operation *poperation);
+
+void rkt_dump_buffer(const uint8_t *ptr, char *name, int operation_nr,
+ int suboperation_nr, int offset, unsigned size);
+
+#endif /* RKT_ML_H */
diff --git a/src/gallium/drivers/rocket/rkt_regcmd.c b/src/gallium/drivers/rocket/rkt_regcmd.c
new file mode 100644
index 00000000000..be992fd5069
--- /dev/null
+++ b/src/gallium/drivers/rocket/rkt_regcmd.c
@@ -0,0 +1,544 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "rkt_regcmd.h"
+#include "rkt_ml.h"
+#include "rkt_registers.h"
+
+/*
+ * Packs one register write into a 64-bit command word and appends it to regs:
+ * target shifted to bit 48, value shifted to bit 16, register in the low bits.
+ */
+static void
+emit_raw(struct util_dynarray *regs, uint32_t target, uint32_t reg,
+         uint32_t value)
+{
+   const uint64_t target_field = (uint64_t)target << 48;
+   const uint64_t value_field = (uint64_t)value << 16;
+   const uint64_t command = target_field | value_field | (uint64_t)reg;
+
+   util_dynarray_append(regs, uint64_t, command);
+}
+
+/* Appends a write to reg, deriving the target from the register itself (plus 0x1). */
+static void
+emit(struct util_dynarray *regs, uint32_t reg, uint32_t value)
+{
+   uint32_t write_target = rkt_get_target(reg) + 0x1;
+
+   emit_raw(regs, write_target, reg, value);
+}
+
+/*
+ * Shorthand for emit() on the local `regs` array. Expands to a plain
+ * expression: the caller supplies the terminating semicolon, so the macro is
+ * safe in unbraced if/else bodies.
+ */
+#define EMIT(offset, value) emit(regs, (offset), (value))
+
+static void
+fill_first_regcmd(struct rkt_ml_subgraph *subgraph,
+ const struct rkt_operation *operation,
+ struct util_dynarray *regs, unsigned task_num)
+{ /* Appends to regs the full register command sequence for task task_num of operation. */
+ struct split_task *task =
+ util_dynarray_element(&operation->tasks, struct split_task, task_num);
+ unsigned num_tasks =
+ util_dynarray_num_elements(&operation->tasks, struct split_task);
+ unsigned output_zero_point = task->output_zero_point;
+ unsigned weights_zero_point = task->weights_zero_point;
+ unsigned offset = output_zero_point - 0x80; /* output zero point minus 0x80; written to DPU_OUT_CVT_OFFSET below */
+
+ uint32_t con0 = CNA_CBUF_CON0_WEIGHT_BANK(task->weights_banks) |
+ CNA_CBUF_CON0_DATA_BANK(task->input_banks);
+ if (task_num > 0 && operation->reuse_weights_cbuf) /* only tasks after the first request weight reuse */
+ con0 |= CNA_CBUF_CON0_WEIGHT_REUSE(1);
+
+ EMIT(REG_CNA_CBUF_CON0, con0);
+
+ EMIT(REG_CNA_DCOMP_REGNUM, 0);
+ EMIT(REG_CNA_DCOMP_CTRL, 0);
+
+ uint32_t con1 = 0x0;
+ if (task->input_channels_real == 1) { /* single-channel input gets dedicated CNA settings */
+ con1 |= CNA_CONV_CON1_NONALIGN_DMA(1) | CNA_CONV_CON1_GROUP_LINE_OFF(1) |
+ CNA_CONV_CON1_ARGB_IN(8);
+ }
+
+ if (operation->depthwise)
+ con1 |= CNA_CONV_CON1_CONV_MODE(3);
+
+ EMIT(REG_CNA_CONV_CON1, con1);
+
+ EMIT(REG_DPU_S_POINTER, DPU_S_POINTER_POINTER_PP_MODE(1) |
+ DPU_S_POINTER_EXECUTER_PP_EN(1) |
+ DPU_S_POINTER_POINTER_PP_EN(1));
+ EMIT(REG_DPU_RDMA_RDMA_S_POINTER,
+ DPU_RDMA_RDMA_S_POINTER_POINTER_PP_MODE(1) |
+ DPU_RDMA_RDMA_S_POINTER_EXECUTER_PP_EN(1) |
+ DPU_RDMA_RDMA_S_POINTER_POINTER_PP_EN(1));
+ EMIT(REG_CNA_CONV_CON1, con1); /* same value as the CNA_CONV_CON1 write above */
+ EMIT(REG_CNA_CONV_CON2,
+ CNA_CONV_CON2_FEATURE_GRAINS(
+ 50 + task->stride_y + 1)); /* Magic: Seems to pass the most tests */
+ EMIT(REG_CNA_CONV_CON3, CNA_CONV_CON3_CONV_X_STRIDE(task->stride_x) |
+ CNA_CONV_CON3_CONV_Y_STRIDE(task->stride_y));
+ EMIT(REG_CNA_DATA_SIZE0,
+ CNA_DATA_SIZE0_DATAIN_WIDTH(task->input_width) |
+ CNA_DATA_SIZE0_DATAIN_HEIGHT(task->input_height));
+
+ EMIT(REG_CNA_DATA_SIZE1,
+ CNA_DATA_SIZE1_DATAIN_CHANNEL_REAL(task->input_channels_real - 1) |
+ CNA_DATA_SIZE1_DATAIN_CHANNEL(task->input_channels));
+
+ EMIT(REG_CNA_DATA_SIZE2, CNA_DATA_SIZE2_DATAOUT_WIDTH(task->output_width));
+ EMIT(REG_CNA_DATA_SIZE3, CNA_DATA_SIZE3_DATAOUT_ATOMICS(task->atomic_count));
+ EMIT(REG_CNA_WEIGHT_SIZE0, task->weights_width * task->weights_height *
+ task->input_channels * task->weights_kernels);
+ EMIT(REG_CNA_WEIGHT_SIZE1,
+ task->weights_width * task->weights_height * task->input_channels);
+ EMIT(REG_CNA_WEIGHT_SIZE2,
+ CNA_WEIGHT_SIZE2_WEIGHT_WIDTH(task->weights_width) |
+ CNA_WEIGHT_SIZE2_WEIGHT_HEIGHT(task->weights_height) |
+ CNA_WEIGHT_SIZE2_WEIGHT_KERNELS(task->weights_kernels));
+
+ EMIT(REG_CNA_CBUF_CON0, con0); /* same value as the CNA_CBUF_CON0 write above */
+
+ EMIT(REG_CNA_CBUF_CON1, CNA_CBUF_CON1_DATA_ENTRIES(task->input_data_entries));
+
+ if (task->input_channels_real == 1) {
+ unsigned truncate = 14;
+ unsigned scale = 16384;
+ unsigned offset = 65408; /* shadows the outer offset; 65408 == 0xff80 */
+
+ if (operation->addition_input || operation->add_tensor != -1) {
+ truncate = 15;
+ scale = 32388;
+ }
+
+ EMIT(REG_CNA_CVT_CON0, CNA_CVT_CON0_CVT_TRUNCATE_3(truncate) |
+ CNA_CVT_CON0_CVT_TRUNCATE_2(truncate) |
+ CNA_CVT_CON0_CVT_TRUNCATE_1(truncate) |
+ CNA_CVT_CON0_CVT_TRUNCATE_0(truncate));
+ EMIT(REG_CNA_CVT_CON1,
+ CNA_CVT_CON1_CVT_SCALE0(scale) | CNA_CVT_CON1_CVT_OFFSET0(offset));
+ EMIT(REG_CNA_CVT_CON2,
+ CNA_CVT_CON2_CVT_SCALE1(scale) | CNA_CVT_CON2_CVT_OFFSET1(offset));
+ EMIT(REG_CNA_CVT_CON3,
+ CNA_CVT_CON3_CVT_SCALE2(scale) | CNA_CVT_CON3_CVT_OFFSET2(offset));
+ EMIT(REG_CNA_CVT_CON4,
+ CNA_CVT_CON4_CVT_SCALE3(scale) | CNA_CVT_CON4_CVT_OFFSET3(offset));
+ } else {
+ EMIT(REG_CNA_CVT_CON0, CNA_CVT_CON0_DATA_SIGN(1) |
+ CNA_CVT_CON0_CVT_TYPE(1) |
+ CNA_CVT_CON0_CVT_BYPASS(1));
+ EMIT(REG_CNA_CVT_CON1, CNA_CVT_CON1_CVT_SCALE0(1));
+ EMIT(REG_CNA_CVT_CON2, CNA_CVT_CON2_CVT_SCALE1(1));
+ EMIT(REG_CNA_CVT_CON3, CNA_CVT_CON3_CVT_SCALE2(1));
+ EMIT(REG_CNA_CVT_CON4, CNA_CVT_CON4_CVT_SCALE3(1));
+ }
+
+ EMIT(REG_CNA_FC_CON0, 0);
+ EMIT(REG_CNA_FC_CON1, 0);
+ EMIT(REG_CNA_PAD_CON0, CNA_PAD_CON0_PAD_LEFT(task->pad_left) |
+ CNA_PAD_CON0_PAD_TOP(task->pad_top));
+ EMIT(REG_CNA_FEATURE_DATA_ADDR,
+ rkt_get_tensor(subgraph, operation->input_index)->phys_addr +
+ task->input_offset);
+ EMIT(REG_CNA_FC_CON2, 0);
+ EMIT(REG_CNA_DMA_CON0,
+ CNA_DMA_CON0_WEIGHT_BURST_LEN(15) | CNA_DMA_CON0_DATA_BURST_LEN(15));
+ EMIT(REG_CNA_DMA_CON1, CNA_DMA_CON1_LINE_STRIDE(task->input_line_stride));
+ EMIT(REG_CNA_DMA_CON2, CNA_DMA_CON2_SURF_STRIDE(task->input_surface_stride));
+
+ EMIT(REG_CNA_FC_DATA_SIZE0,
+ CNA_FC_DATA_SIZE0_DMA_WIDTH(operation->input_width) |
+ CNA_FC_DATA_SIZE0_DMA_HEIGHT(task->input_height));
+
+ EMIT(REG_CNA_FC_DATA_SIZE1,
+ CNA_FC_DATA_SIZE1_DMA_CHANNEL(task->input_channels));
+ EMIT(REG_CNA_DCOMP_CTRL, 0);
+ EMIT(REG_CNA_DCOMP_REGNUM, 0);
+ EMIT(REG_CNA_DCOMP_ADDR0, rkt_resource(operation->weights)->phys_addr);
+ EMIT(REG_CNA_DCOMP_AMOUNT0, 0);
+ EMIT(REG_CNA_DCOMP_AMOUNT1, 0);
+ EMIT(REG_CNA_DCOMP_AMOUNT2, 0);
+ EMIT(REG_CNA_DCOMP_AMOUNT3, 0);
+ EMIT(REG_CNA_DCOMP_AMOUNT4, 0);
+ EMIT(REG_CNA_DCOMP_AMOUNT5, 0);
+ EMIT(REG_CNA_DCOMP_AMOUNT6, 0);
+ EMIT(REG_CNA_DCOMP_AMOUNT7, 0);
+ EMIT(REG_CNA_DCOMP_AMOUNT8, 0);
+ EMIT(REG_CNA_DCOMP_AMOUNT9, 0);
+ EMIT(REG_CNA_DCOMP_AMOUNT10, 0);
+ EMIT(REG_CNA_DCOMP_AMOUNT11, 0);
+ EMIT(REG_CNA_DCOMP_AMOUNT12, 0);
+ EMIT(REG_CNA_DCOMP_AMOUNT13, 0);
+ EMIT(REG_CNA_DCOMP_AMOUNT14, 0);
+ EMIT(REG_CNA_DCOMP_AMOUNT15, 0);
+
+ if (task->input_channels_real == 1) {
+ EMIT(REG_CNA_CVT_CON5, 65535);
+ } else {
+ EMIT(REG_CNA_CVT_CON5, 0);
+ }
+
+ int32_t pad_con1; /* the assignments below override each other in order; the last matching one wins */
+ if (task->weights_width >= 3 && task->input_zero_point == 0x0)
+ pad_con1 = 0xffff8080;
+ else
+ pad_con1 = task->input_zero_point - 0x80;
+
+ if (operation->addition_input || operation->add_tensor != -1)
+ pad_con1 = 0xffffff80;
+
+ if (operation->depthwise && task->input_zero_point == 0x8b)
+ pad_con1 = 0x0b0b;
+
+ EMIT(REG_CNA_PAD_CON1, pad_con1);
+
+ uint32_t misc_cfg = CORE_MISC_CFG_QD_EN(1);
+ if (operation->depthwise)
+ misc_cfg |= CORE_MISC_CFG_DW_EN(1);
+
+ EMIT(REG_CORE_MISC_CFG, misc_cfg);
+ EMIT(REG_CORE_DATAOUT_SIZE_0,
+ CORE_DATAOUT_SIZE_0_DATAOUT_HEIGHT(task->output_height - 1) |
+ CORE_DATAOUT_SIZE_0_DATAOUT_WIDTH(task->output_width - 1));
+ EMIT(REG_CORE_DATAOUT_SIZE_1,
+ CORE_DATAOUT_SIZE_1_DATAOUT_CHANNEL(task->output_channels - 1));
+ EMIT(REG_CORE_CLIP_TRUNCATE,
+ CORE_CLIP_TRUNCATE_CLIP_TRUNCATE(operation->truncate_bits));
+ emit_raw(regs, CORE | 0x1, 0x3030, 0); /* raw write: offset 0x3030 is used without a REG_ name */
+
+ uint32_t feat_mode_cfg =
+ DPU_FEATURE_MODE_CFG_BURST_LEN(15) | DPU_FEATURE_MODE_CFG_OUTPUT_MODE(2);
+ if (operation->depthwise)
+ feat_mode_cfg |= DPU_FEATURE_MODE_CFG_CONV_MODE(3);
+
+ EMIT(REG_DPU_FEATURE_MODE_CFG, feat_mode_cfg);
+ EMIT(REG_DPU_DATA_FORMAT, 0);
+ EMIT(REG_DPU_OFFSET_PEND, 0);
+ EMIT(REG_DPU_DST_BASE_ADDR,
+ rkt_get_tensor(subgraph, operation->output_index)->phys_addr +
+ task->output_offset);
+ EMIT(REG_DPU_DST_SURF_STRIDE,
+ DPU_DST_SURF_STRIDE_DST_SURF_STRIDE(task->output_surface_stride));
+ EMIT(REG_DPU_DATA_CUBE_WIDTH,
+ DPU_DATA_CUBE_WIDTH_WIDTH(task->output_width - 1));
+ EMIT(REG_DPU_DATA_CUBE_HEIGHT,
+ DPU_DATA_CUBE_HEIGHT_HEIGHT(task->output_height - 1));
+ EMIT(REG_DPU_DATA_CUBE_NOTCH_ADDR, 0);
+ EMIT(REG_DPU_DATA_CUBE_CHANNEL,
+ DPU_DATA_CUBE_CHANNEL_ORIG_CHANNEL(task->output_channels_real - 1) |
+ DPU_DATA_CUBE_CHANNEL_CHANNEL(task->output_channels - 1));
+ EMIT(REG_DPU_BS_CFG, DPU_BS_CFG_BS_ALU_ALGO(2) | DPU_BS_CFG_BS_ALU_SRC(1) |
+ DPU_BS_CFG_BS_RELU_BYPASS(1) |
+ DPU_BS_CFG_BS_MUL_BYPASS(1));
+ EMIT(REG_DPU_BS_ALU_CFG, 0);
+ EMIT(REG_DPU_BS_MUL_CFG, 0);
+ EMIT(REG_DPU_BS_RELUX_CMP_VALUE, 0);
+
+ if (operation->depthwise) {
+ EMIT(REG_DPU_BS_OW_CFG, DPU_BS_OW_CFG_SIZE_E_2(3) |
+ DPU_BS_OW_CFG_SIZE_E_1(3) |
+ DPU_BS_OW_CFG_SIZE_E_0(3));
+ } else {
+ EMIT(REG_DPU_BS_OW_CFG, DPU_BS_OW_CFG_SIZE_E_2(1) |
+ DPU_BS_OW_CFG_SIZE_E_1(1) |
+ DPU_BS_OW_CFG_SIZE_E_0(1));
+ }
+
+ EMIT(REG_DPU_BS_OW_OP, DPU_BS_OW_OP_OW_OP(0x80 - weights_zero_point));
+
+ EMIT(REG_DPU_WDMA_SIZE_0,
+ DPU_WDMA_SIZE_0_CHANNEL_WDMA(task->output_channels - 1));
+ EMIT(REG_DPU_WDMA_SIZE_1,
+ DPU_WDMA_SIZE_1_HEIGHT_WDMA(task->output_height - 1) |
+ DPU_WDMA_SIZE_1_WIDTH_WDMA(task->output_width - 1));
+ EMIT(REG_DPU_BN_CFG,
+ DPU_BN_CFG_BN_RELU_BYPASS(1) | DPU_BN_CFG_BN_MUL_BYPASS(1) |
+ DPU_BN_CFG_BN_ALU_BYPASS(1) | DPU_BN_CFG_BN_BYPASS(1));
+ EMIT(REG_DPU_BN_ALU_CFG, 0);
+ EMIT(REG_DPU_BN_MUL_CFG, 0);
+ EMIT(REG_DPU_BN_RELUX_CMP_VALUE, 0);
+
+ if (operation->add_tensor != -1) { /* an addition is fused into this operation */
+ EMIT(REG_DPU_EW_CFG,
+ DPU_EW_CFG_EW_CVT_TYPE(1) | DPU_EW_CFG_EW_DATA_MODE(1) |
+ DPU_EW_CFG_EDATA_SIZE(1) | DPU_EW_CFG_EW_ALU_ALGO(2) |
+ DPU_EW_CFG_EW_RELU_BYPASS(1) | DPU_EW_CFG_EW_LUT_BYPASS(1) |
+ DPU_EW_CFG_EW_OP_SRC(1));
+
+ /* See http://nvdla.org/hw/v1/ias/precision.html#element-wise */
+ EMIT(REG_DPU_EW_CVT_OFFSET_VALUE, operation->addition_offset);
+
+ float add_scale = 0.0; /* set from the hard-coded table below; stays 0.0 for any addition_scale not listed */
+ if (fabs(operation->addition_scale - 0.090192) < 0.00001) {
+ add_scale = 299.671889248;
+ } else if (fabs(operation->addition_scale - 0.399250) < 0.00001) {
+ add_scale = 1326.499209406;
+ } else if (fabs(operation->addition_scale - 0.364902) < 0.00001) {
+ add_scale = 780.34375;
+ } else if (fabs(operation->addition_scale - 0.422037) < 0.00001) {
+ add_scale = 715.5625;
+ } else if (fabs(operation->addition_scale - 0.213016) < 0.00001) {
+ add_scale = 564.6875;
+ } else if (fabs(operation->addition_scale - 0.244231) < 0.00001) {
+ add_scale = 499.796875;
+ } else if (fabs(operation->addition_scale - 0.283416) < 0.00001) {
+ add_scale = 488.203125;
+ } else if (fabs(operation->addition_scale - 0.171151) < 0.00001) {
+ add_scale = 602.90625;
+ } else if (fabs(operation->addition_scale - 0.164588) < 0.00001) {
+ add_scale = 271.921875;
+ } else if (fabs(operation->addition_scale - 0.204098) < 0.00001) {
+ add_scale = 262.90625;
+ } else if (fabs(operation->addition_scale - 0.116532) < 0.00001) {
+ add_scale = 450.140625;
+ } else if (fabs(operation->addition_scale - 0.134499) < 0.00001) {
+ add_scale = 212.1953125;
+ } else if (fabs(operation->addition_scale - 0.220141) < 0.00001) {
+ add_scale = 368.28125;
+ } else if (fabs(operation->addition_scale - 0.094560) < 0.00001) {
+ add_scale = 416.421875;
+ } else if (fabs(operation->addition_scale - 0.093230) < 0.00001) {
+ add_scale = 305.421875;
+ } else if (fabs(operation->addition_scale - 0.100618) < 0.00001) {
+ add_scale = 313.671875;
+ } else {
+ add_scale = 0.0;
+ }
+
+ uint32_t add_scale_bits = fui(add_scale);
+ /* Taken from
+ * https://github.com/pytorch/QNNPACK/blob/master/src/qnnpack/requantization.h#L130
+ */
+ unsigned add_shift = 127 + 31 - 32 - (add_scale_bits >> 23) + 16;
+
+ unsigned scale = ((add_scale_bits >> 9) & 0x7fff);
+ if (scale < 1 << 14)
+ scale |= 1 << 14;
+
+ EMIT(REG_DPU_EW_CVT_SCALE_VALUE,
+ DPU_EW_CVT_SCALE_VALUE_EW_OP_CVT_SHIFT(add_shift - 1) |
+ DPU_EW_CVT_SCALE_VALUE_EW_OP_CVT_SCALE(scale));
+
+ EMIT(REG_DPU_EW_RELUX_CMP_VALUE, 0x0);
+
+ if (fabs(operation->addition_scale - 0.213016) < 0.00001) { /* second hard-coded table: output conversion per addition_scale */
+ EMIT(REG_DPU_OUT_CVT_OFFSET, 0x4);
+ EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(25914));
+ EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24));
+ } else if (fabs(operation->addition_scale - 0.244231) < 0.00001) {
+ EMIT(REG_DPU_OUT_CVT_OFFSET, 0x1);
+ EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(28927));
+ EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24));
+ } else if (fabs(operation->addition_scale - 0.283416) < 0.00001) {
+ EMIT(REG_DPU_OUT_CVT_OFFSET, 0x6);
+ EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(26050));
+ EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24));
+ } else if (fabs(operation->addition_scale - 0.171151) < 0.00001) {
+ EMIT(REG_DPU_OUT_CVT_OFFSET, 0xfffffffd);
+ EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(28937));
+ EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24));
+ } else if (fabs(operation->addition_scale - 0.164588) < 0.00001) {
+ EMIT(REG_DPU_OUT_CVT_OFFSET, 0x1);
+ EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(24877));
+ EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(23));
+ } else if (fabs(operation->addition_scale - 0.204098) < 0.00001) {
+ EMIT(REG_DPU_OUT_CVT_OFFSET, 0x0);
+ EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(23272));
+ EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(23));
+ } else if (fabs(operation->addition_scale - 0.116532) < 0.00001) {
+ EMIT(REG_DPU_OUT_CVT_OFFSET, 0xfffffff8);
+ EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(32292));
+ EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24));
+ } else if (fabs(operation->addition_scale - 0.134499) < 0.00001) {
+ EMIT(REG_DPU_OUT_CVT_OFFSET, 0xfffffffb);
+ EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(24153));
+ EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(23));
+ } else if (fabs(operation->addition_scale - 0.220141) < 0.00001) {
+ EMIT(REG_DPU_OUT_CVT_OFFSET, 0xb);
+ EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(27655));
+ EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24));
+ } else if (fabs(operation->addition_scale - 0.094560) < 0.00001) {
+ EMIT(REG_DPU_OUT_CVT_OFFSET, 0x5);
+ EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(20432));
+ EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(23));
+ } else if (fabs(operation->addition_scale - 0.093230) < 0.00001) {
+ EMIT(REG_DPU_OUT_CVT_OFFSET, 0xffffffff);
+ EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(25449));
+ EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(23));
+ } else if (fabs(operation->addition_scale - 0.100618) < 0.00001) {
+ EMIT(REG_DPU_OUT_CVT_OFFSET, offset);
+ EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(16874));
+ EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(23));
+ } else if (fabs(operation->addition_scale - 0.422037) < 0.00001) {
+ EMIT(REG_DPU_OUT_CVT_OFFSET, 0x1);
+ EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(22559));
+ EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24));
+ } else if (fabs(operation->addition_scale - 0.364902) < 0.00001) {
+ EMIT(REG_DPU_OUT_CVT_OFFSET, 0x4);
+ EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(18589));
+ EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24));
+ } else { /* fallback for every addition_scale not listed above */
+ EMIT(REG_DPU_OUT_CVT_OFFSET, 0x6);
+ EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(27676));
+ EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(25));
+ }
+ } else { /* no fused addition: element-wise unit bypassed, output scale derived from the tensor scales */
+ EMIT(REG_DPU_EW_CFG,
+ DPU_EW_CFG_EW_RELU_BYPASS(1) | DPU_EW_CFG_EW_OP_CVT_BYPASS(1) |
+ DPU_EW_CFG_EW_LUT_BYPASS(1) | DPU_EW_CFG_EW_OP_BYPASS(1) |
+ DPU_EW_CFG_EW_BYPASS(1));
+ EMIT(REG_DPU_EW_CVT_OFFSET_VALUE, 0);
+ EMIT(REG_DPU_EW_CVT_SCALE_VALUE, DPU_EW_CVT_SCALE_VALUE_EW_OP_CVT_SCALE(1));
+ EMIT(REG_DPU_EW_RELUX_CMP_VALUE, 0);
+ EMIT(REG_DPU_OUT_CVT_OFFSET, offset);
+
+ float conv_scale =
+ (task->input_scale * task->weights_scale) / task->output_scale;
+ // DBG("conv_scale %f\n", conv_scale);
+ uint32_t scale_bits = fui(conv_scale);
+ /* Taken from
+ * https://github.com/pytorch/QNNPACK/blob/master/src/qnnpack/requantization.h#L130
+ */
+ unsigned shift = 127 + 31 - 32 - (scale_bits >> 23) + 16;
+
+ if (operation->truncate_bits > 0)
+ shift--;
+
+ unsigned scale = ((scale_bits >> 9) & 0x7fff) + 1;
+ if (scale < 1 << 14)
+ scale |= 1 << 14;
+
+ EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(scale));
+ EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(shift - 1));
+ }
+
+ EMIT(REG_DPU_EW_OP_VALUE_0, 0);
+ EMIT(REG_DPU_EW_OP_VALUE_1, 0);
+ EMIT(REG_DPU_EW_OP_VALUE_2, 0);
+ EMIT(REG_DPU_EW_OP_VALUE_3, 0);
+ EMIT(REG_DPU_EW_OP_VALUE_4, 0);
+ EMIT(REG_DPU_EW_OP_VALUE_5, 0);
+ EMIT(REG_DPU_EW_OP_VALUE_6, 0);
+ EMIT(REG_DPU_EW_OP_VALUE_7, 0);
+ EMIT(REG_DPU_SURFACE_ADD, DPU_SURFACE_ADD_SURF_ADD(task->surfaces_per_row));
+ emit_raw(regs, DPU | 0x1, 0x40c4, 0); /* raw write: offset 0x40c4 is used without a REG_ name */
+ EMIT(REG_DPU_LUT_ACCESS_CFG, 0);
+ EMIT(REG_DPU_LUT_ACCESS_DATA, 0);
+ EMIT(REG_DPU_LUT_CFG, 0);
+ EMIT(REG_DPU_LUT_INFO, 0);
+ EMIT(REG_DPU_LUT_LE_START, 0);
+ EMIT(REG_DPU_LUT_LE_END, 0);
+ EMIT(REG_DPU_LUT_LO_START, 0);
+ EMIT(REG_DPU_LUT_LO_END, 0);
+ EMIT(REG_DPU_LUT_LE_SLOPE_SCALE, 0);
+ EMIT(REG_DPU_LUT_LE_SLOPE_SHIFT, 0);
+ EMIT(REG_DPU_LUT_LO_SLOPE_SCALE, 0);
+ EMIT(REG_DPU_LUT_LO_SLOPE_SHIFT, 0);
+ EMIT(REG_DPU_RDMA_RDMA_DATA_CUBE_WIDTH,
+ DPU_RDMA_RDMA_DATA_CUBE_WIDTH_WIDTH(task->output_width - 1));
+ EMIT(REG_DPU_RDMA_RDMA_DATA_CUBE_HEIGHT,
+ DPU_RDMA_RDMA_DATA_CUBE_HEIGHT_HEIGHT(task->output_height - 1));
+ EMIT(REG_DPU_RDMA_RDMA_DATA_CUBE_CHANNEL,
+ DPU_RDMA_RDMA_DATA_CUBE_CHANNEL_CHANNEL(task->output_channels - 1));
+
+ if (operation->add_tensor != -1) {
+ EMIT(REG_DPU_RDMA_RDMA_SRC_BASE_ADDR,
+ rkt_get_tensor(subgraph, operation->add_tensor)->phys_addr +
+ task->output_offset);
+ } else {
+ EMIT(REG_DPU_RDMA_RDMA_SRC_BASE_ADDR, 0);
+ }
+
+ EMIT(REG_DPU_RDMA_RDMA_BRDMA_CFG, DPU_RDMA_RDMA_BRDMA_CFG_BRDMA_DATA_USE(1));
+ EMIT(REG_DPU_RDMA_RDMA_BS_BASE_ADDR,
+ rkt_resource(operation->biases)->phys_addr);
+ EMIT(REG_DPU_RDMA_RDMA_NRDMA_CFG, 0);
+ EMIT(REG_DPU_RDMA_RDMA_BN_BASE_ADDR, 0);
+
+ unsigned ew_stride =
+ MAX2(operation->output_width * operation->output_height, 12);
+
+ if (operation->add_tensor != -1) {
+ EMIT(REG_DPU_RDMA_RDMA_ERDMA_CFG,
+ DPU_RDMA_RDMA_ERDMA_CFG_ERDMA_DATA_MODE(1) |
+ DPU_RDMA_RDMA_ERDMA_CFG_ERDMA_DATA_SIZE(1));
+ unsigned ew_base_offset =
+ operation->output_width * operation->output_height * ATOMIC_K_SIZE;
+ EMIT(REG_DPU_RDMA_RDMA_EW_BASE_ADDR,
+ rkt_get_tensor(subgraph, operation->add_tensor)->phys_addr +
+ task->output_offset + ew_base_offset);
+ EMIT(REG_DPU_RDMA_RDMA_EW_SURF_STRIDE,
+ DPU_RDMA_RDMA_EW_SURF_STRIDE_EW_SURF_STRIDE(ew_stride));
+ } else {
+ EMIT(REG_DPU_RDMA_RDMA_ERDMA_CFG, DPU_RDMA_RDMA_ERDMA_CFG_ERDMA_DISABLE(1));
+ EMIT(REG_DPU_RDMA_RDMA_EW_BASE_ADDR, 0);
+ EMIT(REG_DPU_RDMA_RDMA_EW_SURF_STRIDE, 0);
+ }
+
+ uint32_t rdma_feat_mode_cfg = 0x0;
+
+ if (operation->add_tensor != -1) {
+ rdma_feat_mode_cfg |= DPU_RDMA_RDMA_FEATURE_MODE_CFG_BURST_LEN(15) |
+ DPU_RDMA_RDMA_FEATURE_MODE_CFG_COMB_USE(5);
+ } else {
+ rdma_feat_mode_cfg |= DPU_RDMA_RDMA_FEATURE_MODE_CFG_BURST_LEN(15) |
+ DPU_RDMA_RDMA_FEATURE_MODE_CFG_MRDMA_DISABLE(1);
+ }
+
+ if (operation->depthwise)
+ rdma_feat_mode_cfg |= DPU_RDMA_RDMA_FEATURE_MODE_CFG_CONV_MODE(3);
+
+ EMIT(REG_DPU_RDMA_RDMA_FEATURE_MODE_CFG, rdma_feat_mode_cfg);
+ EMIT(REG_DPU_RDMA_RDMA_SRC_DMA_CFG, 0);
+
+ unsigned surf_notch =
+ ew_stride +
+ task->output_width * (operation->output_height - task->output_height);
+
+ if (operation->input_width == 3) { /* hard-coded override for inputs of width 3 */
+ surf_notch = 15;
+ }
+
+ if (operation->add_tensor != -1) {
+ EMIT(REG_DPU_RDMA_RDMA_SURF_NOTCH,
+ DPU_RDMA_RDMA_SURF_NOTCH_SURF_NOTCH_ADDR(surf_notch));
+ } else {
+ EMIT(REG_DPU_RDMA_RDMA_SURF_NOTCH, 0);
+ }
+
+ EMIT(REG_DPU_RDMA_RDMA_PAD_CFG, 0);
+ EMIT(REG_DPU_RDMA_RDMA_WEIGHT,
+ DPU_RDMA_RDMA_WEIGHT_E_WEIGHT(1) | DPU_RDMA_RDMA_WEIGHT_N_WEIGHT(1) |
+ DPU_RDMA_RDMA_WEIGHT_B_WEIGHT(1) | DPU_RDMA_RDMA_WEIGHT_M_WEIGHT(1));
+
+ if (operation->add_tensor != -1) {
+ EMIT(REG_DPU_RDMA_RDMA_EW_SURF_NOTCH,
+ DPU_RDMA_RDMA_EW_SURF_NOTCH_EW_SURF_NOTCH(surf_notch));
+ } else {
+ EMIT(REG_DPU_RDMA_RDMA_EW_SURF_NOTCH, 0x0);
+ }
+
+ if (num_tasks == 1) /* a single-task operation gets an all-zero entry instead of a PC_BASE_ADDRESS write */
+ util_dynarray_append(regs, uint64_t, 0x0);
+ else
+ EMIT(REG_PC_BASE_ADDRESS, 0);
+
+ EMIT(REG_PC_REGISTER_AMOUNTS, 0);
+
+ /* TRM: before op_en, 64'h0041_xxxx_xxxx_xxxx must be set. */
+ util_dynarray_append(regs, uint64_t, 0x0041000000000000);
+
+ /* TRM: 64'h0081_0000_007f_0008 will set each block's op_en(CNA, CORE, ...,
+ * PPU_RDMA). */
+ emit_raw(regs, 0x81, REG_PC_OPERATION_ENABLE,
+ PC_OPERATION_ENABLE_RESERVED_0(14) | PC_OPERATION_ENABLE_OP_EN(1));
+}
+
+void
+rkt_fill_regcmd(struct rkt_ml_subgraph *subgraph,
+ const struct rkt_operation *operation,
+ struct util_dynarray *regs, unsigned task_num)
+{ /* Public entry point: appends the register commands for task task_num of operation to regs. */
+ /*
+ * TODO: We should only need to set all the registers on the regcmd for the first
+ * task in an operation, but for now set them all to be sure.
+ */
+ fill_first_regcmd(subgraph, operation, regs, task_num); /* every task currently gets the full register set */
+} \ No newline at end of file
diff --git a/src/gallium/drivers/rocket/rkt_regcmd.h b/src/gallium/drivers/rocket/rkt_regcmd.h
new file mode 100644
index 00000000000..ee755e78a97
--- /dev/null
+++ b/src/gallium/drivers/rocket/rkt_regcmd.h
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef RKT_REGCMD_H
+#define RKT_REGCMD_H
+
+#include "rkt_ml.h"
+
+/*
+ * Fills regs with the register commands for task number task_num of
+ * operation.
+ */
+void rkt_fill_regcmd(struct rkt_ml_subgraph *subgraph,
+ const struct rkt_operation *operation,
+ struct util_dynarray *regs, unsigned task_num);
+
+#endif /* RKT_REGCMD_H */
diff --git a/src/gallium/drivers/rocket/rkt_task.c b/src/gallium/drivers/rocket/rkt_task.c
new file mode 100644
index 00000000000..6dbb2784f40
--- /dev/null
+++ b/src/gallium/drivers/rocket/rkt_task.c
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "rkt_task.h"
+#include "rkt_ml.h"
+
+/*
+ * Number of CBUF entries taken by one input slice of the operation.
+ *
+ * The channel dimension is counted in atomics of FEATURE_ATOMIC_SIZE bytes and
+ * packed into entries of CBUF_ENTRY_SIZE bytes; the result is scaled by the
+ * input width.
+ */
+static unsigned
+calc_entries_per_slice(struct rkt_operation *operation)
+{
+   const unsigned bytes_per_element = sizeof(uint8_t);
+   const unsigned entry_atomics = CBUF_ENTRY_SIZE / FEATURE_ATOMIC_SIZE;
+   const unsigned channel_atomics = DIV_ROUND_UP(
+      operation->input_channels * bytes_per_element, FEATURE_ATOMIC_SIZE);
+   const unsigned whole_entries = channel_atomics / entry_atomics;
+   const unsigned leftover_atomics = channel_atomics % entry_atomics;
+
+   /* Entries completely filled by channel atomics. */
+   unsigned entries = whole_entries * operation->input_width;
+
+   /* Entries for the remaining atomics; a remainder of exactly 3 is
+    * special-cased to input_width entries. */
+   if (leftover_atomics == 3)
+      entries += operation->input_width;
+   else
+      entries += DIV_ROUND_UP(leftover_atomics * operation->input_width,
+                              entry_atomics);
+
+   return entries;
+}
+
+/*
+ * Number of CBUF banks needed to hold the whole input of the operation:
+ * entries per slice times input height, rounded up to whole banks.
+ */
+static unsigned
+calc_input_banks(struct rkt_operation *operation)
+{
+   unsigned total_entries =
+      calc_entries_per_slice(operation) * operation->input_height;
+
+   return DIV_ROUND_UP(total_entries, CBUF_ENTRIES_PER_BANK);
+}
+
+/*
+ * Number of CBUF banks reserved for the weights of the operation.
+ *
+ * The weight size is kernel width * height * input channels, multiplied by
+ * the output channel count unless the operation is depthwise. One extra bank
+ * is added on top of the computed amount.
+ */
+static unsigned
+calc_weights_banks(struct rkt_operation *operation)
+{
+   const unsigned bytes_per_element = sizeof(uint8_t);
+   unsigned weight_bytes = operation->weights_width *
+                           operation->weights_height *
+                           operation->input_channels * bytes_per_element;
+
+   if (!operation->depthwise)
+      weight_bytes *= operation->output_channels;
+
+   unsigned weight_entries = DIV_ROUND_UP(weight_bytes, CBUF_ENTRY_SIZE);
+   unsigned weight_banks =
+      DIV_ROUND_UP(weight_entries, CBUF_ENTRIES_PER_BANK);
+
+   /* Why do we need an extra bank? The calc above might be wrong on this HW */
+   return weight_banks + 1;
+}
+
+/* Line stride for the given width: width * ATOMIC_K_SIZE one-byte elements. */
+static unsigned
+calc_line_stride(unsigned width)
+{
+   const unsigned bytes_per_element = sizeof(uint8_t);
+
+   return width * ATOMIC_K_SIZE * bytes_per_element;
+}
+
+/*
+ * Total "same" padding needed along one axis:
+ *    (output - 1) * stride + weights - input, never less than zero.
+ *
+ * The comparison is done before subtracting so that the result cannot wrap
+ * around in unsigned arithmetic.
+ */
+static unsigned
+calc_same_padding(unsigned output, unsigned stride, unsigned weights,
+                  unsigned input)
+{
+   unsigned covered = (output - 1) * stride + weights;
+
+   return covered > input ? covered - input : 0;
+}
+
+/*
+ * Converts the implicit "same" padding of operation into explicit per-edge
+ * padding.
+ *
+ * When operation->padding_same is set and the kernel is wider than one
+ * element, the padding along each axis is split so that the leading edge
+ * (top/left) gets the rounded-down half and the trailing edge (bottom/right)
+ * gets the remainder. In every other case all four outputs are set to zero.
+ *
+ * Horizontal padding (left/right) is derived from the width and vertical
+ * padding (top/bottom) from the height.
+ */
+static void
+calc_explicit_padding(const struct rkt_operation *operation,
+                      unsigned *pad_top, unsigned *pad_bottom,
+                      unsigned *pad_left, unsigned *pad_right)
+{
+   *pad_top = 0;
+   *pad_bottom = 0;
+   *pad_left = 0;
+   *pad_right = 0;
+
+   if (!operation->padding_same || operation->weights_width <= 1)
+      return;
+
+   /* Convert from implicit to explicit padding */
+   unsigned pad_along_width =
+      calc_same_padding(operation->output_width, operation->stride,
+                        operation->weights_width, operation->input_width);
+   unsigned pad_along_height =
+      calc_same_padding(operation->output_height, operation->stride,
+                        operation->weights_height, operation->input_height);
+
+   *pad_left = pad_along_width / 2;
+   *pad_right = pad_along_width - *pad_left;
+   *pad_top = pad_along_height / 2;
+   *pad_bottom = pad_along_height - *pad_top;
+}
+
+/*
+ * Fills the fields of task that follow directly from operation: strides,
+ * input/output dimensions and quantization parameters, input line/surface
+ * strides, output surface stride, input data entries, the weights
+ * description and surfaces_per_row.
+ *
+ * subgraph is not referenced in the body.
+ */
+static void
+fill_task(struct rkt_ml_subgraph *subgraph,
+ struct rkt_operation *operation,
+ struct split_task *task)
+{
+ task->stride_x = operation->stride;
+ task->stride_y = operation->stride;
+
+ task->input_width = operation->input_width;
+ /* An input that is exactly 8 wide is treated as twice as wide when the
+ * operation takes part in an addition (addition_input set or add_tensor
+ * present). */
+ if (task->input_width == 8 &&
+ (operation->addition_input || operation->add_tensor != -1))
+ task->input_width *= 2;
+
+ task->input_height = operation->input_height;
+ /* Channel count padded up to a non-zero multiple of FEATURE_ATOMIC_SIZE;
+ * the unpadded count is kept in input_channels_real. */
+ task->input_channels =
+ ALIGN(MAX2(operation->input_channels, FEATURE_ATOMIC_SIZE),
+ FEATURE_ATOMIC_SIZE);
+ task->input_channels_real = operation->input_channels;
+ task->input_zero_point = operation->input_zero_point;
+ task->input_scale = operation->input_scale;
+
+ task->output_width = operation->output_width;
+ task->output_height = operation->output_height;
+
+ /* Output channels are padded to a multiple of 32 (at least 32). For
+ * depthwise operations the padded count is doubled when there are at most
+ * 32 real channels, and then aligned to 64. */
+ task->output_channels_real = operation->output_channels;
+ task->output_channels = ALIGN(MAX2(operation->output_channels, 32), 32);
+ if (operation->depthwise) {
+ if (task->output_channels_real <= 32)
+ task->output_channels *= 2;
+ task->output_channels = ALIGN(task->output_channels, 64);
+ }
+
+ task->output_zero_point = operation->output_zero_point;
+ task->output_scale = operation->output_scale;
+
+ /* Single-channel inputs (with more than one output channel, or taking part
+ * in an addition) use their own width/stride rules; everything else uses
+ * a quarter of the line stride. */
+ if (task->input_channels_real == 1 &&
+ (task->output_channels_real > 1 ||
+ (operation->addition_input || operation->add_tensor != -1))) {
+ task->input_width = MAX2(task->input_width, FEATURE_ATOMIC_SIZE);
+ task->input_line_stride =
+ MAX2(calc_line_stride(operation->input_width) / FEATURE_ATOMIC_SIZE,
+ FEATURE_ATOMIC_SIZE);
+
+ /* NOTE(review): this branch is only entered when input_channels_real == 1,
+ * and input_channels_real was copied from operation->input_channels
+ * above, so the input_channels == 32 test below looks unreachable here
+ * - confirm whether it was meant for the else branch. */
+ if (operation->input_channels == 32 && operation->input_width == 80) {
+ task->input_line_stride *= 4;
+ task->input_surface_stride = (float)task->input_line_stride *
+ (((float)task->input_height / 4) - 1);
+ } else
+ task->input_surface_stride =
+ (float)task->input_line_stride * (((float)task->input_height) - 1);
+ } else {
+ task->input_line_stride = calc_line_stride(operation->input_width) / 4;
+ task->input_surface_stride =
+ (float)task->input_line_stride * (((float)task->input_height / 4) - 1);
+ }
+
+ /* NOTE(review): an 8-wide input with this same addition condition was
+ * already doubled at the top of the function, so task->input_width can
+ * only be 8 here if the MAX2() with FEATURE_ATOMIC_SIZE above produced
+ * it - confirm this override is still reachable. */
+ if (task->input_width == 8 &&
+ (operation->addition_input || operation->add_tensor != -1)) {
+ task->input_line_stride /= 2;
+ task->input_surface_stride = 112;
+ }
+
+ int output_line_stride = calc_line_stride(operation->output_width);
+ task->output_surface_stride = output_line_stride * task->output_height;
+ task->output_surface_stride /= FEATURE_ATOMIC_SIZE;
+
+ /* Input data entries: width * height for single-channel inputs, a fixed 40
+ * for the 40x40-channel case, otherwise derived from the width and the
+ * number of FEATURE_ATOMIC_SIZE channel groups. */
+ if (task->input_channels_real == 1)
+ task->input_data_entries = task->input_width * task->input_height;
+ else if (task->input_width == 40 && task->input_channels_real == 40)
+ task->input_data_entries = 40;
+ else
+ task->input_data_entries = DIV_ROUND_UP(
+ task->input_width * 2 *
+ DIV_ROUND_UP(task->input_channels_real, FEATURE_ATOMIC_SIZE),
+ 8);
+
+ task->weights_width = operation->weights_width;
+ task->weights_height = operation->weights_height;
+ task->weights_zero_point = operation->weights_zero_point;
+ task->weights_scale = operation->weights_scale;
+
+ /* Depthwise uses a single kernel; otherwise one kernel per output channel,
+ * rounded up to an even count. */
+ if (operation->depthwise)
+ task->weights_kernels = 1;
+ else
+ task->weights_kernels = ALIGN(operation->output_channels, 2);
+
+ task->surfaces_per_row = task->output_width * task->output_height * 2;
+ if (operation->depthwise)
+ task->surfaces_per_row *= 2;
+}
+
+/*
+ * Splits operation into one or more split_task entries, appended to
+ * operation->tasks, and sets operation->reuse_weights_cbuf.
+ *
+ * The CBUF banks are divided between weights and input data. If the whole
+ * input fits in the banks left for it, a single task covering the full input
+ * is emitted. Otherwise the input is cut into ranges of input rows ("slices",
+ * top_slice..bottom_slice) sized to the available input banks; the overlap
+ * between consecutive tasks, the per-task dimensions and the buffer offsets
+ * are then derived in two follow-up passes.
+ */
+void
+rkt_split_tasks(struct rkt_ml_subgraph *subgraph,
+ struct rkt_operation *operation)
+{
+ /* Function mostly taken from NVDLA */
+ unsigned entries_per_slice = calc_entries_per_slice(operation);
+ unsigned input_banks_required = calc_input_banks(operation);
+ unsigned weights_banks_required = calc_weights_banks(operation);
+ unsigned available_weights_banks = weights_banks_required;
+ unsigned available_input_banks = CBUF_BANKS - weights_banks_required;
+ unsigned pad_top;
+ unsigned pad_bottom;
+ unsigned pad_left;
+ unsigned pad_right;
+
+ calc_explicit_padding(operation, &pad_top, &pad_bottom, &pad_left,
+ &pad_right);
+
+ if (weights_banks_required + 1 < CBUF_BANKS) {
+ /* Full weights, partial input */
+ operation->reuse_weights_cbuf = true;
+ } else {
+ /* Partial weights, partial input */
+ operation->reuse_weights_cbuf = false;
+ /* NOTE(review): fixed split of 7 input banks; the rationale for this
+ * value is not visible here. */
+ available_input_banks = 7;
+ available_weights_banks = CBUF_BANKS - available_input_banks;
+ }
+
+ if (input_banks_required <= available_input_banks) {
+ /* Full weights, full input */
+
+ struct split_task task = {0};
+
+ task.num = 0;
+ fill_task(subgraph, operation, &task);
+ task.input_banks = input_banks_required;
+ /* Every bank not needed by the input goes to the weights. */
+ task.weights_banks = CBUF_BANKS - task.input_banks;
+ task.input_height = operation->input_height;
+
+ task.pad_top = pad_top;
+ task.pad_bottom = pad_bottom;
+ task.pad_left = pad_left;
+ task.pad_right = pad_right;
+
+ task.atomic_count = task.output_width * task.output_height;
+
+ util_dynarray_append(&operation->tasks, struct split_task, task);
+
+ return;
+ }
+
+ /* The input does not fit: the first task starts at slice 0 and takes as
+ * many slices as fit in the available input banks. */
+ struct split_task task = {0};
+ unsigned available_slices =
+ (CBUF_ENTRIES_PER_BANK * available_input_banks) / entries_per_slice;
+
+ task.num = 0;
+ fill_task(subgraph, operation, &task);
+ task.input_banks = available_input_banks;
+ task.weights_banks = available_weights_banks;
+
+ task.top_slice = 0;
+ task.bottom_slice = available_slices - 1;
+
+ /* pad_bottom stays 0 here; it is only applied to the last task below. */
+ task.pad_top = pad_top;
+ task.pad_left = pad_left;
+ task.pad_right = pad_right;
+
+ util_dynarray_append(&operation->tasks, struct split_task, task);
+
+ /* Emit the remaining tasks. slice tracks the bottom row of a kernel
+ * window and is advanced in steps of the stride. */
+ for (unsigned slice = operation->weights_height - pad_top - 1;
+ slice < operation->input_height;) {
+ memset(&task, 0, sizeof(task));
+
+ struct split_task *prev_task = util_dynarray_element(
+ &operation->tasks, struct split_task,
+ util_dynarray_num_elements(&operation->tasks, struct split_task) - 1);
+
+ /* Step past the previous task's last slice, then back by one stride.
+ * The condition below always holds once the loop has exited. */
+ while (slice <= prev_task->bottom_slice) {
+ slice += operation->stride;
+ }
+ if (slice > prev_task->bottom_slice) {
+ slice -= operation->stride;
+ }
+
+ task.num = util_dynarray_num_elements(&operation->tasks, struct split_task);
+ fill_task(subgraph, operation, &task);
+ task.top_slice = MIN2(slice, prev_task->bottom_slice) -
+ (operation->weights_height - 1) + operation->stride;
+ task.bottom_slice = task.top_slice + available_slices - 1;
+ task.pad_left = pad_left;
+ task.pad_right = pad_right;
+
+ // check if current task is the last one
+ if (task.bottom_slice >= operation->input_height - 1) {
+ task.bottom_slice = operation->input_height - 1;
+ task.pad_bottom = pad_bottom;
+ util_dynarray_append(&operation->tasks, struct split_task, task);
+ break;
+ }
+
+ slice = task.top_slice + operation->weights_height - 1;
+ util_dynarray_append(&operation->tasks, struct split_task, task);
+ }
+
+ /* Drop the last task if it starts at or beyond the end of the input, or
+ * ends beyond the bottom-padded input. */
+ struct split_task *last_task = util_dynarray_element(
+ &operation->tasks, struct split_task,
+ util_dynarray_num_elements(&operation->tasks, struct split_task) - 1);
+ if (last_task->top_slice >= operation->input_height ||
+ last_task->bottom_slice >= (operation->input_height + pad_bottom)) {
+ (void)util_dynarray_pop_ptr(&operation->tasks, struct split_task);
+ }
+
+ // determine overlap slices between 2 split chunks
+ for (int i = 1;
+ i < util_dynarray_num_elements(&operation->tasks, struct split_task);
+ i++) {
+ struct split_task *prev_task =
+ util_dynarray_element(&operation->tasks, struct split_task, i - 1);
+ struct split_task *cur_task =
+ util_dynarray_element(&operation->tasks, struct split_task, i);
+
+ if (prev_task->bottom_slice >= cur_task->top_slice) {
+ cur_task->num_overlap_slices =
+ prev_task->bottom_slice - cur_task->top_slice + 1;
+ prev_task->num_retain_slices = cur_task->num_overlap_slices;
+ } else {
+ cur_task->num_overlap_slices = 0;
+ prev_task->num_retain_slices = 0;
+ }
+ }
+
+ /* Final pass: derive each task's convolution count, dimensions, atomic
+ * count and input/output buffer offsets. */
+ unsigned output_height_processed = 0;
+ for (int i = 0;
+ i < util_dynarray_num_elements(&operation->tasks, struct split_task);
+ i++) {
+ struct split_task *cur_task =
+ util_dynarray_element(&operation->tasks, struct split_task, i);
+
+ unsigned slice = cur_task->top_slice + (operation->weights_height - 1) -
+ cur_task->pad_top;
+
+ /* Count the kernel positions that fit in this task's slice range,
+ * including its bottom padding. */
+ while (slice <= cur_task->bottom_slice + cur_task->pad_bottom) {
+ slice += operation->stride;
+ cur_task->convolutions++;
+ }
+
+ cur_task->bottom_slice =
+ MIN2(cur_task->bottom_slice, operation->input_height - 1);
+
+ cur_task->input_height = cur_task->bottom_slice - cur_task->top_slice + 1;
+
+ cur_task->output_width = (cur_task->input_width + cur_task->pad_left +
+ cur_task->pad_right - operation->weights_width) /
+ operation->stride +
+ 1;
+ cur_task->output_height =
+ (cur_task->input_height + cur_task->pad_top + cur_task->pad_bottom -
+ operation->weights_height) /
+ operation->stride +
+ 1;
+ cur_task->atomic_count = cur_task->output_width * cur_task->output_height;
+
+ /* Offsets are expressed in line strides: the input offset skips the
+ * slices above this task, the output offset skips the output rows
+ * produced by the previous tasks. */
+ cur_task->input_offset =
+ calc_line_stride(operation->input_width) * cur_task->top_slice;
+ cur_task->output_offset =
+ calc_line_stride(operation->output_width) * output_height_processed;
+
+ cur_task->input_banks = available_input_banks;
+ cur_task->weights_banks = available_weights_banks;
+
+ output_height_processed += cur_task->output_height;
+ }
+}
diff --git a/src/gallium/drivers/rocket/rkt_task.h b/src/gallium/drivers/rocket/rkt_task.h
new file mode 100644
index 00000000000..84bb9aa577e
--- /dev/null
+++ b/src/gallium/drivers/rocket/rkt_task.h
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef RKT_TASK_H
+#define RKT_TASK_H
+
+#include "rkt_ml.h"
+
+/*
+ * Splits operation into one or more split_task entries, which are appended
+ * to operation->tasks.
+ */
+void rkt_split_tasks(struct rkt_ml_subgraph *subgraph,
+ struct rkt_operation *operation);
+
+#endif /* RKT_TASK_H */
\ No newline at end of file
diff --git a/src/gallium/drivers/rocket/rules-ng.xsd b/src/gallium/drivers/rocket/rules-ng.xsd
new file mode 100644
index 00000000000..414dee1d746
--- /dev/null
+++ b/src/gallium/drivers/rocket/rules-ng.xsd
@@ -0,0 +1,457 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<schema xmlns="http://www.w3.org/2001/XMLSchema"
+ targetNamespace="http://nouveau.freedesktop.org/"
+ xmlns:rng="http://nouveau.freedesktop.org/"
+ elementFormDefault="qualified">
+
+ <annotation>
+ <documentation>
+ An updated version of the old rules.xml file from the
+ RivaTV project. Specifications by Pekka Paalanen,
+ preliminary attempt by KoalaBR,
+ first working version by Jakob Bornecrantz.
+ For specifications, see the file rules-ng-format.txt
+ in Nouveau CVS module 'rules-ng'.
+ </documentation>
+ <documentation>Version 0.1</documentation>
+ </annotation>
+
+
+ <!-- Elements -->
+
+ <element name="database" type="rng:databaseType" />
+ <element name="import" type="rng:importType" />
+ <element name="copyright" type="rng:copyrightType" />
+ <element name="domain" type="rng:domainType" />
+ <element name="group" type="rng:groupType" />
+ <element name="use-group" type="rng:refType" />
+ <element name="array" type="rng:arrayType" />
+ <element name="stripe" type="rng:stripeType" />
+ <element name="reg64" type="rng:registerType" />
+ <element name="reg32" type="rng:registerType" />
+ <element name="reg16" type="rng:registerType" />
+ <element name="reg8" type="rng:registerType" />
+ <element name="bitset" type="rng:bitsetType" />
+ <element name="bitfield" type="rng:bitfieldType" />
+ <element name="enum" type="rng:enumType" />
+ <element name="value" type="rng:valueType" />
+
+ <!-- Copyright elements -->
+ <element name="author" type="rng:authorType" />
+ <element name="nick" type="rng:nickType" />
+ <element name="license" type="rng:docType" />
+
+ <!-- Documentation elements -->
+
+ <!-- FIXME: allowed only one per parent element -->
+ <element name="brief" type="rng:briefType" />
+
+ <element name="doc" type="rng:docType" />
+ <element name="b" type="rng:textformatType" />
+ <element name="i" type="rng:textformatType" />
+ <element name="u" type="rng:textformatType" />
+ <element name="code" type="rng:textcodeType" />
+ <element name="ul" type="rng:listType" />
+ <element name="ol" type="rng:listType" />
+ <element name="li" type="rng:listitemType" />
+
+ <!-- Copyright element types -->
+
+ <complexType name="authorType" mixed="true">
+ <annotation>
+ <documentation>
+ register database author
+ </documentation>
+ </annotation>
+ <choice minOccurs="0" maxOccurs="unbounded">
+ <element ref="rng:nick" />
+ </choice>
+ <attribute name="name" type="string" use="required" />
+ <attribute name="email" type="string" use="required" />
+ </complexType>
+
+ <complexType name="nickType">
+ <annotation>
+ <documentation>nickType</documentation>
+ </annotation>
+ <attribute name="name" type="string" use="required" />
+ </complexType>
+
+ <!-- Database element types -->
+
+ <complexType name="databaseType">
+ <annotation>
+ <documentation>databaseType</documentation>
+ </annotation>
+ <choice minOccurs="0" maxOccurs="unbounded">
+ <group ref="rng:docGroup" />
+ <group ref="rng:topGroup" />
+ </choice>
+ </complexType>
+
+ <complexType name="importType">
+ <annotation>
+ <documentation>importType</documentation>
+ </annotation>
+ <attribute name="file" type="string" use="required" />
+ </complexType>
+
+ <complexType name="copyrightType">
+ <annotation>
+ <documentation>copyrightType</documentation>
+ </annotation>
+ <choice minOccurs="0" maxOccurs="unbounded">
+ <group ref="rng:docGroup" />
+ <group ref="rng:topGroup" />
+ <element ref="rng:author" />
+ <element ref="rng:license" />
+ </choice>
+ <attribute name="year" type="nonNegativeInteger" use="optional" />
+ </complexType>
+
+ <complexType name="domainType">
+ <annotation>
+ <documentation>domainType</documentation>
+ </annotation>
+ <choice minOccurs="0" maxOccurs="unbounded">
+ <group ref="rng:docGroup" />
+ <group ref="rng:topGroup" />
+ <group ref="rng:regarrayGroup" />
+ </choice>
+ <attribute name="name" type="NMTOKEN" use="required" />
+ <attribute name="bare" type="rng:Boolean" use="optional" />
+ <attribute name="prefix" type="NMTOKENS" use="optional" />
+ <attribute name="width" type="rng:DomainWidth" use="optional" />
+ <attribute name="size" type="rng:HexOrNumber" use="optional" />
+ <attribute name="varset" type="NMTOKEN" use="optional" />
+ <attribute name="variants" type="string" use="optional" />
+ </complexType>
+
+ <complexType name="groupType">
+ <annotation>
+ <documentation>groupType</documentation>
+ </annotation>
+ <choice minOccurs="0" maxOccurs="unbounded">
+ <group ref="rng:docGroup" />
+ <group ref="rng:topGroup" />
+ <group ref="rng:regarrayGroup" />
+ </choice>
+ <attribute name="name" type="NMTOKEN" use="required" />
+ </complexType>
+
+ <complexType name="arrayType">
+ <annotation>
+ <documentation>arrayType</documentation>
+ </annotation>
+ <choice minOccurs="0" maxOccurs="unbounded">
+ <group ref="rng:docGroup" />
+ <group ref="rng:topGroup" />
+ <group ref="rng:regarrayGroup" />
+ </choice>
+ <attribute name="name" type="NMTOKEN" use="optional" />
+ <attribute name="offset" type="rng:HexOrNumber" use="optional" />
+ <attribute name="offsets" type="string" use="optional"/>
+ <attribute name="doffsets" type="string" use="optional"/>
+ <attribute name="index" type="NMTOKENS" use="optional"/>
+ <attribute name="stride" type="rng:HexOrNumber" use="required" />
+ <attribute name="length" type="rng:HexOrNumber" use="required" />
+ <attribute name="varset" type="NMTOKEN" use="optional" />
+ <attribute name="variants" type="string" use="optional" />
+ <attribute name="usage" type="string" use="optional" />
+ </complexType>
+
+ <complexType name="stripeType">
+ <annotation>
+ <documentation>stripeType</documentation>
+ </annotation>
+ <choice minOccurs="0" maxOccurs="unbounded">
+ <group ref="rng:docGroup" />
+ <group ref="rng:topGroup" />
+ <group ref="rng:regarrayGroup" minOccurs="0" />
+ </choice>
+ <attribute name="name" type="NMTOKEN" use="optional" />
+ <attribute name="offset" type="rng:HexOrNumber" use="optional" />
+ <attribute name="stride" type="rng:HexOrNumber" use="optional" />
+ <attribute name="length" type="rng:HexOrNumber" use="optional" />
+ <attribute name="varset" type="NMTOKEN" use="optional" />
+ <attribute name="variants" type="string" use="optional" />
+ <attribute name="prefix" type="NMTOKENS" use="optional" />
+ </complexType>
+
+ <complexType name="registerType">
+ <annotation>
+ <documentation>
+ registerType used by reg8, reg16, reg32, reg64
+ </documentation>
+ </annotation>
+ <choice minOccurs="0" maxOccurs="unbounded">
+ <group ref="rng:docGroup" />
+ <group ref="rng:topGroup" />
+ <element ref="rng:value" />
+ <element ref="rng:bitfield" />
+ </choice>
+ <attribute name="name" type="NMTOKEN" use="required" />
+ <attribute name="offset" type="rng:HexOrNumber" use="required" />
+ <attribute name="access" type="rng:Access" default="rw" use="optional" />
+ <attribute name="type" type="NMTOKENS" use="optional" />
+ <attribute name="shr" type="nonNegativeInteger" use="optional" />
+ <attribute name="varset" type="NMTOKEN" use="optional" />
+ <attribute name="variants" type="string" use="optional" />
+ <attribute name="stride" type="rng:HexOrNumber" use="optional" />
+ <attribute name="length" type="rng:HexOrNumber" use="optional" />
+ <attribute name="high" type="nonNegativeInteger" use="optional" />
+ <attribute name="low" type="nonNegativeInteger" use="optional" />
+ <attribute name="pos" type="nonNegativeInteger" use="optional" />
+ <attribute name="align" type="nonNegativeInteger" use="optional" />
+ <attribute name="radix" type="nonNegativeInteger" use="optional" />
+ <attribute name="usage" type="string" use="optional" />
+ </complexType>
+
+ <complexType name="bitsetType">
+ <annotation>
+ <documentation>bitsetType</documentation>
+ </annotation>
+ <choice maxOccurs="unbounded">
+ <element ref="rng:bitfield" />
+ <group ref="rng:docGroup" />
+ <group ref="rng:topGroup" />
+ </choice>
+ <attribute name="name" type="NMTOKEN" use="required" />
+ <attribute name="inline" type="rng:Boolean" use="optional" />
+ <attribute name="bare" type="rng:Boolean" use="optional" />
+ <attribute name="prefix" type="NMTOKENS" use="optional" />
+ <attribute name="varset" type="NMTOKEN" use="optional" />
+ </complexType>
+
+ <complexType name="bitfieldType">
+ <annotation>
+ <documentation>bitfieldType</documentation>
+ </annotation>
+ <choice minOccurs="0" maxOccurs="unbounded">
+ <element ref="rng:value" maxOccurs="unbounded" />
+ <group ref="rng:docGroup" />
+ <group ref="rng:topGroup" />
+ </choice>
+ <attribute name="name" type="NMTOKEN" use="required" />
+ <attribute name="high" type="nonNegativeInteger" use="optional" />
+ <attribute name="low" type="nonNegativeInteger" use="optional" />
+ <attribute name="pos" type="nonNegativeInteger" use="optional" />
+ <attribute name="radix" type="nonNegativeInteger" use="optional" />
+ <attribute name="align" type="nonNegativeInteger" use="optional" />
+ <attribute name="type" type="NMTOKENS" use="optional" />
+ <attribute name="varset" type="NMTOKEN" use="optional" />
+ <attribute name="variants" type="string" use="optional" />
+ <attribute name="addvariant" type="rng:Boolean" use="optional" />
+ <attribute name="shr" type="nonNegativeInteger" use="optional" />
+ </complexType>
+
+ <complexType name="enumType">
+ <annotation>
+ <documentation>enumType</documentation>
+ </annotation>
+ <choice maxOccurs="unbounded">
+ <element ref="rng:value" />
+ <group ref="rng:docGroup" />
+ <group ref="rng:topGroup" />
+ </choice>
+ <attribute name="name" type="NMTOKEN" use="required" />
+ <attribute name="inline" type="rng:Boolean" use="optional" />
+ <attribute name="bare" type="rng:Boolean" use="optional" />
+ <attribute name="prefix" type="NMTOKENS" use="optional" />
+ <attribute name="varset" type="NMTOKEN" use="optional" />
+ </complexType>
+
+ <complexType name="valueType">
+ <annotation>
+ <documentation>valueType</documentation>
+ </annotation>
+ <choice minOccurs="0" maxOccurs="unbounded">
+ <group ref="rng:docGroup" />
+ <group ref="rng:topGroup" />
+ </choice>
+ <attribute name="name" type="NMTOKEN" use="required" />
+ <attribute name="value" type="string" use="optional" />
+ <attribute name="varset" type="NMTOKEN" use="optional" />
+ <attribute name="variants" type="string" use="optional" />
+ </complexType>
+
+ <complexType name="refType">
+ <annotation>
+ <documentation>refType</documentation>
+ </annotation>
+ <attribute name="ref" type="NMTOKEN" use="required" />
+ </complexType>
+
+
+ <!-- Documentation element types -->
+
+ <complexType name="briefType">
+ <annotation>
+ <documentation>
+ brief documentation, no markup
+ </documentation>
+ </annotation>
+ <simpleContent>
+ <extension base="string" />
+ </simpleContent>
+ </complexType>
+
+ <complexType name="docType" mixed="true">
+ <annotation>
+ <documentation>
+ root element of documentation sub-tree
+ </documentation>
+ </annotation>
+ <choice minOccurs="0" maxOccurs="unbounded">
+ <group ref="rng:textformatGroup" />
+ <group ref="rng:listGroup" />
+ <element ref="rng:code" />
+ </choice>
+ </complexType>
+
+ <complexType name="textformatType" mixed="true">
+ <annotation>
+ <documentation>
+ for bold, underline, italics
+ </documentation>
+ </annotation>
+ <choice minOccurs="0" maxOccurs="unbounded">
+ <group ref="rng:textformatGroup" />
+ </choice>
+ </complexType>
+
+ <complexType name="textcodeType">
+ <simpleContent>
+ <extension base="string">
+ <attribute name="title" type="string" />
+ </extension>
+ </simpleContent>
+ </complexType>
+
+ <complexType name="listType">
+ <annotation>
+ <documentation>
+ definition of a list, ordered or unordered
+ </documentation>
+ </annotation>
+ <choice minOccurs="0" maxOccurs="unbounded">
+ <element ref="rng:li" />
+ </choice>
+ </complexType>
+
+ <complexType name="listitemType" mixed="true">
+ <annotation>
+ <documentation>
+ items of a list
+ </documentation>
+ </annotation>
+ <choice minOccurs="0" maxOccurs="unbounded">
+ <group ref="rng:textformatGroup" />
+ <group ref="rng:listGroup" />
+ <element ref="rng:code" />
+ </choice>
+ </complexType>
+
+
+
+ <!-- Attribute value types -->
+
+ <!-- A 0x-prefixed hex number written entirely in lower-case or entirely in
+ upper-case digits, or a single decimal digit. Mixed-case hex digits
+ match neither hex pattern. -->
+ <simpleType name="Hexadecimal">
+ <restriction base="string">
+ <pattern value="0x[0-9a-f]+" />
+ <pattern value="0x[0-9A-F]+" />
+ <pattern value="[0-9]" />
+ </restriction>
+ </simpleType>
+
+ <simpleType name="HexOrNumber">
+ <annotation>
+ <documentation>HexOrNumber</documentation>
+ </annotation>
+ <union memberTypes="rng:Hexadecimal nonNegativeInteger" />
+ </simpleType>
+
+ <simpleType name="Boolean">
+ <restriction base="string">
+ <enumeration value="true" />
+ <enumeration value="1" />
+ <enumeration value="yes" />
+ <enumeration value="false" />
+ <enumeration value="0" />
+ <enumeration value="no" />
+ </restriction>
+ </simpleType>
+
+ <simpleType name="Access">
+ <annotation>
+ <documentation>Access</documentation>
+ </annotation>
+ <restriction base="string">
+ <enumeration value="r" />
+ <enumeration value="w" />
+ <enumeration value="rw" />
+ </restriction>
+ </simpleType>
+
+ <simpleType name="DomainWidth">
+ <annotation>
+ <documentation>DomainWidth</documentation>
+ </annotation>
+ <restriction base="string">
+ <enumeration value="8" />
+ <enumeration value="16" />
+ <enumeration value="32" />
+ <enumeration value="64" />
+ </restriction>
+ </simpleType>
+
+
+
+ <!-- Element groups -->
+
+ <group name="topGroup">
+ <choice>
+ <element ref="rng:copyright" />
+ <element ref="rng:domain" />
+ <element ref="rng:enum" />
+ <element ref="rng:group" />
+ <element ref="rng:bitset" />
+ <element ref="rng:import" />
+ </choice>
+ </group>
+
+ <group name="regarrayGroup">
+ <choice>
+ <element ref="rng:reg64" />
+ <element ref="rng:reg32" />
+ <element ref="rng:reg16" />
+ <element ref="rng:reg8" />
+ <element ref="rng:array" />
+ <element ref="rng:stripe" />
+ <element ref="rng:use-group" />
+ </choice>
+ </group>
+
+ <group name="docGroup">
+ <choice>
+ <element ref="rng:brief" />
+ <element ref="rng:doc" />
+ </choice>
+ </group>
+
+ <group name="textformatGroup">
+ <choice>
+ <element ref="rng:b" />
+ <element ref="rng:i" />
+ <element ref="rng:u" />
+ </choice>
+ </group>
+
+ <group name="listGroup">
+ <choice>
+ <element ref="rng:ul" />
+ <element ref="rng:ol" />
+ </choice>
+ </group>
+
+</schema>
diff --git a/src/gallium/meson.build b/src/gallium/meson.build
index c26e98e6f54..6ba60851984 100644
--- a/src/gallium/meson.build
+++ b/src/gallium/meson.build
@@ -185,6 +185,12 @@ if with_gallium_lima
else
driver_lima = declare_dependency()
endif
+if with_gallium_rocket
+ subdir('winsys/rocket/drm')
+ subdir('drivers/rocket')
+else
+ driver_rocket = declare_dependency()
+endif
if with_gallium_zink
subdir('drivers/zink')
else
diff --git a/src/gallium/targets/dri/meson.build b/src/gallium/targets/dri/meson.build
index 5ecdc7be6e7..6d3ccafc63e 100644
--- a/src/gallium/targets/dri/meson.build
+++ b/src/gallium/targets/dri/meson.build
@@ -62,7 +62,7 @@ libgallium_dri = shared_library(
driver_kmsro, driver_v3d, driver_vc4, driver_freedreno, driver_etnaviv,
driver_tegra, driver_i915, driver_svga, driver_virgl,
driver_panfrost, driver_iris, driver_lima, driver_zink, driver_d3d12,
- driver_asahi, driver_crocus
+ driver_asahi, driver_crocus, driver_rocket
],
install : true,
name_suffix : libname_suffix,
diff --git a/src/gallium/targets/dril/meson.build b/src/gallium/targets/dril/meson.build
index 556047c2638..80cbef3f039 100644
--- a/src/gallium/targets/dril/meson.build
+++ b/src/gallium/targets/dril/meson.build
@@ -124,7 +124,8 @@ foreach d : [[with_gallium_kmsro, [
[with_gallium_lima, 'lima_dri.so'],
[with_gallium_d3d12, 'd3d12_dri.so'],
[with_gallium_zink, 'zink_dri.so'],
- [with_gallium_asahi, 'asahi_dri.so']]
+ [with_gallium_asahi, 'asahi_dri.so'],
+ [with_gallium_rocket, 'rocket_dri.so']]
if d[0]
dril_drivers += d[1]
endif
diff --git a/src/gallium/winsys/rocket/drm/meson.build b/src/gallium/winsys/rocket/drm/meson.build
new file mode 100644
index 00000000000..55f65803810
--- /dev/null
+++ b/src/gallium/winsys/rocket/drm/meson.build
@@ -0,0 +1,13 @@
+# Copyright 2017 Broadcom
+# SPDX-License-Identifier: MIT
+
+# Static library holding the rocket DRM winsys, built from rkt_drm_winsys.c.
+librocketwinsys = static_library(
+ 'rocketwinsys',
+ files('rkt_drm_winsys.c'),
+ include_directories : [
+ inc_src, inc_include,
+ inc_gallium, inc_gallium_aux, inc_gallium_drivers,
+ ],
+ gnu_symbol_visibility : 'hidden',
+ dependencies: [idep_mesautil],
+)
diff --git a/src/gallium/winsys/rocket/drm/rkt_drm_public.h b/src/gallium/winsys/rocket/drm/rkt_drm_public.h
new file mode 100644
index 00000000000..5138801758f
--- /dev/null
+++ b/src/gallium/winsys/rocket/drm/rkt_drm_public.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright 2014 Broadcom
+ * Copyright 2018 Alyssa Rosenzweig
+ * Copyright 2025 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef __RKT_DRM_PUBLIC_H__
+#define __RKT_DRM_PUBLIC_H__
+
+struct pipe_screen;
+struct pipe_screen_config;
+
+/*
+ * Returns the pipe_screen for the device behind the DRM file descriptor
+ * drmFD, configured with config.
+ */
+struct pipe_screen *
+rkt_drm_screen_create(int drmFD, const struct pipe_screen_config *config);
+
+#endif /* __RKT_DRM_PUBLIC_H__ */
diff --git a/src/gallium/winsys/rocket/drm/rkt_drm_winsys.c b/src/gallium/winsys/rocket/drm/rkt_drm_winsys.c
new file mode 100644
index 00000000000..cbba3534e1a
--- /dev/null
+++ b/src/gallium/winsys/rocket/drm/rkt_drm_winsys.c
@@ -0,0 +1,19 @@
+/*
+ * Copyright 2014 Broadcom
+ * Copyright 2018 Alyssa Rosenzweig
+ * Copyright 2025 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "util/os_file.h"
+#include "util/u_screen.h"
+
+#include "rocket/rkt_device.h"
+#include "rkt_drm_public.h"
+
+/*
+ * Winsys entry point: returns the pipe_screen for the device behind fd.
+ *
+ * fd is duplicated with close-on-exec set and the duplicate is handed to
+ * u_pipe_screen_lookup_or_create(), with rkt_screen_create as the function
+ * used to build a new screen. Returns NULL if the descriptor cannot be
+ * duplicated.
+ */
+struct pipe_screen *
+rkt_drm_screen_create(int fd, const struct pipe_screen_config *config)
+{
+   int dup_fd = os_dupfd_cloexec(fd);
+
+   /* os_dupfd_cloexec() reports failure with a negative value; do not pass
+    * an invalid descriptor on to the screen lookup. */
+   if (dup_fd < 0)
+      return NULL;
+
+   return u_pipe_screen_lookup_or_create(dup_fd, config, NULL,
+                                         rkt_screen_create);
+}