diff options
33 files changed, 6064 insertions, 7 deletions
diff --git a/.clang-format-include b/.clang-format-include index d40db0e1d0b..9f2d1dd1977 100644 --- a/.clang-format-include +++ b/.clang-format-include @@ -3,6 +3,7 @@ src/gallium/drivers/i915 src/gallium/drivers/r300/compiler/* +src/gallium/drivers/rocket/**/* src/gallium/targets/teflon/**/* src/gallium/frontends/teflon/**/* src/amd/vulkan/**/* diff --git a/docs/teflon.rst b/docs/teflon.rst index 3935207ce39..ffed65cf759 100644 --- a/docs/teflon.rst +++ b/docs/teflon.rst @@ -15,6 +15,9 @@ Mesa contains a TensorFlow Lite delegate that can make use of NPUs to accelerate * - Etnaviv - ``VeriSilicon VIPNano-SI+.8002`` - ``NXP iMX8M Plus on Toradex Verdin SoM`` + * - Rocket + - ``RK3588 NPU`` + - ``PINE64 QuartzPro64`` .. list-table:: Tested models :header-rows: 1 @@ -25,29 +28,33 @@ Mesa contains a TensorFlow Lite delegate that can make use of NPUs to accelerate - Status - Inference speed on AML-A311D-CC Alta - Inference speed on Verdin iMX8M Plus + - Inference speed on QuartzPro64 * - MobileNet V1 - UINT8 - http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz - Fully supported - ~6.6 ms - ~7.9 ms + - ~18 ms * - MobileNet V2 - UINT8 - https://storage.googleapis.com/mobilenet_v2/checkpoints/quantized_v2_224_100.tgz - Fully supported - ~6.9 ms - ~8.0 ms + - ~21 ms * - SSDLite MobileDet - UINT8 - https://raw.githubusercontent.com/google-coral/test_data/master/ssdlite_mobiledet_coco_qat_postprocess.tflite - Fully supported - ~24.8 ms - ~24.4 ms + - ~48 ms Build ----- -Build Mesa as usual, with the -Dteflon=true argument. +Build Mesa as usual, with the -Dteflon=true argument. Make sure at least one of etnaviv or rocket gallium drivers is enabled, as Teflon only works with these drivers. 
Example instructions: @@ -62,7 +69,7 @@ Example instructions: # Build Mesa ~ $ cd mesa - mesa $ meson setup build -Dgallium-drivers=etnaviv -Dvulkan-drivers= -Dteflon=true + mesa $ meson setup build -Dgallium-drivers=etnaviv,rocket -Dvulkan-drivers= -Dteflon=true mesa $ meson compile -C build Install runtime dependencies @@ -99,7 +106,7 @@ This example script has been based from the code in https://github.com/tensorflo ~ $ cd mesa/ mesa $ TEFLON_DEBUG=verbose ETNA_MESA_DEBUG=ml_dbgs python3.10 src/gallium/frontends/teflon/tests/classification.py \ -i ~/tensorflow/assets/grace_hopper.bmp \ - -m src/gallium/targets/teflon/tests/mobilenet_v1_1.0_224_quant.tflite \ + -m src/gallium/targets/teflon/tests/models/mobilenetv1/mobilenet_v1_1_224_quant.tflite \ -l src/gallium/frontends/teflon/tests/labels_mobilenet_quant_v1_224.txt \ -e build/src/gallium/targets/teflon/libteflon.so diff --git a/include/drm-uapi/rknpu_ioctl.h b/include/drm-uapi/rknpu_ioctl.h new file mode 100644 index 00000000000..54d79636b16 --- /dev/null +++ b/include/drm-uapi/rknpu_ioctl.h @@ -0,0 +1,314 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) Fuzhou Rockchip Electronics Co.Ltd + * Author: Felix Zeng <felix.zeng@rock-chips.com> + */ + +#ifndef __LINUX_RKNPU_IOCTL_H +#define __LINUX_RKNPU_IOCTL_H + +#include <linux/ioctl.h> +#include <linux/types.h> + +#if !defined(__KERNEL__) +#define __user +#endif + +#ifndef __packed +#define __packed __attribute__((packed)) +#endif + +#define RKNPU_OFFSET_VERSION 0x0 +#define RKNPU_OFFSET_PC_OP_EN 0x8 +#define RKNPU_OFFSET_PC_DATA_ADDR 0x10 +#define RKNPU_OFFSET_PC_DATA_AMOUNT 0x14 +#define RKNPU_OFFSET_PC_TASK_CONTROL 0x30 +#define RKNPU_OFFSET_PC_DMA_BASE_ADDR 0x34 +#define RKNPU_OFFSET_PC_TASK_STATUS 0x3c + +#define RKNPU_OFFSET_INT_MASK 0x20 +#define RKNPU_OFFSET_INT_CLEAR 0x24 +#define RKNPU_OFFSET_INT_STATUS 0x28 +#define RKNPU_OFFSET_INT_RAW_STATUS 0x2c + +#define RKNPU_OFFSET_CLR_ALL_RW_AMOUNT 0x8010 +#define RKNPU_OFFSET_DT_WR_AMOUNT 
0x8034 +#define RKNPU_OFFSET_DT_RD_AMOUNT 0x8038 +#define RKNPU_OFFSET_WT_RD_AMOUNT 0x803c + +#define RKNPU_OFFSET_ENABLE_MASK 0xf008 + +#define RKNPU_INT_CLEAR 0x1ffff + +#define RKNPU_PC_DATA_EXTRA_AMOUNT 4 + +#define RKNPU_STR_HELPER(x) #x + +#define RKNPU_GET_DRV_VERSION_STRING(MAJOR, MINOR, PATCHLEVEL) \ + RKNPU_STR_HELPER(MAJOR) \ + "." RKNPU_STR_HELPER(MINOR) "." RKNPU_STR_HELPER(PATCHLEVEL) +#define RKNPU_GET_DRV_VERSION_CODE(MAJOR, MINOR, PATCHLEVEL) \ + (MAJOR * 10000 + MINOR * 100 + PATCHLEVEL) +#define RKNPU_GET_DRV_VERSION_MAJOR(CODE) (CODE / 10000) +#define RKNPU_GET_DRV_VERSION_MINOR(CODE) ((CODE % 10000) / 100) +#define RKNPU_GET_DRV_VERSION_PATCHLEVEL(CODE) (CODE % 100) + +/* memory type definitions. */ +enum e_rknpu_mem_type { + /* physically continuous memory and used as default. */ + RKNPU_MEM_CONTIGUOUS = 0 << 0, + /* physically non-continuous memory. */ + RKNPU_MEM_NON_CONTIGUOUS = 1 << 0, + /* non-cacheable mapping and used as default. */ + RKNPU_MEM_NON_CACHEABLE = 0 << 1, + /* cacheable mapping. */ + RKNPU_MEM_CACHEABLE = 1 << 1, + /* write-combine mapping. */ + RKNPU_MEM_WRITE_COMBINE = 1 << 2, + /* dma attr kernel mapping */ + RKNPU_MEM_KERNEL_MAPPING = 1 << 3, + /* iommu mapping */ + RKNPU_MEM_IOMMU = 1 << 4, + /* zero mapping */ + RKNPU_MEM_ZEROING = 1 << 5, + /* allocate secure buffer */ + RKNPU_MEM_SECURE = 1 << 6, + /* allocate from non-dma32 zone */ + RKNPU_MEM_NON_DMA32 = 1 << 7, + RKNPU_MEM_MASK = RKNPU_MEM_NON_CONTIGUOUS | RKNPU_MEM_CACHEABLE | + RKNPU_MEM_WRITE_COMBINE | RKNPU_MEM_KERNEL_MAPPING | + RKNPU_MEM_IOMMU | RKNPU_MEM_ZEROING | + RKNPU_MEM_SECURE | RKNPU_MEM_NON_DMA32 +}; + +/* sync mode definitions. */ +enum e_rknpu_mem_sync_mode { + RKNPU_MEM_SYNC_TO_DEVICE = 1 << 0, + RKNPU_MEM_SYNC_FROM_DEVICE = 1 << 1, + RKNPU_MEM_SYNC_MASK = + RKNPU_MEM_SYNC_TO_DEVICE | RKNPU_MEM_SYNC_FROM_DEVICE +}; + +/* job mode definitions. 
*/ +enum e_rknpu_job_mode { + RKNPU_JOB_SLAVE = 0 << 0, + RKNPU_JOB_PC = 1 << 0, + RKNPU_JOB_BLOCK = 0 << 1, + RKNPU_JOB_NONBLOCK = 1 << 1, + RKNPU_JOB_PINGPONG = 1 << 2, + RKNPU_JOB_FENCE_IN = 1 << 3, + RKNPU_JOB_FENCE_OUT = 1 << 4, + RKNPU_JOB_MASK = RKNPU_JOB_PC | RKNPU_JOB_NONBLOCK | + RKNPU_JOB_PINGPONG | RKNPU_JOB_FENCE_IN | + RKNPU_JOB_FENCE_OUT +}; + +/* action definitions */ +enum e_rknpu_action { + RKNPU_GET_HW_VERSION = 0, + RKNPU_GET_DRV_VERSION = 1, + RKNPU_GET_FREQ = 2, + RKNPU_SET_FREQ = 3, + RKNPU_GET_VOLT = 4, + RKNPU_SET_VOLT = 5, + RKNPU_ACT_RESET = 6, + RKNPU_GET_BW_PRIORITY = 7, + RKNPU_SET_BW_PRIORITY = 8, + RKNPU_GET_BW_EXPECT = 9, + RKNPU_SET_BW_EXPECT = 10, + RKNPU_GET_BW_TW = 11, + RKNPU_SET_BW_TW = 12, + RKNPU_ACT_CLR_TOTAL_RW_AMOUNT = 13, + RKNPU_GET_DT_WR_AMOUNT = 14, + RKNPU_GET_DT_RD_AMOUNT = 15, + RKNPU_GET_WT_RD_AMOUNT = 16, + RKNPU_GET_TOTAL_RW_AMOUNT = 17, + RKNPU_GET_IOMMU_EN = 18, + RKNPU_SET_PROC_NICE = 19, + RKNPU_POWER_ON = 20, + RKNPU_POWER_OFF = 21, +}; + +/** + * User-desired buffer creation information structure. + * + * @handle: The handle of the created GEM object. + * @flags: user request for setting memory type or cache attributes. + * @size: user-desired memory allocation size. + * - this size value would be page-aligned internally. + * @obj_addr: address of RKNPU memory object. + * @dma_addr: dma address accessed by rknpu. + */ +struct rknpu_mem_create { + __u32 handle; + __u32 flags; + __u64 size; + __u64 obj_addr; + __u64 dma_addr; +}; + +/** + * A structure for getting a fake-offset that can be used with mmap. + * + * @handle: handle of gem object. + * @reserved: just padding to be 64-bit aligned. + * @offset: a fake-offset of gem object. + */ +struct rknpu_mem_map { + __u32 handle; + __u32 reserved; + __u64 offset; +}; + +/** + * For destroying DMA buffer + * + * @handle: handle of the buffer. + * @reserved: reserved for padding. + * @obj_addr: rknpu_mem_object addr. 
+ */ +struct rknpu_mem_destroy { + __u32 handle; + __u32 reserved; + __u64 obj_addr; +}; + +/** + * For synchronizing DMA buffer + * + * @flags: user request for setting memory type or cache attributes. + * @reserved: reserved for padding. + * @obj_addr: address of RKNPU memory object. + * @offset: offset in bytes from start address of buffer. + * @size: size of memory region. + * + */ +struct rknpu_mem_sync { + __u32 flags; + __u32 reserved; + __u64 obj_addr; + __u64 offset; + __u64 size; +}; + +/** + * struct rknpu_task structure for task information + * + * @flags: flags for task + * @op_idx: operator index + * @enable_mask: enable mask + * @int_mask: interrupt mask + * @int_clear: interrupt clear + * @int_status: interrupt status + * @regcfg_amount: register config number + * @regcfg_offset: offset for register config + * @regcmd_addr: address for register command + * + */ +struct rknpu_task { + __u32 flags; + __u32 op_idx; + __u32 enable_mask; + __u32 int_mask; + __u32 int_clear; + __u32 int_status; + __u32 regcfg_amount; + __u32 regcfg_offset; + __u64 regcmd_addr; +} __packed; + +/** + * struct rknpu_subcore_task structure for subcore task index + * + * @task_start: task start index + * @task_number: task number + * + */ +struct rknpu_subcore_task { + __u32 task_start; + __u32 task_number; +}; + +/** + * struct rknpu_submit structure for job submit + * + * @flags: flags for job submit + * @timeout: submit timeout + * @task_start: task start index + * @task_number: task number + * @task_counter: task counter + * @priority: submit priority + * @task_obj_addr: address of task object + * @regcfg_obj_addr: address of register config object + * @task_base_addr: task base address + * @user_data: (optional) user data + * @core_mask: core mask of rknpu + * @fence_fd: dma fence fd + * @subcore_task: subcore task + * + */ +struct rknpu_submit { + __u32 flags; + __u32 timeout; + __u32 task_start; + __u32 task_number; + __u32 task_counter; + __s32 priority; + __u64 
task_obj_addr; + __u64 regcfg_obj_addr; + __u64 task_base_addr; + __u64 user_data; + __u32 core_mask; + __s32 fence_fd; + struct rknpu_subcore_task subcore_task[5]; +}; + +/** + * struct rknpu_action structure for action (GET, SET or ACT) + * + * @flags: flags for action + * @value: GET or SET value + * + */ +struct rknpu_action { + __u32 flags; + __u32 value; +}; + +#define RKNPU_ACTION 0x00 +#define RKNPU_SUBMIT 0x01 +#define RKNPU_MEM_CREATE 0x02 +#define RKNPU_MEM_MAP 0x03 +#define RKNPU_MEM_DESTROY 0x04 +#define RKNPU_MEM_SYNC 0x05 + +#define RKNPU_IOC_MAGIC 'r' +#define RKNPU_IOW(nr, type) _IOW(RKNPU_IOC_MAGIC, nr, type) +#define RKNPU_IOR(nr, type) _IOR(RKNPU_IOC_MAGIC, nr, type) +#define RKNPU_IOWR(nr, type) _IOWR(RKNPU_IOC_MAGIC, nr, type) + +#include <drm.h> + +#define DRM_IOCTL_RKNPU_ACTION \ + DRM_IOWR(DRM_COMMAND_BASE + RKNPU_ACTION, struct rknpu_action) +#define DRM_IOCTL_RKNPU_SUBMIT \ + DRM_IOWR(DRM_COMMAND_BASE + RKNPU_SUBMIT, struct rknpu_submit) +#define DRM_IOCTL_RKNPU_MEM_CREATE \ + DRM_IOWR(DRM_COMMAND_BASE + RKNPU_MEM_CREATE, struct rknpu_mem_create) +#define DRM_IOCTL_RKNPU_MEM_MAP \ + DRM_IOWR(DRM_COMMAND_BASE + RKNPU_MEM_MAP, struct rknpu_mem_map) +#define DRM_IOCTL_RKNPU_MEM_DESTROY \ + DRM_IOWR(DRM_COMMAND_BASE + RKNPU_MEM_DESTROY, struct rknpu_mem_destroy) +#define DRM_IOCTL_RKNPU_MEM_SYNC \ + DRM_IOWR(DRM_COMMAND_BASE + RKNPU_MEM_SYNC, struct rknpu_mem_sync) + +#define IOCTL_RKNPU_ACTION RKNPU_IOWR(RKNPU_ACTION, struct rknpu_action) +#define IOCTL_RKNPU_SUBMIT RKNPU_IOWR(RKNPU_SUBMIT, struct rknpu_submit) +#define IOCTL_RKNPU_MEM_CREATE \ + RKNPU_IOWR(RKNPU_MEM_CREATE, struct rknpu_mem_create) +#define IOCTL_RKNPU_MEM_MAP RKNPU_IOWR(RKNPU_MEM_MAP, struct rknpu_mem_map) +#define IOCTL_RKNPU_MEM_DESTROY \ + RKNPU_IOWR(RKNPU_MEM_DESTROY, struct rknpu_mem_destroy) +#define IOCTL_RKNPU_MEM_SYNC RKNPU_IOWR(RKNPU_MEM_SYNC, struct rknpu_mem_sync) + +#endif diff --git a/include/drm-uapi/rocket_accel.h b/include/drm-uapi/rocket_accel.h new file 
mode 100644 index 00000000000..14b2e12b7c4 --- /dev/null +++ b/include/drm-uapi/rocket_accel.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2024 Tomeu Vizoso + */ +#ifndef __DRM_UAPI_ROCKET_ACCEL_H__ +#define __DRM_UAPI_ROCKET_ACCEL_H__ + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_ROCKET_CREATE_BO 0x00 +#define DRM_ROCKET_SUBMIT 0x01 +#define DRM_ROCKET_PREP_BO 0x02 +#define DRM_ROCKET_FINI_BO 0x03 + +#define DRM_IOCTL_ROCKET_CREATE_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_ROCKET_CREATE_BO, struct drm_rocket_create_bo) +#define DRM_IOCTL_ROCKET_SUBMIT DRM_IOW(DRM_COMMAND_BASE + DRM_ROCKET_SUBMIT, struct drm_rocket_submit) +#define DRM_IOCTL_ROCKET_PREP_BO DRM_IOW(DRM_COMMAND_BASE + DRM_ROCKET_PREP_BO, struct drm_rocket_prep_bo) +#define DRM_IOCTL_ROCKET_FINI_BO DRM_IOW(DRM_COMMAND_BASE + DRM_ROCKET_FINI_BO, struct drm_rocket_fini_bo) + +/** + * struct drm_rocket_create_bo - ioctl argument for creating Rocket BOs. + * + */ +struct drm_rocket_create_bo { + /** Input: Size of the requested BO. */ + __u32 size; + + /** Output: GEM handle for the BO. */ + __u32 handle; + + /** + * Output: DMA address for the BO in the NPU address space. This address + * is private to the DRM fd and is valid for the lifetime of the GEM + * handle. + */ + __u64 dma_address; + + /** Output: Offset into the drm node to use for subsequent mmap call. */ + __u64 offset; +}; + +/** + * struct drm_rocket_prep_bo - ioctl argument for starting CPU ownership of the BO. + * + * Takes care of waiting for any NPU jobs that might still use the BO and performs cache + * synchronization. + */ +struct drm_rocket_prep_bo { + /** Input: GEM handle of the buffer object. */ + __u32 handle; + + /** Reserved, must be zero. */ + __u32 reserved; + + /** Input: Amount of time to wait for NPU jobs. */ + __s64 timeout_ns; +}; + +/** + * struct drm_rocket_fini_bo - ioctl argument for finishing CPU ownership of the BO. 
+ * + * Synchronize caches for NPU access. + */ +struct drm_rocket_fini_bo { + /** Input: GEM handle of the buffer object. */ + __u32 handle; + + /** Reserved, must be zero. */ + __u32 reserved; +}; + +/** + * struct drm_rocket_task - A task to be run on the NPU + * + * A task is the smallest unit of work that can be run on the NPU. + */ +struct drm_rocket_task { + /** Input: DMA address to NPU mapping of register command buffer */ + __u32 regcmd; + + /** Input: Number of commands in the register command buffer */ + __u32 regcmd_count; +}; + +/** + * struct drm_rocket_job - A job to be run on the NPU + * + * The kernel will schedule the execution of this job taking into account its + * dependencies with other jobs. All tasks in the same job will be executed + * sequentially on the same core, to benefit from memory residency in SRAM. + */ +struct drm_rocket_job { + /** Input: Pointer to an array of struct drm_rocket_task. */ + __u64 tasks; + + /** Input: Pointer to a u32 array of the BOs that are read by the job. */ + __u64 in_bo_handles; + + /** Input: Pointer to a u32 array of the BOs that are written to by the job. */ + __u64 out_bo_handles; + + /** Input: Number of tasks passed in. */ + __u32 task_count; + + /** Input: Size in bytes of the structs in the @tasks field. */ + __u32 task_struct_size; + + /** Input: Number of input BO handles passed in (size is that times 4). */ + __u32 in_bo_handle_count; + + /** Input: Number of output BO handles passed in (size is that times 4). */ + __u32 out_bo_handle_count; +}; + +/** + * struct drm_rocket_submit - ioctl argument for submitting commands to the NPU. + * + * The kernel will schedule the execution of these jobs in dependency order. + */ +struct drm_rocket_submit { + /** Input: Pointer to an array of struct drm_rocket_job. */ + __u64 jobs; + + /** Input: Number of jobs passed in. */ + __u32 job_count; + + /** Input: Size in bytes of the structs in the @jobs field. 
*/ + __u32 job_struct_size; + + /** Reserved, must be zero. */ + __u64 reserved; +}; + +#if defined(__cplusplus) +} +#endif + +#endif /* __DRM_UAPI_ROCKET_ACCEL_H__ */ diff --git a/meson.build b/meson.build index d1a5cc2cb4c..3be192356ec 100644 --- a/meson.build +++ b/meson.build @@ -181,7 +181,7 @@ elif gallium_drivers.contains('all') gallium_drivers = [ 'r300', 'r600', 'radeonsi', 'crocus', 'v3d', 'vc4', 'freedreno', 'etnaviv', 'i915', 'nouveau', 'svga', 'tegra', 'virgl', 'lima', 'panfrost', 'llvmpipe', 'softpipe', 'iris', - 'zink', 'd3d12', 'asahi' + 'zink', 'd3d12', 'asahi', 'rocket' ] endif @@ -208,6 +208,7 @@ with_gallium_lima = gallium_drivers.contains('lima') with_gallium_zink = gallium_drivers.contains('zink') with_gallium_d3d12 = gallium_drivers.contains('d3d12') with_gallium_asahi = gallium_drivers.contains('asahi') +with_gallium_rocket = gallium_drivers.contains('rocket') foreach gallium_driver : gallium_drivers pre_args += '-DHAVE_@0@'.format(gallium_driver.to_upper()) endforeach diff --git a/meson.options b/meson.options index cd0e56cc429..51a644ad310 100644 --- a/meson.options +++ b/meson.options @@ -82,7 +82,7 @@ option( 'all', 'auto', 'asahi', 'crocus', 'd3d12', 'etnaviv', 'freedreno', 'i915', 'iris', 'lima', 'llvmpipe', 'nouveau', 'panfrost', 'r300', 'r600', 'radeonsi', - 'softpipe', 'svga', 'tegra', 'v3d', 'vc4', 'virgl', 'zink', + 'rocket', 'softpipe', 'svga', 'tegra', 'v3d', 'vc4', 'virgl', 'zink', ], description : 'List of gallium drivers to build. 
If this is set to auto ' + 'all drivers applicable to the target OS/architecture ' + diff --git a/src/gallium/drivers/rocket/ci/rocket-rk3588-fails.txt b/src/gallium/drivers/rocket/ci/rocket-rk3588-fails.txt new file mode 100644 index 00000000000..a86f73b27a7 --- /dev/null +++ b/src/gallium/drivers/rocket/ci/rocket-rk3588-fails.txt @@ -0,0 +1,126 @@ +Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail +Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail +Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_1_stride_1_padding_same_0_is_signed_0,Fail +Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_1_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail +Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail +Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail +Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail +Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail 
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail +Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail +Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail +Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_3_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail +Add.Op/input_size_3_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_3_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail +Add.Op/input_size_3_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_3_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail +Add.Op/input_size_3_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_3_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail +Add.Op/input_size_3_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_3_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_3_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_5_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail 
+Add.Op/input_size_5_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_5_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail +Add.Op/input_size_5_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_5_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail +Add.Op/input_size_5_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_5_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail +Add.Op/input_size_5_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_5_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_5_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail +Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail +Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_1_stride_1_padding_same_0_is_signed_0,Fail +Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_1_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail +Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail 
+Add.Op/input_size_80_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail +Add.Op/input_size_80_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_80_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail +Add.Op/input_size_80_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail +Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail +Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail +Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail +Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail +Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail +Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail 
+Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail +Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_8_weight_size_3_input_channels_120_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail +Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail +Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_8_weight_size_5_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_8_weight_size_5_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_1_stride_1_padding_same_0_is_signed_0,Fail +Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_1_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail +Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_1_stride_1_padding_same_1_is_signed_0,Fail +Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail +Conv2D.Op/input_size_112_weight_size_1_input_channels_120_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail + +Models.Op/mobiledet_086,Fail +Models.Op/mobiledet_087,Fail +Models.Op/mobiledet_ssdlite_mobiledet_coco_qat_postprocess,Fail + +Models.Op/yolox_005,Fail +Models.Op/yolox_007,Fail 
+Models.Op/yolox_008,Fail +Models.Op/yolox_009,Fail +Models.Op/yolox_010,Fail +Models.Op/yolox_012,Fail +Models.Op/yolox_014,Fail +Models.Op/yolox_016,Fail +Models.Op/yolox_018,Fail +Models.Op/yolox_019,Fail +Models.Op/yolox_021,Fail +Models.Op/yolox_022,Fail +Models.Op/yolox_024,Fail +Models.Op/yolox_025,Fail +Models.Op/yolox_031,Fail +Models.Op/yolox_034,Fail +Models.Op/yolox_037,Fail +Models.Op/yolox_040,Fail +Models.Op/yolox_046,Fail +Models.Op/yolox_055,Fail +Models.Op/yolox_064,Fail +Models.Op/yolox_072,Fail +Models.Op/yolox_073,Fail +Models.Op/yolox_078,Fail +Models.Op/yolox_082,Fail +Models.Op/yolox_087,Fail +Models.Op/yolox_091,Fail +Models.Op/yolox_096,Fail +Models.Op/yolox_097,Fail +Models.Op/yolox_100,Fail +Models.Op/yolox_101,Fail +Models.Op/yolox_107,Fail +Models.Op/yolox_108,Fail +Models.Op/yolox_111,Fail +Models.Op/yolox_112,Fail +Models.Op/yolox_118,Fail +Models.Op/yolox_119,Fail +Models.Op/yolox_122,Fail +Models.Op/yolox_123,Fail +Models.Op/yolox_yolox,Fail
\ No newline at end of file diff --git a/src/gallium/drivers/rocket/ci/rocket-rk3588-flakes.txt b/src/gallium/drivers/rocket/ci/rocket-rk3588-flakes.txt new file mode 100644 index 00000000000..52f9ab3b05c --- /dev/null +++ b/src/gallium/drivers/rocket/ci/rocket-rk3588-flakes.txt @@ -0,0 +1,5 @@ +Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_1_stride_1_padding_same_0_is_signed_0 +Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_1_stride_2_padding_same_0_is_signed_0 +Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_1_stride_1_padding_same_1_is_signed_0 +Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_1_stride_2_padding_same_1_is_signed_0 +Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_1_stride_2_padding_same_0_is_signed_0
\ No newline at end of file diff --git a/src/gallium/drivers/rocket/ci/rocket-rk3588-skips.txt b/src/gallium/drivers/rocket/ci/rocket-rk3588-skips.txt new file mode 100644 index 00000000000..06fc099d2a1 --- /dev/null +++ b/src/gallium/drivers/rocket/ci/rocket-rk3588-skips.txt @@ -0,0 +1,29 @@ +Add.Op/.* +AddQuant.Op/.* +Conv2D.Op/.* +DepthwiseConv2D.Op/.* +FullyConnected.Op/.* + +# These tests below (adds) aren't well constructed and thus fail in TF +Models.Op/mobiledet_008 +Models.Op/mobiledet_011 +Models.Op/mobiledet_014 +Models.Op/mobiledet_019 +Models.Op/mobiledet_022 +Models.Op/mobiledet_025 +Models.Op/mobiledet_032 +Models.Op/mobiledet_035 +Models.Op/mobiledet_038 +Models.Op/mobiledet_045 +Models.Op/mobiledet_049 +Models.Op/mobiledet_053 +Models.Op/mobiledet_060 +Models.Op/mobiledet_064 +Models.Op/mobiledet_068 +Models.Op/yolox_011 +Models.Op/yolox_020 +Models.Op/yolox_023 +Models.Op/yolox_026 +Models.Op/yolox_035 +Models.Op/yolox_038 +Models.Op/yolox_041 diff --git a/src/gallium/drivers/rocket/decode.py b/src/gallium/drivers/rocket/decode.py new file mode 100644 index 00000000000..6bc4a5780c8 --- /dev/null +++ b/src/gallium/drivers/rocket/decode.py @@ -0,0 +1,75 @@ +#!/usr/bin/python3 +# +# Copyright © 2024-2025 Tomeu Vizoso +# +# SPDX-License-Identifier: MIT + +import sys +import os +import argparse +import struct +from gen_parser import Parser, Reg, Enum, mask, Error + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--xml', type=str, required=True) + parser.add_argument('--dump', type=str, required=True) + + args = parser.parse_args() + + p = Parser() + + try: + p.parse("", args.xml) + except Error as e: + print(e, file=sys.stderr) + exit(1) + + regs = {} + for e in p.file: + if isinstance(e, Reg): + regs[e.offset] = e + + domains = {} + for e in p.file: + if isinstance(e, Enum): + if e.name == "target": + for name, val in e.values: + domains[name] = val + + f = open(args.dump, mode='rb') + for i in range(0, 
os.path.getsize(args.dump) // 8): + cmd = f.read(8) + (offset, value, target) = struct.unpack("<hIh", cmd) + if offset in regs.keys(): + reg = regs[offset] + + if (target & 0xfffffffe) != domains[reg.domain]: + print("WARNING: target 0x%x doesn't match register's domain 0x%x" % (target, domains[reg.domain])) + + print("EMIT(REG_%s, " % regs[offset].full_name.upper(), end="") + first = True + if value == 0 or len(reg.bitset.fields) == 1: + print("0x%x" % value, end="") + else: + for field in reg.bitset.fields: + if field.type == "boolean": + if 1 << field.high & value: + if not first: + print(" | ", end="") + print("%s_%s" % (reg.full_name.upper(), field.name.upper()), end="") + first = False + elif field.type == "uint": + field_value = (value & mask(field.low, field.high)) >> field.low + if field_value != 0: + if not first: + print(" | ", end="") + print("%s_%s(%d)" % (reg.full_name.upper(), field.name.upper(), field_value), end="") + first = False + print(");") + else: + print("%x %x %x" % (target, offset, value)) + +if __name__ == '__main__': + main() diff --git a/src/gallium/drivers/rocket/extract_registers.py b/src/gallium/drivers/rocket/extract_registers.py new file mode 100644 index 00000000000..c804b6afb23 --- /dev/null +++ b/src/gallium/drivers/rocket/extract_registers.py @@ -0,0 +1,121 @@ +#!/usr/bin/python3 +# +# Copyright © 2024-2025 Tomeu Vizoso +# +# SPDX-License-Identifier: MIT + +import collections +import csv +import subprocess +import sys +from itertools import dropwhile +import camelot + +trm_file = sys.argv[1] +if trm_file.endswith(".pdf"): + data = subprocess.check_output(["pdftotext", "-tsv", sys.argv[1], "-"]).decode() +else: + assert(trm_file.endswith(".txt")) + data = open(sys.argv[1]).read() + +data = csv.reader(data.splitlines(), delimiter="\t") +data = collections.deque([x[11] for x in data]) + +def popcell(data): + cell = [] + while data[0] != "###FLOW###": + text = data.popleft() + cell.append(text) + data.popleft() ###FLOW### + 
data.popleft() ###LINE### + return cell + +text = None +while data[0] != "RKNN_pc_operation_enable": + data.popleft() + +def read_reg_offset(data): + while data: + text = data.popleft() + if text.startswith("(0x"): + return text.replace("(", "").replace(")", "") + +reg_names = [] +offsets = [] +while text != "RKNN_global_operation_enable": + text = data.popleft() + + if text.startswith("RKNN_"): + reg_names.append(text) + offsets.append(read_reg_offset(data)) + +print("Found %d registers in RKNN block" % len(reg_names)) + +""" +print(reg_names) +print(offsets) +sys.exit(0) +""" + +tables = camelot.read_pdf(sys.argv[1], line_scale=35, pages="0-60") +tables = collections.deque([x.data for x in tables[3:]]) + +# Join tables split by page breaks +new_tables = [] +while tables: + new_table = tables.popleft() + last_bitfield = new_table[-1][0].split(" ")[0] + while last_bitfield != "0" and not last_bitfield.endswith(":0"): + second_part = tables.popleft() + new_table.extend(second_part[1:]) + last_bitfield = second_part[-1][0].split(" ")[0] + new_tables.append(new_table) +tables = new_tables +print("Found %d tables in PDF" % len(tables)) + +domains = {} +for i in range(0, len(reg_names)): + reg_name = reg_names[i] + if "dpu_rdma" in reg_name: + domain = "dpu_rdma" + elif "ppu_rdma" in reg_name: + domain = "ppu_rdma" + else: + domain = reg_name.split("_")[1] + table = tables[i] + + if domain not in domains.keys(): + domains[domain] = [] + + reg = {} + reg["name"] = reg_name + reg["offset"] = offsets[i] + reg["field_names"] = [] + reg["field_bits"] = [] + + reserved_count = 0 + for row in table[1:]: + name = row[3].split('\n')[0] + + if name == "reserved": + name = "reserved_%d" % reserved_count + reserved_count += 1 + + reg["field_bits"].append(row[0].split(' ')[0]) + reg["field_names"].append(name) + + domains[domain].append(reg) + +for domain in domains.keys(): + print(' <domain name="%s" width="32">' % domain.upper()) + for reg in domains[domain]: + print(' <reg32 
offset="%s" name="%s">' % (reg["offset"], "_".join(reg["name"].strip().upper().split("_")[2:]))) + for i in range(0, len(reg["field_names"])): + if ":" in reg["field_bits"][i]: + high, low = reg["field_bits"][i].split(":") + bits = 'low="%s" high="%s"' % (low, high) + else: + bits = 'pos="%s"' % reg["field_bits"][i] + print(' <bitfield name="%s" %s type="uint"/>' % (reg["field_names"][i].strip().upper(), bits)) + print(' </reg32>') + print(' </domain>') diff --git a/src/gallium/drivers/rocket/gen_header.py b/src/gallium/drivers/rocket/gen_header.py new file mode 100644 index 00000000000..f3c6615dcb4 --- /dev/null +++ b/src/gallium/drivers/rocket/gen_header.py @@ -0,0 +1,137 @@ +#!/usr/bin/python3 +# +# Copyright © 2019-2024 Google, Inc. +# Copyright © 2024-2025 Tomeu Vizoso +# +# SPDX-License-Identifier: MIT + +import sys +import os +import argparse +import time +import datetime +from gen_parser import Parser, Reg, Enum, mask, Error + + +def dump_c(args, guard, func): + p = Parser() + + try: + p.parse(args.rnn, args.xml) + except Error as e: + print(e, file=sys.stderr) + exit(1) + + print("#ifndef %s\n#define %s\n" % (guard, guard)) + + print("""/* Autogenerated file, DO NOT EDIT manually! 
+ +This file was generated by the rules-ng-ng gen_header.py tool in this git repository: +http://gitlab.freedesktop.org/mesa/mesa/ +git clone https://gitlab.freedesktop.org/mesa/mesa.git + +The rules-ng-ng source files this header was generated from are: +""") + maxlen = 0 + for filepath in p.xml_files: + maxlen = max(maxlen, len(filepath)) + for filepath in p.xml_files: + pad = " " * (maxlen - len(filepath)) + filesize = str(os.path.getsize(filepath)) + filesize = " " * (7 - len(filesize)) + filesize + filetime = time.ctime(os.path.getmtime(filepath)) + print("- " + filepath + pad + " (" + filesize + " bytes, from " + filetime + ")") + if p.copyright_year: + current_year = str(datetime.date.today().year) + print() + print("Copyright (C) %s-%s by the following authors:" % (p.copyright_year, current_year)) + for author in p.authors: + print("- " + author) + if p.license: + print(p.license) + print("*/") + + print() + print("#ifdef __KERNEL__") + print("#include <linux/bug.h>") + print("#define assert(x) BUG_ON(!(x))") + print("#else") + print("#include <assert.h>") + print("#endif") + print() + + print("#ifdef __cplusplus") + print("#define __struct_cast(X)") + print("#else") + print("#define __struct_cast(X) (struct X)") + print("#endif") + print() + + func(p) + + print("static uint32_t rkt_get_target(uint32_t offset)") + print("{") + + print("\tswitch(offset) {") + for e in p.file: + if isinstance(e, Reg): + print("\t\tcase REG_%s:" % e.full_name) + print("\t\t\treturn %s;" % e.domain) + print("\t}") + print("\treturn 0;") + print("}") + + print("\n#endif /* %s */" % guard) + + +def dump_c_defines(args): + guard = str.replace(os.path.basename(args.xml), '.', '_').upper() + dump_c(args, guard, lambda p: p.dump()) + + +def dump_c_pack_structs(args): + guard = str.replace(os.path.basename(args.xml), '.', '_').upper() + '_STRUCTS' + dump_c(args, guard, lambda p: p.dump_structs()) + + +def dump_py_defines(args): + p = Parser() + + try: + p.parse(args.rnn, args.xml) + 
except Error as e: + print(e, file=sys.stderr) + exit(1) + + file_name = os.path.splitext(os.path.basename(args.xml))[0] + + print("from enum import IntEnum") + print("class %sRegs(IntEnum):" % file_name.upper()) + + os.path.basename(args.xml) + + p.dump_regs_py() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--rnn', type=str, required=True) + parser.add_argument('--xml', type=str, required=True) + + subparsers = parser.add_subparsers(required=True) + + parser_c_defines = subparsers.add_parser('c-defines') + parser_c_defines.set_defaults(func=dump_c_defines) + + parser_c_pack_structs = subparsers.add_parser('c-pack-structs') + parser_c_pack_structs.set_defaults(func=dump_c_pack_structs) + + parser_py_defines = subparsers.add_parser('py-defines') + parser_py_defines.set_defaults(func=dump_py_defines) + + args = parser.parse_args() + args.func(args) + + +if __name__ == '__main__': + main() diff --git a/src/gallium/drivers/rocket/gen_parser.py b/src/gallium/drivers/rocket/gen_parser.py new file mode 100644 index 00000000000..9ab1019b26f --- /dev/null +++ b/src/gallium/drivers/rocket/gen_parser.py @@ -0,0 +1,737 @@ +import xml.parsers.expat +import sys +import os +import collections + +class Error(Exception): + def __init__(self, message): + self.message = message + +class Enum(object): + def __init__(self, name): + self.name = name + self.values = [] + + def has_name(self, name): + for (n, value) in self.values: + if n == name: + return True + return False + + def dump(self): + use_hex = False + for (name, value) in self.values: + if value > 0x1000: + use_hex = True + + print("enum %s {" % self.name) + for (name, value) in self.values: + if use_hex: + print("\t%s = 0x%08x," % (name, value)) + else: + print("\t%s = %d," % (name, value)) + print("};\n") + + def dump_pack_struct(self): + pass + +class Field(object): + def __init__(self, name, low, high, shr, type, parser): + self.name = name + self.low = low + self.high = high + self.shr = 
shr + self.type = type + + builtin_types = [ None, "a3xx_regid", "boolean", "uint", "hex", "int", "fixed", "ufixed", "float", "address", "waddress" ] + + maxpos = parser.current_bitsize - 1 + + if low < 0 or low > maxpos: + raise parser.error("low attribute out of range: %d" % low) + if high < 0 or high > maxpos: + raise parser.error("high attribute out of range: %d" % high) + if high < low: + raise parser.error("low is greater than high: low=%d, high=%d" % (low, high)) + if self.type == "boolean" and not low == high: + raise parser.error("booleans should be 1 bit fields") + elif self.type == "float" and not (high - low == 31 or high - low == 15): + raise parser.error("floats should be 16 or 32 bit fields") + elif not self.type in builtin_types and not self.type in parser.enums: + raise parser.error("unknown type '%s'" % self.type) + + def ctype(self, var_name): + if self.type == None: + type = "uint32_t" + val = var_name + elif self.type == "boolean": + type = "bool" + val = var_name + elif self.type == "uint" or self.type == "hex" or self.type == "a3xx_regid": + type = "uint32_t" + val = var_name + elif self.type == "int": + type = "int32_t" + val = var_name + elif self.type == "fixed": + type = "float" + val = "((int32_t)(%s * %d.0))" % (var_name, 1 << self.radix) + elif self.type == "ufixed": + type = "float" + val = "((uint32_t)(%s * %d.0))" % (var_name, 1 << self.radix) + elif self.type == "float" and self.high - self.low == 31: + type = "float" + val = "fui(%s)" % var_name + elif self.type == "float" and self.high - self.low == 15: + type = "float" + val = "_mesa_float_to_half(%s)" % var_name + elif self.type in [ "address", "waddress" ]: + type = "uint64_t" + val = var_name + else: + type = "enum %s" % self.type + val = var_name + + if self.shr > 0: + val = "(%s >> %d)" % (val, self.shr) + + return (type, val) + +def tab_to(name, value): + tab_count = (68 - (len(name) & ~7)) // 8 + if tab_count <= 0: + tab_count = 1 + print(name + ('\t' * tab_count) + 
value) + +def mask(low, high): + return ((0xffffffffffffffff >> (64 - (high + 1 - low))) << low) + +def field_name(reg, f): + if f.name: + name = f.name.lower() + else: + # We hit this path when a reg is defined with no bitset fields, ie. + # <reg32 offset="0x88db" name="RB_BLIT_DST_ARRAY_PITCH" low="0" high="28" shr="6" type="uint"/> + name = reg.name.lower() + + if (name in [ "double", "float", "int" ]) or not (name[0].isalpha()): + name = "_" + name + + return name + +class Bitset(object): + def __init__(self, name, template): + self.name = name + self.inline = False + if template: + self.fields = template.fields[:] + else: + self.fields = [] + + # Get address field if there is one in the bitset, else return None: + def get_address_field(self): + for f in self.fields: + if f.type in [ "address", "waddress" ]: + return f + return None + + def dump_regpair_builder(self, reg): + print("#ifndef NDEBUG") + known_mask = 0 + for f in self.fields: + known_mask |= mask(f.low, f.high) + if f.type in [ "boolean", "address", "waddress" ]: + continue + type, val = f.ctype("fields.%s" % field_name(reg, f)) + print(" assert((%-40s & 0x%08x) == 0);" % (val, 0xffffffff ^ mask(0 , f.high - f.low))) + print(" assert((%-40s & 0x%08x) == 0);" % ("fields.unknown", known_mask)) + print("#endif\n") + + print(" return (struct fd_reg_pair) {") + if reg.array: + print(" .reg = REG_%s(__i)," % reg.full_name) + else: + print(" .reg = REG_%s," % reg.full_name) + + print(" .value =") + for f in self.fields: + if f.type in [ "address", "waddress" ]: + continue + else: + type, val = f.ctype("fields.%s" % field_name(reg, f)) + print(" (%-40s << %2d) |" % (val, f.low)) + value_name = "dword" + if reg.bit_size == 64: + value_name = "qword" + print(" fields.unknown | fields.%s," % (value_name,)) + + address = self.get_address_field() + if address: + print(" .bo = fields.bo,") + print(" .is_address = true,") + if f.type == "waddress": + print(" .bo_write = true,") + print(" .bo_offset = 
fields.bo_offset,") + print(" .bo_shift = %d," % address.shr) + print(" .bo_low = %d," % address.low) + + print(" };") + + def dump_pack_struct(self, reg=None): + if not reg: + return + + prefix = reg.full_name + + print("struct %s {" % prefix) + for f in self.fields: + if f.type in [ "address", "waddress" ]: + tab_to(" __bo_type", "bo;") + tab_to(" uint32_t", "bo_offset;") + continue + name = field_name(reg, f) + + type, val = f.ctype("var") + + tab_to(" %s" % type, "%s;" % name) + if reg.bit_size == 64: + tab_to(" uint64_t", "unknown;") + tab_to(" uint64_t", "qword;") + else: + tab_to(" uint32_t", "unknown;") + tab_to(" uint32_t", "dword;") + print("};\n") + + if reg.array: + print("static inline struct fd_reg_pair\npack_%s(uint32_t __i, struct %s fields)\n{" % + (prefix, prefix)) + else: + print("static inline struct fd_reg_pair\npack_%s(struct %s fields)\n{" % + (prefix, prefix)) + + self.dump_regpair_builder(reg) + + print("\n}\n") + + if self.get_address_field(): + skip = ", { .reg = 0 }" + else: + skip = "" + + if reg.array: + print("#define %s(__i, ...) pack_%s(__i, __struct_cast(%s) { __VA_ARGS__ })%s\n" % + (prefix, prefix, prefix, skip)) + else: + print("#define %s(...) 
pack_%s(__struct_cast(%s) { __VA_ARGS__ })%s\n" % + (prefix, prefix, prefix, skip)) + + + def dump(self, prefix=None): + if prefix == None: + prefix = self.name + for f in self.fields: + if f.name: + name = prefix + "_" + f.name + else: + name = prefix + + if not f.name and f.low == 0 and f.shr == 0 and not f.type in ["float", "fixed", "ufixed"]: + pass + elif f.type == "boolean" or (f.type == None and f.low == f.high): + tab_to("#define %s" % name, "0x%08x" % (1 << f.low)) + else: + tab_to("#define %s__MASK" % name, "0x%08x" % mask(f.low, f.high)) + tab_to("#define %s__SHIFT" % name, "%d" % f.low) + type, val = f.ctype("val") + + print("static inline uint32_t %s(%s val)\n{" % (name, type)) + if f.shr > 0: + print("\tassert(!(val & 0x%x));" % mask(0, f.shr - 1)) + print("\treturn ((%s) << %s__SHIFT) & %s__MASK;\n}" % (val, name, name)) + print() + +class Array(object): + def __init__(self, attrs, domain, variant): + if "name" in attrs: + self.name = attrs["name"] + else: + self.name = "" + self.domain = domain + self.variant = variant + self.offset = int(attrs["offset"], 0) + self.stride = int(attrs["stride"], 0) + self.length = int(attrs["length"], 0) + if "usage" in attrs: + self.usages = attrs["usage"].split(',') + else: + self.usages = None + + def dump(self): + print("#define REG_%s_%s(i0) (0x%08x + 0x%x*(i0))\n" % (self.domain, self.name, self.offset, self.stride)) + + def dump_pack_struct(self): + pass + + def dump_regpair_builder(self): + pass + +class Reg(object): + def __init__(self, attrs, domain, array, bit_size): + self.name = attrs["name"] + self.domain = domain + self.array = array + self.offset = int(attrs["offset"], 0) + self.type = None + self.bit_size = bit_size + if array: + self.name = array.name + "_" + self.name + self.full_name = self.domain + "_" + self.name + + def dump(self): + if self.array: + offset = self.array.offset + self.offset + print("static inline uint32_t REG_%s(uint32_t i0) { return 0x%08x + 0x%x*i0; }" % (self.full_name, 
offset, self.array.stride)) + else: + tab_to("#define REG_%s" % self.full_name, "0x%08x" % self.offset) + + if self.bitset.inline: + self.bitset.dump(self.full_name) + + def dump_pack_struct(self): + if self.bitset.inline: + self.bitset.dump_pack_struct(self) + + def dump_regpair_builder(self): + if self.bitset.inline: + self.bitset.dump_regpair_builder(self) + + def dump_py(self): + print("\tREG_%s = 0x%08x" % (self.full_name, self.offset)) + + +class Parser(object): + def __init__(self): + self.current_array = None + self.current_domain = None + self.current_prefix = None + self.current_prefix_type = None + self.current_stripe = None + self.current_bitset = None + self.current_bitsize = 32 + # The varset attribute on the domain specifies the enum which + # specifies all possible hw variants: + self.current_varset = None + # Regs that have multiple variants.. we only generated the C++ + # template based struct-packers for these + self.variant_regs = {} + # Information in which contexts regs are used, to be used in + # debug options + self.usage_regs = collections.defaultdict(list) + self.bitsets = {} + self.enums = {} + self.variants = set() + self.file = [] + self.xml_files = [] + self.copyright_year = None + self.authors = [] + self.license = None + + def error(self, message): + parser, filename = self.stack[-1] + return Error("%s:%d:%d: %s" % (filename, parser.CurrentLineNumber, parser.CurrentColumnNumber, message)) + + def prefix(self, variant=None): + if self.current_prefix_type == "variant" and variant: + return variant + elif self.current_stripe: + return self.current_stripe + "_" + self.current_domain + elif self.current_prefix: + return self.current_prefix + "_" + self.current_domain + else: + return self.current_domain + + def parse_field(self, name, attrs): + try: + if "pos" in attrs: + high = low = int(attrs["pos"], 0) + elif "high" in attrs and "low" in attrs: + high = int(attrs["high"], 0) + low = int(attrs["low"], 0) + else: + low = 0 + high = 
self.current_bitsize - 1 + + if "type" in attrs: + type = attrs["type"] + else: + type = None + + if "shr" in attrs: + shr = int(attrs["shr"], 0) + else: + shr = 0 + + b = Field(name, low, high, shr, type, self) + + if type == "fixed" or type == "ufixed": + b.radix = int(attrs["radix"], 0) + + self.current_bitset.fields.append(b) + except ValueError as e: + raise self.error(e) + + def parse_varset(self, attrs): + # Inherit the varset from the enclosing domain if not overriden: + varset = self.current_varset + if "varset" in attrs: + varset = self.enums[attrs["varset"]] + return varset + + def parse_variants(self, attrs): + if not "variants" in attrs: + return None + variant = attrs["variants"].split(",")[0] + if "-" in variant: + variant = variant[:variant.index("-")] + + varset = self.parse_varset(attrs) + + assert varset.has_name(variant) + + return variant + + def add_all_variants(self, reg, attrs, parent_variant): + # TODO this should really handle *all* variants, including dealing + # with open ended ranges (ie. "A2XX,A4XX-") (we have the varset + # enum now to make that possible) + variant = self.parse_variants(attrs) + if not variant: + variant = parent_variant + + if reg.name not in self.variant_regs: + self.variant_regs[reg.name] = {} + else: + # All variants must be same size: + v = next(iter(self.variant_regs[reg.name])) + assert self.variant_regs[reg.name][v].bit_size == reg.bit_size + + self.variant_regs[reg.name][variant] = reg + + def add_all_usages(self, reg, usages): + if not usages: + return + + for usage in usages: + self.usage_regs[usage].append(reg) + + self.variants.add(reg.domain) + + def do_validate(self, schemafile): + try: + from lxml import etree + + parser, filename = self.stack[-1] + dirname = os.path.dirname(filename) + + # we expect this to look like <namespace url> schema.xsd.. I think + # technically it is supposed to be just a URL, but that doesn't + # quite match up to what we do.. 
Just skip over everything up to + # and including the first whitespace character: + schemafile = schemafile[schemafile.rindex(" ")+1:] + + # this is a bit cheezy, but the xml file to validate could be + # in a child director, ie. we don't really know where the schema + # file is, the way the rnn C code does. So if it doesn't exist + # just look one level up + if not os.path.exists(dirname + "/" + schemafile): + schemafile = "../" + schemafile + + if not os.path.exists(dirname + "/" + schemafile): + raise self.error("Cannot find schema for: " + filename) + + xmlschema_doc = etree.parse(dirname + "/" + schemafile) + xmlschema = etree.XMLSchema(xmlschema_doc) + + xml_doc = etree.parse(filename) + if not xmlschema.validate(xml_doc): + error_str = str(xmlschema.error_log.filter_from_errors()[0]) + raise self.error("Schema validation failed for: " + filename + "\n" + error_str) + except ImportError: + print("lxml not found, skipping validation", file=sys.stderr) + + def do_parse(self, filename): + filepath = os.path.abspath(filename) + if filepath in self.xml_files: + return + self.xml_files.append(filepath) + file = open(filename, "rb") + parser = xml.parsers.expat.ParserCreate() + self.stack.append((parser, filename)) + parser.StartElementHandler = self.start_element + parser.EndElementHandler = self.end_element + parser.CharacterDataHandler = self.character_data + parser.buffer_text = True + parser.ParseFile(file) + self.stack.pop() + file.close() + + def parse(self, rnn_path, filename): + self.path = rnn_path + self.stack = [] + self.do_parse(filename) + + def parse_reg(self, attrs, bit_size): + self.current_bitsize = bit_size + if "type" in attrs and attrs["type"] in self.bitsets: + bitset = self.bitsets[attrs["type"]] + if bitset.inline: + self.current_bitset = Bitset(attrs["name"], bitset) + self.current_bitset.inline = True + else: + self.current_bitset = bitset + else: + self.current_bitset = Bitset(attrs["name"], None) + self.current_bitset.inline = True + if 
"type" in attrs: + self.parse_field(None, attrs) + + variant = self.parse_variants(attrs) + if not variant and self.current_array: + variant = self.current_array.variant + + self.current_reg = Reg(attrs, self.prefix(variant), self.current_array, bit_size) + self.current_reg.bitset = self.current_bitset + + if len(self.stack) == 1: + self.file.append(self.current_reg) + + if variant is not None: + self.add_all_variants(self.current_reg, attrs, variant) + + usages = None + if "usage" in attrs: + usages = attrs["usage"].split(',') + elif self.current_array: + usages = self.current_array.usages + + self.add_all_usages(self.current_reg, usages) + + def start_element(self, name, attrs): + self.cdata = "" + if name == "import": + filename = attrs["file"] + self.do_parse(os.path.join(self.path, filename)) + elif name == "domain": + self.current_domain = attrs["name"] + if "prefix" in attrs: + self.current_prefix = self.parse_variants(attrs) + self.current_prefix_type = attrs["prefix"] + else: + self.current_prefix = None + self.current_prefix_type = None + if "varset" in attrs: + self.current_varset = self.enums[attrs["varset"]] + elif name == "stripe": + self.current_stripe = self.parse_variants(attrs) + elif name == "enum": + self.current_enum_value = 0 + self.current_enum = Enum(attrs["name"]) + self.enums[attrs["name"]] = self.current_enum + if len(self.stack) == 1: + self.file.append(self.current_enum) + elif name == "value": + if "value" in attrs: + value = int(attrs["value"], 0) + else: + value = self.current_enum_value + self.current_enum.values.append((attrs["name"], value)) + elif name == "reg32": + self.parse_reg(attrs, 32) + elif name == "reg64": + self.parse_reg(attrs, 64) + elif name == "array": + self.current_bitsize = 32 + variant = self.parse_variants(attrs) + self.current_array = Array(attrs, self.prefix(variant), variant) + if len(self.stack) == 1: + self.file.append(self.current_array) + elif name == "bitset": + self.current_bitset = 
Bitset(attrs["name"], None) + if "inline" in attrs and attrs["inline"] == "yes": + self.current_bitset.inline = True + self.bitsets[self.current_bitset.name] = self.current_bitset + if len(self.stack) == 1 and not self.current_bitset.inline: + self.file.append(self.current_bitset) + elif name == "bitfield" and self.current_bitset: + self.parse_field(attrs["name"], attrs) + elif name == "database": + self.do_validate(attrs["xsi:schemaLocation"]) + elif name == "copyright": + self.copyright_year = attrs["year"] + elif name == "author": + self.authors.append(attrs["name"] + " <" + attrs["email"] + "> " + attrs["name"]) + + def end_element(self, name): + if name == "domain": + self.current_domain = None + self.current_prefix = None + self.current_prefix_type = None + elif name == "stripe": + self.current_stripe = None + elif name == "bitset": + self.current_bitset = None + elif name == "reg32": + self.current_reg = None + elif name == "array": + self.current_array = None + elif name == "enum": + self.current_enum = None + elif name == "license": + self.license = self.cdata + + def character_data(self, data): + self.cdata += data + + def dump_reg_usages(self): + d = collections.defaultdict(list) + for usage, regs in self.usage_regs.items(): + for reg in regs: + variants = self.variant_regs.get(reg.name) + if variants: + for variant, vreg in variants.items(): + if reg == vreg: + d[(usage, variant)].append(reg) + else: + for variant in self.variants: + d[(usage, variant)].append(reg) + + print("#ifdef __cplusplus") + + for usage, regs in self.usage_regs.items(): + print("template<chip CHIP> constexpr inline uint16_t %s_REGS[] = {};" % (usage.upper())) + + for (usage, variant), regs in d.items(): + offsets = [] + + for reg in regs: + if reg.array: + for i in range(reg.array.length): + offsets.append(reg.array.offset + reg.offset + i * reg.array.stride) + if reg.bit_size == 64: + offsets.append(offsets[-1] + 1) + else: + offsets.append(reg.offset) + if reg.bit_size == 64: + 
offsets.append(offsets[-1] + 1) + + offsets.sort() + + print("template<> constexpr inline uint16_t %s_REGS<%s>[] = {" % (usage.upper(), variant)) + for offset in offsets: + print("\t%s," % hex(offset)) + print("};") + + print("#endif") + + def dump(self): + enums = [] + bitsets = [] + regs = [] + for e in self.file: + if isinstance(e, Enum): + enums.append(e) + elif isinstance(e, Bitset): + bitsets.append(e) + else: + regs.append(e) + + for e in enums + bitsets + regs: + e.dump() + + self.dump_reg_usages() + + + def dump_regs_py(self): + regs = [] + for e in self.file: + if isinstance(e, Reg): + regs.append(e) + + for e in regs: + e.dump_py() + + + def dump_reg_variants(self, regname, variants): + # Don't bother for things that only have a single variant: + if len(variants) == 1: + return + print("#ifdef __cplusplus") + print("struct __%s {" % regname) + # TODO be more clever.. we should probably figure out which + # fields have the same type in all variants (in which they + # appear) and stuff everything else in a variant specific + # sub-structure. 
+ seen_fields = [] + bit_size = 32 + array = False + address = None + for variant in variants.keys(): + print(" /* %s fields: */" % variant) + reg = variants[variant] + bit_size = reg.bit_size + array = reg.array + for f in reg.bitset.fields: + fld_name = field_name(reg, f) + if fld_name in seen_fields: + continue + seen_fields.append(fld_name) + name = fld_name.lower() + if f.type in [ "address", "waddress" ]: + if address: + continue + address = f + tab_to(" __bo_type", "bo;") + tab_to(" uint32_t", "bo_offset;") + continue + type, val = f.ctype("var") + tab_to(" %s" %type, "%s;" %name) + print(" /* fallback fields: */") + if bit_size == 64: + tab_to(" uint64_t", "unknown;") + tab_to(" uint64_t", "qword;") + else: + tab_to(" uint32_t", "unknown;") + tab_to(" uint32_t", "dword;") + print("};") + # TODO don't hardcode the varset enum name + varenum = "chip" + print("template <%s %s>" % (varenum, varenum.upper())) + print("static inline struct fd_reg_pair") + xtra = "" + xtravar = "" + if array: + xtra = "int __i, " + xtravar = "__i, " + print("__%s(%sstruct __%s fields) {" % (regname, xtra, regname)) + for variant in variants.keys(): + print(" if (%s == %s) {" % (varenum.upper(), variant)) + reg = variants[variant] + reg.dump_regpair_builder() + print(" } else") + print(" assert(!\"invalid variant\");") + print("}") + + if bit_size == 64: + skip = ", { .reg = 0 }" + else: + skip = "" + + print("#define %s(VARIANT, %s...) 
__%s<VARIANT>(%s{__VA_ARGS__})%s" % (regname, xtravar, regname, xtravar, skip)) + print("#endif /* __cplusplus */") + + def dump_structs(self): + for e in self.file: + e.dump_pack_struct() + + for regname in self.variant_regs: + self.dump_reg_variants(regname, self.variant_regs[regname]) diff --git a/src/gallium/drivers/rocket/intercept.c b/src/gallium/drivers/rocket/intercept.c new file mode 100644 index 00000000000..6ffb8647d61 --- /dev/null +++ b/src/gallium/drivers/rocket/intercept.c @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2025 Tomeu Vizoso <tomeu@tomeuvizoso.net> + * SPDX-License-Identifier: MIT + */ + +#include <dlfcn.h> +#include <fcntl.h> +#include <linux/limits.h> +#include <stdarg.h> +#include <stdint.h> +#include <stdio.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <unistd.h> + +#include "drm-uapi/rknpu_ioctl.h" +#include "rkt_registers.h" + +// #define GETENV 1 + +struct bo { + int handle; + unsigned size; + uint64_t obj_addr; + uint64_t dma_addr; +}; + +#define MAX_BOS 3000 + +struct context { + int dump_file; + int device_fd; + struct bo bos[MAX_BOS]; + unsigned next_handle_id; +}; + +struct context context = {0}; + +static void +dump_log(const char *format, ...) +{ + va_list args; + va_start(args, format); + + int dump_fd = open("rknpu.log", O_CREAT | O_RDWR | O_APPEND, + S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH); + vdprintf(dump_fd, format, args); + close(dump_fd); + + va_end(args); +} + +#define L(...) 
dump_log(__VA_ARGS__); + /* map_bo: asks the driver for the mmap offset of bo->handle with DRM_IOCTL_RKNPU_MEM_MAP on context.device_fd, then maps bo->size bytes read/write and shared, returning the mmap result. NOTE(review): neither the ioctl nor the mmap result is checked and no munmap is visible in this chunk, so every call appears to create another mapping - confirm this is acceptable for a debug tool. */+static void * +map_bo(struct bo *bo) +{ + struct rknpu_mem_map req = {0}; + + req.handle = bo->handle; + ioctl(context.device_fd, DRM_IOCTL_RKNPU_MEM_MAP, &req); + return mmap(NULL, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED, + context.device_fd, req.offset); +} + /* find_bo: linear search over the first context.next_handle_id entries of context.bos for the buffer whose range starting at dma_addr and spanning size bytes contains dma_address. On a hit the byte offset inside that buffer is stored through the offset pointer and the entry is returned; otherwise NULL is returned and the offset is left untouched. Every comparison is logged to stderr. NOTE(review): dma_address is uint64_t but is printed with %lx; PRIx64 would be the portable specifier. */+static struct bo * +find_bo(uint64_t dma_address, unsigned *offset) +{ + for (int j = 0; j < context.next_handle_id; j++) { + fprintf(stderr, "needle %lx hay %lx i %d\n", dma_address, + context.bos[j].dma_addr, j); + if (dma_address >= context.bos[j].dma_addr && + dma_address < context.bos[j].dma_addr + context.bos[j].size) { + *offset = dma_address - context.bos[j].dma_addr; + return &context.bos[j]; + } + } + + return NULL; +} + /* dump_buffer: writes size bytes, starting at dma_address, of the tracked buffer containing that address into a file called name (created or truncated, read/write for user, group and other). A size of 0, or a size that would run past the end of the buffer, is replaced by the number of bytes remaining after the offset. NOTE(review): bo is dereferenced without a NULL check although find_bo can return NULL; the results of open and write are not checked; adding offset to the void pointer returned by map_bo relies on a compiler extension. */+static void +dump_buffer(const char *name, uint64_t dma_address, unsigned size) +{ + unsigned offset = 0; + struct bo *bo = find_bo(dma_address, &offset); + + fprintf(stderr, "dump_buffer name %s dma 0x%lx size %u bo %p\n", name, + dma_address, size, bo); + + if (size == 0 || size + offset > bo->size) + size = bo->size - offset; + + int fd = open(name, O_CREAT | O_RDWR | O_TRUNC, + S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH); + write(fd, map_bo(bo) + offset, size); + close(fd); +} + /* Running index used to number the dump files; handle_submit increments it once per task it dumps. */+static unsigned task_id = 0; + /* handle_submit: logs every field of a rknpu_submit request, then walks its task array, dumping the register command stream of each task and the buffers that stream refers to. The value stored through output_address is reset to 0 for each task that has a command stream and then set to the first REG_DPU_DST_BASE_ADDR value found in it. Always returns 0. L is a logging macro defined before this chunk (it ends in a dump_log call). */+static int +handle_submit(struct rknpu_submit *args, uint32_t *output_address) +{ + int ret = 0; + + L("struct rknpu_submit submit = {\n"); + L(" .flags = %x,\n", args->flags); + L(" .timeout = %d,\n", args->timeout); + L(" .task_start = %d,\n", args->task_start); + L(" .task_number = %d,\n", args->task_number); + L(" .task_counter = %d,\n", args->task_counter); + L(" .priority = %d,\n", args->priority); + L(" .task_obj_addr = 0x%llx,\n", args->task_obj_addr); + L(" .regcfg_obj_addr = 0x%llx,\n", args->regcfg_obj_addr); + L(" .task_base_addr = 0x%llx,\n", args->task_base_addr); + L(" .user_data = 0x%llx,\n", args->user_data); + L(" .core_mask = %x,\n", args->core_mask); + L(" .fence_fd = %d,\n", args->fence_fd); + L("
.subcore_task = {\n"); + L(" {\n"); + L(" .task_start = %d,\n", args->subcore_task[0].task_start); + L(" .task_number = %d,\n", args->subcore_task[0].task_number); + L(" },\n"); + L(" {\n"); + L(" .task_start = %d,\n", args->subcore_task[1].task_start); + L(" .task_number = %d,\n", args->subcore_task[1].task_number); + L(" },\n"); + L(" {\n"); + L(" .task_start = %d,\n", args->subcore_task[2].task_start); + L(" .task_number = %d,\n", args->subcore_task[2].task_number); + L(" },\n"); + L(" },\n"); + L("};\n"); + /* Find the tracked buffer that holds the task array by matching obj_addr. NOTE(review): task_bo stays NULL when nothing matches and is then handed to map_bo unchecked. */+ struct bo *task_bo = NULL; + for (int i = 0; i < context.next_handle_id; i++) { + if (context.bos[i].obj_addr == args->task_obj_addr) { + task_bo = &context.bos[i]; + break; + } + } + /* NOTE(review): only task_number / 3 entries are visited; the reason for the division is not visible in this chunk - TODO confirm. */+ struct rknpu_task *tasks = map_bo(task_bo); + for (int i = args->task_start; i < args->task_start + args->task_number / 3; + i++) { + L("tasks[%d].flags = 0x%x;\n", i, tasks[i].flags); + L("tasks[%d].op_idx = %d;\n", i, tasks[i].op_idx); + L("tasks[%d].enable_mask = 0x%x;\n", i, tasks[i].enable_mask); + L("tasks[%d].int_mask = 0x%x;\n", i, tasks[i].int_mask); + L("tasks[%d].int_clear = 0x%x;\n", i, tasks[i].int_clear); + L("tasks[%d].regcfg_amount = %d;\n", i, tasks[i].regcfg_amount); + L("tasks[%d].regcfg_offset = 0x%x;\n", i, tasks[i].regcfg_offset); + L("tasks[%d].regcmd_addr = 0x%llx;\n", i, tasks[i].regcmd_addr); + + if (tasks[i].regcmd_addr == 0x0) + continue; + /* The command stream is dumped as regcfg_amount plus RKNPU_PC_DATA_EXTRA_AMOUNT entries of 64 bits each. */+ char name[PATH_MAX]; + unsigned size = (tasks[i].regcfg_amount + RKNPU_PC_DATA_EXTRA_AMOUNT) * + sizeof(uint64_t); + sprintf(name, "regcmd%d.bin", task_id); + dump_buffer(name, tasks[i].regcmd_addr + tasks[i].regcfg_offset, size); + + uint32_t input_address = 0x0; + *output_address = 0x0; + uint32_t weights_address = 0x0; + uint32_t biases_address = 0x0; + uint32_t eltwise_address = 0x0; + /* Map the same command stream again, this time to scan it for the buffer addresses it programs. NOTE(review): bo is not checked for NULL before use. */+ unsigned offset = 0; + struct bo *bo = + find_bo(tasks[i].regcmd_addr + tasks[i].regcfg_offset, &offset); + uint64_t *regcmd = map_bo(bo) + offset; + for (int j = 0; j < tasks[i].regcfg_amount +
RKNPU_PC_DATA_EXTRA_AMOUNT; + j++) { /* Each 64-bit entry is decoded here as: low 16 bits compared against a REG_ constant, bits 16 to 47 taken as the 32-bit value for that register. */+ switch (regcmd[j] & 0xffff) { + case REG_CNA_FEATURE_DATA_ADDR: + input_address = (regcmd[j] & 0xffffffff0000) >> 16; + break; + case REG_CNA_DCOMP_ADDR0: + weights_address = (regcmd[j] & 0xffffffff0000) >> 16; + break; + case REG_DPU_DST_BASE_ADDR: + if (*output_address == 0x0) + *output_address = (regcmd[j] & 0xffffffff0000) >> 16; + break; + case REG_DPU_RDMA_RDMA_BS_BASE_ADDR: + biases_address = (regcmd[j] & 0xffffffff0000) >> 16; + break; + case REG_DPU_RDMA_RDMA_EW_BASE_ADDR: + eltwise_address = (regcmd[j] & 0xffffffff0000) >> 16; + break; + } + } + + fprintf(stderr, "weights_address %x\n", weights_address); + fprintf(stderr, "input_address %x\n", input_address); + fprintf(stderr, "output_address %x\n", *output_address); + fprintf(stderr, "biases_address %x\n", biases_address); + fprintf(stderr, "eltwise_address %x\n", eltwise_address); + /* Dump every buffer the command stream referenced; passing size 0 makes dump_buffer write from the address to the end of the containing buffer. */+ if (weights_address != 0x0) { + sprintf(name, "weights%d.bin", task_id); + dump_buffer(name, weights_address, 0); + } + + if (biases_address != 0x0) { + sprintf(name, "biases%d.bin", task_id); + dump_buffer(name, biases_address, 0); + } + + if (eltwise_address != 0x0) { + sprintf(name, "eltwise%d.bin", task_id); + dump_buffer(name, eltwise_address, 0); + } + + if (input_address != 0x0) { + sprintf(name, "input%d.bin", task_id); + dump_buffer(name, input_address, 0); + } + + task_id++; + } + + return ret; +} + /* handle_mem_sync: logs the fields of a rknpu_mem_sync request; it has no other effect. */+static void +handle_mem_sync(struct rknpu_mem_sync *args) +{ + L("struct rknpu_mem_sync sync = {\n"); + L(" .flags = 0x%x,\n", args->flags); + L(" .reserved = 0x%x,\n", args->reserved); + L(" .obj_addr = 0x%llx,\n", args->obj_addr); + L(" .offset = 0x%llx,\n", args->offset); + L(" .size = %llu,\n", args->size); + L("};\n"); +} + /* handle_mem_create: records a created buffer (handle, size, obj_addr, dma_addr) in the next free slot of context.bos so that find_bo and handle_submit can resolve addresses later. Asserts that the table, bounded by MAX_BOS, still has room. Always returns 0. The field-by-field log is compiled out with if 0. */+static int +handle_mem_create(struct rknpu_mem_create *args) +{ + int ret = 0; + +#if 0 + L("struct rknpu_mem_create create = {\n"); + L(" .dma_addr = 0x%llx,\n", args->dma_addr); + L(" .flags = 0x%x,\n", args->flags); + L(" .handle = %u,\n",
args->handle); + L(" .obj_addr = 0x%llx,\n", args->obj_addr); + L(" .size = %llu,\n", args->size); + L("};\n"); +#endif + + assert(context.next_handle_id < MAX_BOS); + + context.bos[context.next_handle_id].handle = args->handle; + context.bos[context.next_handle_id].size = args->size; + context.bos[context.next_handle_id].obj_addr = args->obj_addr; + context.bos[context.next_handle_id].dma_addr = args->dma_addr; + + fprintf(stderr, "%s: dma_addr %llx\n", __func__, args->dma_addr); + context.next_handle_id++; + + return ret; +} + /* handle_action: logs a rknpu_action request, printing the name of the action selected by args->flags together with args->value; unknown actions are logged with their numeric flag. */+static void +handle_action(struct rknpu_action *args) +{ + switch (args->flags) { + case RKNPU_GET_HW_VERSION: + L("%s: RKNPU_GET_HW_VERSION %x\n", __func__, args->value); + break; + case RKNPU_GET_DRV_VERSION: + L("%s: RKNPU_GET_DRV_VERSION %x\n", __func__, args->value); + break; + case RKNPU_POWER_ON: + L("%s: RKNPU_POWER_ON %x\n", __func__, args->value); + break; + case RKNPU_GET_IOMMU_EN: + L("%s: RKNPU_GET_IOMMU_EN %x\n", __func__, args->value); + break; + case RKNPU_SET_PROC_NICE: + L("%s: RKNPU_SET_PROC_NICE %x\n", __func__, args->value); + break; + case RKNPU_GET_FREQ: + L("%s: RKNPU_GET_FREQ %x\n", __func__, args->value); + break; + default: + L("%s: unhandled action %d %x\n", __func__, args->flags, args->value); + break; + } +} + /* ioctl: replacement that shadows the C library symbol. It inspects RKNPU requests before and after forwarding them to the next ioctl found with dlsym and RTLD_NEXT, and returns whatever that call returned. */+typedef int (*real_ioctl_t)(int fd, unsigned long request, ...); +int +ioctl(int fd, unsigned long request, ...)
+{ + int ret; + uint32_t output_address = 0; + /* Exactly one pointer-sized variadic argument is fetched and later forwarded. */+ va_list ap; + va_start(ap, request); + void *ptr_ = va_arg(ap, void *); + va_end(ap); + /* The real implementation is looked up on every call. NOTE(review): the dlsym result is not checked. */+ real_ioctl_t real_ioctl; + real_ioctl = (real_ioctl_t)dlsym(RTLD_NEXT, "ioctl"); + + switch (request) { + case DRM_IOCTL_RKNPU_SUBMIT: + handle_submit(ptr_, &output_address); + break; + case DRM_IOCTL_RKNPU_MEM_SYNC: + // handle_mem_sync(ptr_); + break; + case DRM_IOCTL_RKNPU_ACTION: + // handle_action(ptr_); + break; + } + /* Forward the request unchanged; the output buffer dump and buffer bookkeeping below run only after the real call has returned. */+ ret = real_ioctl(fd, request, ptr_); + + switch (request) { /* NOTE(review): task_id was already advanced inside handle_submit, and output_address is still 0 when no destination register was seen, in which case dump_buffer would dereference a NULL bo - confirm and guard. */+ case DRM_IOCTL_RKNPU_SUBMIT: { + char name[PATH_MAX]; + sprintf(name, "output%d.bin", task_id); + dump_buffer(name, output_address, 0); + + break; + } /* Three request encodings are treated as buffer creation; what the raw 0xc0286442 value corresponds to is not visible in this chunk - TODO confirm. The fd is remembered in context.device_fd, which map_bo uses later. */+ case DRM_IOCTL_RKNPU_MEM_CREATE: + case IOCTL_RKNPU_MEM_CREATE: + case 0xc0286442: + handle_mem_create(ptr_); + context.device_fd = fd; + break; + } + + return ret; +} + +/* Intended to be called from GDB when the underlying memory is not directly + * accessible to it. */ +void dump_mem(uint32_t *ptr, unsigned bytes); + /* dump_mem: prints memory at ptr to stderr as rows of four 32-bit hexadecimal words. NOTE(review): the loop runs bytes / 4 times while every pass prints and skips four words (16 bytes), so roughly four times the requested amount is read; bytes / 16 looks like the intended bound - confirm. The counter is a signed int compared against an unsigned value. */+void +dump_mem(uint32_t *ptr, unsigned bytes) +{ + for (int i = 0; i < bytes / 4; i++) { + fprintf(stderr, "%08x %08x %08x %08x\n", ptr[0], ptr[1], ptr[2], ptr[3]); + ptr += 4; + } +} + /* Optional getenv replacement, compiled only when GETENV is defined: logs the looked-up name to stderr and forwards to the next getenv found with dlsym and RTLD_NEXT. */+#ifdef GETENV +typedef char *(*real_getenv_t)(const char *name); +char * +getenv(const char *name) +{ + real_getenv_t real_getenv; + real_getenv = (real_getenv_t)dlsym(RTLD_NEXT, "getenv"); + + fprintf(stderr, "getenv %s\n", name); + + return real_getenv(name); +} + +#endif diff --git a/src/gallium/drivers/rocket/meson.build b/src/gallium/drivers/rocket/meson.build new file mode 100644 index 00000000000..f327154e328 --- /dev/null +++ b/src/gallium/drivers/rocket/meson.build @@ -0,0 +1,38 @@ +# Copyright 2019 Google, Inc +# SPDX-License-Identifier: MIT + +rocket_registers = custom_target( + 'rkt_registers.h', + input : ['gen_header.py', 'registers.xml'], + output : 'rkt_registers.h', + command : [prog_python, '@INPUT0@', '--rnn', '.', '--xml', '@INPUT1@', 'c-defines'], + capture : true, +) +
+files_rocket = files( + 'rkt_coefs.c', + 'rkt_device.c', + 'rkt_ml.c', + 'rkt_regcmd.c', + 'rkt_task.c', +) + +librocket = static_library( + 'rocket', + [files_rocket, rocket_registers], + include_directories : [inc_gallium_aux, inc_gallium, inc_include, inc_src], + gnu_symbol_visibility : 'hidden', + dependencies : [idep_mesautil, dep_libdrm], +) + +driver_rocket = declare_dependency( + compile_args : '-DGALLIUM_ROCKET', + link_with : [librocketwinsys, librocket] +) + +shared_library('intercept', + [files('intercept.c'), rocket_registers], + include_directories : [inc_include], + dependencies : [dep_libdrm], + c_args: ['-Wno-error=missing-prototypes', '-g', '-O0'] +) diff --git a/src/gallium/drivers/rocket/registers.xml b/src/gallium/drivers/rocket/registers.xml new file mode 100644 index 00000000000..8410edab5f4 --- /dev/null +++ b/src/gallium/drivers/rocket/registers.xml @@ -0,0 +1,1179 @@ +<?xml version="1.0" encoding="UTF-8"?> +<database xmlns="http://nouveau.freedesktop.org/" +xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" +xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd"> + +<copyright year="2024"> + +<author name="Tomeu Vizoso" email="tomeu@tomeuvizoso.net"><nick name="tomeu"/> +Initial Author. +</author> + +<license> +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice (including the +next paragraph) shall be included in all copies or substantial +portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +</license> + +</copyright> + +<enum name="target"> + <value name="PC" value="0x100"/> + <value name="CNA" value="0x200"/> + <value name="CORE" value="0x800"/> + <value name="DPU" value="0x1000"/> + <value name="DPU_RDMA" value="0x2000"/> + <value name="PPU" value="0x4000"/> + <value name="PPU_RDMA" value="0x8000"/> + <value name="DDMA" value="0x10000"/> + <value name="SDMA" value="0x20000"/> + <value name="GLOBAL" value="0x40000"/> +</enum> + +<domain name="PC" width="32"> + <reg32 offset="0x0000" name="VERSION"> + <bitfield name="VERSION" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x0004" name="VERSION_NUM"> + <bitfield name="VERSION_NUM" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x0008" name="OPERATION_ENABLE"> + <bitfield name="RESERVED_0" low="1" high="31" type="uint"/> + <bitfield name="OP_EN" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x0010" name="BASE_ADDRESS"> + <bitfield name="PC_SOURCE_ADDR" low="4" high="31" type="uint"/> + <bitfield name="RESERVED_0" low="1" high="3" type="uint"/> + <bitfield name="PC_SEL" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x0014" name="REGISTER_AMOUNTS"> + <bitfield name="RESERVED_0" low="16" high="31" type="uint"/> + <bitfield name="PC_DATA_AMOUNT" low="0" high="15" type="uint"/> + </reg32> + <reg32 offset="0x0020" name="INTERRUPT_MASK"> + <bitfield name="RESERVED_0" low="14" high="31" type="uint"/> + <bitfield name="DMA_WRITE_ERROR" pos="13" type="boolean"/> + <bitfield name="DMA_READ_ERROR" pos="12" 
type="boolean"/> + <bitfield name="PPU_1" pos="11" type="boolean"/> + <bitfield name="PPU_0" pos="10" type="boolean"/> + <bitfield name="DPU_1" pos="9" type="boolean"/> + <bitfield name="DPU_0" pos="8" type="boolean"/> + <bitfield name="CORE_1" pos="7" type="boolean"/> + <bitfield name="CORE_0" pos="6" type="boolean"/> + <bitfield name="CNA_CSC_1" pos="5" type="boolean"/> + <bitfield name="CNA_CSC_0" pos="4" type="boolean"/> + <bitfield name="CNA_WEIGHT_1" pos="3" type="boolean"/> + <bitfield name="CNA_WEIGHT_0" pos="2" type="boolean"/> + <bitfield name="CNA_FEATURE_1" pos="1" type="boolean"/> + <bitfield name="CNA_FEATURE_0" pos="0" type="boolean"/> + </reg32> + <reg32 offset="0x0024" name="INTERRUPT_CLEAR"> + <bitfield name="RESERVED_0" low="14" high="31" type="uint"/> + <bitfield name="DMA_WRITE_ERROR" pos="13" type="boolean"/> + <bitfield name="DMA_READ_ERROR" pos="12" type="boolean"/> + <bitfield name="PPU_1" pos="11" type="boolean"/> + <bitfield name="PPU_0" pos="10" type="boolean"/> + <bitfield name="DPU_1" pos="9" type="boolean"/> + <bitfield name="DPU_0" pos="8" type="boolean"/> + <bitfield name="CORE_1" pos="7" type="boolean"/> + <bitfield name="CORE_0" pos="6" type="boolean"/> + <bitfield name="CNA_CSC_1" pos="5" type="boolean"/> + <bitfield name="CNA_CSC_0" pos="4" type="boolean"/> + <bitfield name="CNA_WEIGHT_1" pos="3" type="boolean"/> + <bitfield name="CNA_WEIGHT_0" pos="2" type="boolean"/> + <bitfield name="CNA_FEATURE_1" pos="1" type="boolean"/> + <bitfield name="CNA_FEATURE_0" pos="0" type="boolean"/> + </reg32> + <reg32 offset="0x0028" name="INTERRUPT_STATUS"> + <bitfield name="RESERVED_0" low="14" high="31" type="uint"/> + <bitfield name="DMA_WRITE_ERROR" pos="13" type="boolean"/> + <bitfield name="DMA_READ_ERROR" pos="12" type="boolean"/> + <bitfield name="PPU_1" pos="11" type="boolean"/> + <bitfield name="PPU_0" pos="10" type="boolean"/> + <bitfield name="DPU_1" pos="9" type="boolean"/> + <bitfield name="DPU_0" pos="8" type="boolean"/> + 
<bitfield name="CORE_1" pos="7" type="boolean"/> + <bitfield name="CORE_0" pos="6" type="boolean"/> + <bitfield name="CNA_CSC_1" pos="5" type="boolean"/> + <bitfield name="CNA_CSC_0" pos="4" type="boolean"/> + <bitfield name="CNA_WEIGHT_1" pos="3" type="boolean"/> + <bitfield name="CNA_WEIGHT_0" pos="2" type="boolean"/> + <bitfield name="CNA_FEATURE_1" pos="1" type="boolean"/> + <bitfield name="CNA_FEATURE_0" pos="0" type="boolean"/> + </reg32> + <reg32 offset="0x002C" name="INTERRUPT_RAW_STATUS"> + <bitfield name="RESERVED_0" low="14" high="31" type="uint"/> + <bitfield name="DMA_WRITE_ERROR" pos="13" type="boolean"/> + <bitfield name="DMA_READ_ERROR" pos="12" type="boolean"/> + <bitfield name="PPU_1" pos="11" type="boolean"/> + <bitfield name="PPU_0" pos="10" type="boolean"/> + <bitfield name="DPU_1" pos="9" type="boolean"/> + <bitfield name="DPU_0" pos="8" type="boolean"/> + <bitfield name="CORE_1" pos="7" type="boolean"/> + <bitfield name="CORE_0" pos="6" type="boolean"/> + <bitfield name="CNA_CSC_1" pos="5" type="boolean"/> + <bitfield name="CNA_CSC_0" pos="4" type="boolean"/> + <bitfield name="CNA_WEIGHT_1" pos="3" type="boolean"/> + <bitfield name="CNA_WEIGHT_0" pos="2" type="boolean"/> + <bitfield name="CNA_FEATURE_1" pos="1" type="boolean"/> + <bitfield name="CNA_FEATURE_0" pos="0" type="boolean"/> + </reg32> + <reg32 offset="0x0030" name="TASK_CON"> + <bitfield name="RESERVED_0" low="14" high="31" type="uint"/> + <bitfield name="TASK_COUNT_CLEAR" pos="13" type="uint"/> + <bitfield name="TASK_PP_EN" pos="12" type="uint"/> + <bitfield name="TASK_NUMBER" low="0" high="11" type="uint"/> + </reg32> + <reg32 offset="0x0034" name="TASK_DMA_BASE_ADDR"> + <bitfield name="DMA_BASE_ADDR" low="4" high="31" type="uint"/> + <bitfield name="RESERVED_0" low="0" high="3" type="uint"/> + </reg32> + <reg32 offset="0x003C" name="TASK_STATUS"> + <bitfield name="RESERVED_0" low="28" high="31" type="uint"/> + <bitfield name="TASK_STATUS" low="0" high="27" type="uint"/> + 
</reg32> +</domain> +<domain name="CNA" width="32"> + <reg32 offset="0x1000" name="S_STATUS"> + <bitfield name="RESERVED_0" low="18" high="31" type="uint"/> + <bitfield name="STATUS_1" low="16" high="17" type="uint"/> + <bitfield name="RESERVED_1" low="2" high="15" type="uint"/> + <bitfield name="STATUS_0" low="0" high="1" type="uint"/> + </reg32> + <reg32 offset="0x1004" name="S_POINTER"> + <bitfield name="RESERVED_0" low="17" high="31" type="uint"/> + <bitfield name="EXECUTER" pos="16" type="uint"/> + <bitfield name="RESERVED_1" low="6" high="15" type="uint"/> + <bitfield name="EXECUTER_PP_CLEAR" pos="5" type="uint"/> + <bitfield name="POINTER_PP_CLEAR" pos="4" type="uint"/> + <bitfield name="POINTER_PP_MODE" pos="3" type="uint"/> + <bitfield name="EXECUTER_PP_EN" pos="2" type="uint"/> + <bitfield name="POINTER_PP_EN" pos="1" type="uint"/> + <bitfield name="POINTER" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x1008" name="OPERATION_ENABLE"> + <bitfield name="RESERVED_0" low="1" high="31" type="uint"/> + <bitfield name="OP_EN" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x100C" name="CONV_CON1"> + <bitfield name="RESERVED_0" pos="31" type="uint"/> + <bitfield name="NONALIGN_DMA" pos="30" type="uint"/> + <bitfield name="GROUP_LINE_OFF" pos="29" type="uint"/> + <bitfield name="RESERVED_1" low="17" high="28" type="uint"/> + <bitfield name="DECONV" pos="16" type="uint"/> + <bitfield name="ARGB_IN" low="12" high="15" type="uint"/> + <bitfield name="RESERVED_2" low="10" high="11" type="uint"/> + <bitfield name="PROC_PRECISION" low="7" high="9" type="uint"/> + <bitfield name="IN_PRECISION" low="4" high="6" type="uint"/> + <bitfield name="CONV_MODE" low="0" high="3" type="uint"/> + </reg32> + <reg32 offset="0x1010" name="CONV_CON2"> + <bitfield name="RESERVED_0" low="24" high="31" type="uint"/> + <bitfield name="KERNEL_GROUP" low="16" high="23" type="uint"/> + <bitfield name="RESERVED_1" low="14" high="15" type="uint"/> + <bitfield name="FEATURE_GRAINS" 
low="4" high="13" type="uint"/> + <bitfield name="RESERVED_2" pos="3" type="uint"/> + <bitfield name="CSC_WO_EN" pos="2" type="uint"/> + <bitfield name="CSC_DO_EN" pos="1" type="uint"/> + <bitfield name="CMD_FIFO_SRST" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x1014" name="CONV_CON3"> + <bitfield name="RESERVED_0" pos="31" type="uint"/> + <bitfield name="NN_MODE" low="28" high="30" type="uint"/> + <bitfield name="RESERVED_1" low="26" high="27" type="uint"/> + <bitfield name="ATROUS_Y_DILATION" low="21" high="25" type="uint"/> + <bitfield name="ATROUS_X_DILATION" low="16" high="20" type="uint"/> + <bitfield name="RESERVED_2" low="14" high="15" type="uint"/> + <bitfield name="DECONV_Y_STRIDE" low="11" high="13" type="uint"/> + <bitfield name="DECONV_X_STRIDE" low="8" high="10" type="uint"/> + <bitfield name="RESERVED_3" low="6" high="7" type="uint"/> + <bitfield name="CONV_Y_STRIDE" low="3" high="5" type="uint"/> + <bitfield name="CONV_X_STRIDE" low="0" high="2" type="uint"/> + </reg32> + <reg32 offset="0x1020" name="DATA_SIZE0"> + <bitfield name="RESERVED_0" low="27" high="31" type="uint"/> + <bitfield name="DATAIN_WIDTH" low="16" high="26" type="uint"/> + <bitfield name="RESERVED_1" low="11" high="15" type="uint"/> + <bitfield name="DATAIN_HEIGHT" low="0" high="10" type="uint"/> + </reg32> + <reg32 offset="0x1024" name="DATA_SIZE1"> + <bitfield name="RESERVED_0" low="30" high="31" type="uint"/> + <bitfield name="DATAIN_CHANNEL_REAL" low="16" high="29" type="uint"/> + <bitfield name="DATAIN_CHANNEL" low="0" high="15" type="uint"/> + </reg32> + <reg32 offset="0x1028" name="DATA_SIZE2"> + <bitfield name="RESERVED_0" low="11" high="31" type="uint"/> + <bitfield name="DATAOUT_WIDTH" low="0" high="10" type="uint"/> + </reg32> + <reg32 offset="0x102C" name="DATA_SIZE3"> + <bitfield name="RESERVED_0" low="24" high="31" type="uint"/> + <bitfield name="SURF_MODE" low="22" high="23" type="uint"/> + <bitfield name="DATAOUT_ATOMICS" low="0" high="21" type="uint"/> + 
</reg32> + <reg32 offset="0x1030" name="WEIGHT_SIZE0"> + <bitfield name="WEIGHT_BYTES" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x1034" name="WEIGHT_SIZE1"> + <bitfield name="RESERVED_0" low="19" high="31" type="uint"/> + <bitfield name="WEIGHT_BYTES_PER_KERNEL" low="0" high="18" type="uint"/> + </reg32> + <reg32 offset="0x1038" name="WEIGHT_SIZE2"> + <bitfield name="RESERVED_0" low="29" high="31" type="uint"/> + <bitfield name="WEIGHT_WIDTH" low="24" high="28" type="uint"/> + <bitfield name="RESERVED_1" low="21" high="23" type="uint"/> + <bitfield name="WEIGHT_HEIGHT" low="16" high="20" type="uint"/> + <bitfield name="RESERVED_2" low="14" high="15" type="uint"/> + <bitfield name="WEIGHT_KERNELS" low="0" high="13" type="uint"/> + </reg32> + <reg32 offset="0x1040" name="CBUF_CON0"> + <bitfield name="RESERVED_0" low="14" high="31" type="uint"/> + <bitfield name="WEIGHT_REUSE" pos="13" type="uint"/> + <bitfield name="DATA_REUSE" pos="12" type="uint"/> + <bitfield name="RESERVED_1" pos="11" type="uint"/> + <bitfield name="FC_DATA_BANK" low="8" high="10" type="uint"/> + <bitfield name="WEIGHT_BANK" low="4" high="7" type="uint"/> + <bitfield name="DATA_BANK" low="0" high="3" type="uint"/> + </reg32> + <reg32 offset="0x1044" name="CBUF_CON1"> + <bitfield name="RESERVED_0" low="14" high="31" type="uint"/> + <bitfield name="DATA_ENTRIES" low="0" high="13" type="uint"/> + </reg32> + <reg32 offset="0x104C" name="CVT_CON0"> + <bitfield name="RESERVED_0" low="28" high="31" type="uint"/> + <bitfield name="CVT_TRUNCATE_3" low="22" high="27" type="uint"/> + <bitfield name="CVT_TRUNCATE_2" low="16" high="21" type="uint"/> + <bitfield name="CVT_TRUNCATE_1" low="10" high="15" type="uint"/> + <bitfield name="CVT_TRUNCATE_0" low="4" high="9" type="uint"/> + <bitfield name="DATA_SIGN" pos="3" type="uint"/> + <bitfield name="ROUND_TYPE" pos="2" type="uint"/> + <bitfield name="CVT_TYPE" pos="1" type="uint"/> + <bitfield name="CVT_BYPASS" pos="0" type="uint"/> + </reg32> 
+ <reg32 offset="0x1050" name="CVT_CON1"> + <bitfield name="CVT_SCALE0" low="16" high="31" type="uint"/> + <bitfield name="CVT_OFFSET0" low="0" high="15" type="uint"/> + </reg32> + <reg32 offset="0x1054" name="CVT_CON2"> + <bitfield name="CVT_SCALE1" low="16" high="31" type="uint"/> + <bitfield name="CVT_OFFSET1" low="0" high="15" type="uint"/> + </reg32> + <reg32 offset="0x1058" name="CVT_CON3"> + <bitfield name="CVT_SCALE2" low="16" high="31" type="uint"/> + <bitfield name="CVT_OFFSET2" low="0" high="15" type="uint"/> + </reg32> + <reg32 offset="0x105C" name="CVT_CON4"> + <bitfield name="CVT_SCALE3" low="16" high="31" type="uint"/> + <bitfield name="CVT_OFFSET3" low="0" high="15" type="uint"/> + </reg32> + <reg32 offset="0x1060" name="FC_CON0"> + <bitfield name="FC_SKIP_DATA" low="16" high="31" type="uint"/> + <bitfield name="RESERVED_0" low="1" high="15" type="uint"/> + <bitfield name="FC_SKIP_EN" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x1064" name="FC_CON1"> + <bitfield name="RESERVED_0" low="17" high="31" type="uint"/> + <bitfield name="DATA_OFFSET" low="0" high="16" type="uint"/> + </reg32> + <reg32 offset="0x1068" name="PAD_CON0"> + <bitfield name="RESERVED_0" low="8" high="31" type="uint"/> + <bitfield name="PAD_LEFT" low="4" high="7" type="uint"/> + <bitfield name="PAD_TOP" low="0" high="3" type="uint"/> + </reg32> + <reg32 offset="0x1070" name="FEATURE_DATA_ADDR"> + <bitfield name="FEATURE_BASE_ADDR" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x1074" name="FC_CON2"> + <bitfield name="RESERVED_0" low="17" high="31" type="uint"/> + <bitfield name="WEIGHT_OFFSET" low="0" high="16" type="uint"/> + </reg32> + <reg32 offset="0x1078" name="DMA_CON0"> + <bitfield name="OV4K_BYPASS" pos="31" type="uint"/> + <bitfield name="RESERVED_0" low="20" high="30" type="uint"/> + <bitfield name="WEIGHT_BURST_LEN" low="16" high="19" type="uint"/> + <bitfield name="RESERVED_1" low="4" high="15" type="uint"/> + <bitfield name="DATA_BURST_LEN" low="0" 
high="3" type="uint"/> + </reg32> + <reg32 offset="0x107C" name="DMA_CON1"> + <bitfield name="RESERVED_0" low="28" high="31" type="uint"/> + <bitfield name="LINE_STRIDE" low="0" high="27" type="uint"/> + </reg32> + <reg32 offset="0x1080" name="DMA_CON2"> + <bitfield name="RESERVED_0" low="28" high="31" type="uint"/> + <bitfield name="SURF_STRIDE" low="0" high="27" type="uint"/> + </reg32> + <reg32 offset="0x1084" name="FC_DATA_SIZE0"> + <bitfield name="RESERVED_0" low="30" high="31" type="uint"/> + <bitfield name="DMA_WIDTH" low="16" high="29" type="uint"/> + <bitfield name="RESERVED_1" low="11" high="15" type="uint"/> + <bitfield name="DMA_HEIGHT" low="0" high="10" type="uint"/> + </reg32> + <reg32 offset="0x1088" name="FC_DATA_SIZE1"> + <bitfield name="RESERVED_0" low="16" high="31" type="uint"/> + <bitfield name="DMA_CHANNEL" low="0" high="15" type="uint"/> + </reg32> + <reg32 offset="0x1090" name="CLK_GATE"> + <bitfield name="RESERVED_0" low="5" high="31" type="uint"/> + <bitfield name="CBUF_CS_DISABLE_CLKGATE" pos="4" type="uint"/> + <bitfield name="RESERVED_1" pos="3" type="uint"/> + <bitfield name="CSC_DISABLE_CLKGATE" pos="2" type="uint"/> + <bitfield name="CNA_WEIGHT_DISABLE_CLKGATE" pos="1" type="uint"/> + <bitfield name="CNA_FEATURE_DISABLE_CLKGATE" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x1100" name="DCOMP_CTRL"> + <bitfield name="RESERVED_0" low="4" high="31" type="uint"/> + <bitfield name="WT_DEC_BYPASS" pos="3" type="uint"/> + <bitfield name="DECOMP_CONTROL" low="0" high="2" type="uint"/> + </reg32> + <reg32 offset="0x1104" name="DCOMP_REGNUM"> + <bitfield name="DCOMP_REGNUM" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x1110" name="DCOMP_ADDR0"> + <bitfield name="DECOMPRESS_ADDR0" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x1140" name="DCOMP_AMOUNT0"> + <bitfield name="DCOMP_AMOUNT0" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x1144" name="DCOMP_AMOUNT1"> + <bitfield name="DCOMP_AMOUNT1" 
low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x1148" name="DCOMP_AMOUNT2"> + <bitfield name="DCOMP_AMOUNT2" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x114C" name="DCOMP_AMOUNT3"> + <bitfield name="DCOMP_AMOUNT3" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x1150" name="DCOMP_AMOUNT4"> + <bitfield name="DCOMP_AMOUNT4" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x1154" name="DCOMP_AMOUNT5"> + <bitfield name="DCOMP_AMOUNT5" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x1158" name="DCOMP_AMOUNT6"> + <bitfield name="DCOMP_AMOUNT6" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x115C" name="DCOMP_AMOUNT7"> + <bitfield name="DCOMP_AMOUNT7" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x1160" name="DCOMP_AMOUNT8"> + <bitfield name="DCOMP_AMOUNT8" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x1164" name="DCOMP_AMOUNT9"> + <bitfield name="DCOMP_AMOUNT9" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x1168" name="DCOMP_AMOUNT10"> + <bitfield name="DCOMP_AMOUNT10" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x116C" name="DCOMP_AMOUNT11"> + <bitfield name="DCOMP_AMOUNT11" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x1170" name="DCOMP_AMOUNT12"> + <bitfield name="DCOMP_AMOUNT12" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x1174" name="DCOMP_AMOUNT13"> + <bitfield name="DCOMP_AMOUNT13" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x1178" name="DCOMP_AMOUNT14"> + <bitfield name="DCOMP_AMOUNT14" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x117C" name="DCOMP_AMOUNT15"> + <bitfield name="DCOMP_AMOUNT15" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x1180" name="CVT_CON5"> + <bitfield name="PER_CHANNEL_CVT_EN" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x1184" name="PAD_CON1"> + <bitfield name="PAD_VALUE" low="0" high="31" 
type="uint"/> + </reg32> +</domain> +<domain name="CORE" width="32"> + <reg32 offset="0x3000" name="S_STATUS"> + <bitfield name="RESERVED_0" low="18" high="31" type="uint"/> + <bitfield name="STATUS_1" low="16" high="17" type="uint"/> + <bitfield name="RESERVED_1" low="2" high="15" type="uint"/> + <bitfield name="STATUS_0" low="0" high="1" type="uint"/> + </reg32> + <reg32 offset="0x3004" name="S_POINTER"> + <bitfield name="RESERVED_0" low="17" high="31" type="uint"/> + <bitfield name="EXECUTER" pos="16" type="uint"/> + <bitfield name="RESERVED_1" low="6" high="15" type="uint"/> + <bitfield name="EXECUTER_PP_CLEAR" pos="5" type="uint"/> + <bitfield name="POINTER_PP_CLEAR" pos="4" type="uint"/> + <bitfield name="POINTER_PP_MODE" pos="3" type="uint"/> + <bitfield name="EXECUTER_PP_EN" pos="2" type="uint"/> + <bitfield name="POINTER_PP_EN" pos="1" type="uint"/> + <bitfield name="POINTER" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x3008" name="OPERATION_ENABLE"> + <bitfield name="RESERVED_0" low="1" high="31" type="uint"/> + <bitfield name="OP_EN" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x300C" name="MAC_GATING"> + <bitfield name="RESERVED_0" low="27" high="31" type="uint"/> + <bitfield name="SLCG_OP_EN" low="0" high="26" type="uint"/> + </reg32> + <reg32 offset="0x3010" name="MISC_CFG"> + <bitfield name="RESERVED_0" low="20" high="31" type="uint"/> + <bitfield name="SOFT_GATING" low="14" high="19" type="uint"/> + <bitfield name="RESERVED_1" low="11" high="13" type="uint"/> + <bitfield name="PROC_PRECISION" low="8" high="10" type="uint"/> + <bitfield name="RESERVED_2" low="2" high="7" type="uint"/> + <bitfield name="DW_EN" pos="1" type="uint"/> + <bitfield name="QD_EN" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x3014" name="DATAOUT_SIZE_0"> + <bitfield name="DATAOUT_HEIGHT" low="16" high="31" type="uint"/> + <bitfield name="DATAOUT_WIDTH" low="0" high="15" type="uint"/> + </reg32> + <reg32 offset="0x3018" name="DATAOUT_SIZE_1"> + <bitfield 
name="RESERVED_0" low="16" high="31" type="uint"/> + <bitfield name="DATAOUT_CHANNEL" low="0" high="15" type="uint"/> + </reg32> + <reg32 offset="0x301C" name="CLIP_TRUNCATE"> + <bitfield name="RESERVED_0" low="7" high="31" type="uint"/> + <bitfield name="ROUND_TYPE" pos="6" type="uint"/> + <bitfield name="RESERVED_1" pos="5" type="uint"/> + <bitfield name="CLIP_TRUNCATE" low="0" high="4" type="uint"/> + </reg32> +</domain> +<domain name="DPU" width="32"> + <reg32 offset="0x4000" name="S_STATUS"> + <bitfield name="RESERVED_0" low="18" high="31" type="uint"/> + <bitfield name="STATUS_1" low="16" high="17" type="uint"/> + <bitfield name="RESERVED_1" low="2" high="15" type="uint"/> + <bitfield name="STATUS_0" low="0" high="1" type="uint"/> + </reg32> + <reg32 offset="0x4004" name="S_POINTER"> + <bitfield name="RESERVED_0" low="17" high="31" type="uint"/> + <bitfield name="EXECUTER" pos="16" type="uint"/> + <bitfield name="RESERVED_1" low="6" high="15" type="uint"/> + <bitfield name="EXECUTER_PP_CLEAR" pos="5" type="uint"/> + <bitfield name="POINTER_PP_CLEAR" pos="4" type="uint"/> + <bitfield name="POINTER_PP_MODE" pos="3" type="uint"/> + <bitfield name="EXECUTER_PP_EN" pos="2" type="uint"/> + <bitfield name="POINTER_PP_EN" pos="1" type="uint"/> + <bitfield name="POINTER" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x4008" name="OPERATION_ENABLE"> + <bitfield name="RESERVED_0" low="1" high="31" type="uint"/> + <bitfield name="OP_EN" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x400C" name="FEATURE_MODE_CFG"> + <bitfield name="COMB_USE" pos="31" type="uint"/> + <bitfield name="TP_EN" pos="30" type="uint"/> + <bitfield name="RGP_TYPE" low="26" high="29" type="uint"/> + <bitfield name="NONALIGN" pos="25" type="uint"/> + <bitfield name="SURF_LEN" low="9" high="24" type="uint"/> + <bitfield name="BURST_LEN" low="5" high="8" type="uint"/> + <bitfield name="CONV_MODE" low="3" high="4" type="uint"/> + <bitfield name="OUTPUT_MODE" low="1" high="2" type="uint"/> + 
<bitfield name="FLYING_MODE" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x4010" name="DATA_FORMAT"> + <bitfield name="OUT_PRECISION" low="29" high="31" type="uint"/> + <bitfield name="IN_PRECISION" low="26" high="28" type="uint"/> + <bitfield name="EW_TRUNCATE_NEG" low="16" high="25" type="uint"/> + <bitfield name="BN_MUL_SHIFT_VALUE_NEG" low="10" high="15" type="uint"/> + <bitfield name="BS_MUL_SHIFT_VALUE_NEG" low="4" high="9" type="uint"/> + <bitfield name="MC_SURF_OUT" pos="3" type="uint"/> + <bitfield name="PROC_PRECISION" low="0" high="2" type="uint"/> + </reg32> + <reg32 offset="0x4014" name="OFFSET_PEND"> + <bitfield name="RESERVED_0" low="16" high="31" type="uint"/> + <bitfield name="OFFSET_PEND" low="0" high="15" type="uint"/> + </reg32> + <reg32 offset="0x4020" name="DST_BASE_ADDR"> + <bitfield name="DST_BASE_ADDR" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x4024" name="DST_SURF_STRIDE"> + <bitfield name="DST_SURF_STRIDE" low="4" high="31" type="uint"/> + <bitfield name="RESERVED_0" low="0" high="3" type="uint"/> + </reg32> + <reg32 offset="0x4030" name="DATA_CUBE_WIDTH"> + <bitfield name="RESERVED_0" low="13" high="31" type="uint"/> + <bitfield name="WIDTH" low="0" high="12" type="uint"/> + </reg32> + <reg32 offset="0x4034" name="DATA_CUBE_HEIGHT"> + <bitfield name="RESERVED_0" low="25" high="31" type="uint"/> + <bitfield name="MINMAX_CTL" low="22" high="24" type="uint"/> + <bitfield name="RESERVED_1" low="13" high="21" type="uint"/> + <bitfield name="HEIGHT" low="0" high="12" type="uint"/> + </reg32> + <reg32 offset="0x4038" name="DATA_CUBE_NOTCH_ADDR"> + <bitfield name="RESERVED_0" low="29" high="31" type="uint"/> + <bitfield name="NOTCH_ADDR_1" low="16" high="28" type="uint"/> + <bitfield name="RESERVED_1" low="13" high="15" type="uint"/> + <bitfield name="NOTCH_ADDR_0" low="0" high="12" type="uint"/> + </reg32> + <reg32 offset="0x403C" name="DATA_CUBE_CHANNEL"> + <bitfield name="RESERVED_0" low="29" high="31" type="uint"/> + 
<bitfield name="ORIG_CHANNEL" low="16" high="28" type="uint"/> + <bitfield name="RESERVED_1" low="13" high="15" type="uint"/> + <bitfield name="CHANNEL" low="0" high="12" type="uint"/> + </reg32> + <reg32 offset="0x4040" name="BS_CFG"> + <bitfield name="RESERVED_0" low="20" high="31" type="uint"/> + <bitfield name="BS_ALU_ALGO" low="16" high="19" type="uint"/> + <bitfield name="RESERVED_1" low="9" high="15" type="uint"/> + <bitfield name="BS_ALU_SRC" pos="8" type="uint"/> + <bitfield name="BS_RELUX_EN" pos="7" type="uint"/> + <bitfield name="BS_RELU_BYPASS" pos="6" type="uint"/> + <bitfield name="BS_MUL_PRELU" pos="5" type="uint"/> + <bitfield name="BS_MUL_BYPASS" pos="4" type="uint"/> + <bitfield name="RESERVED_2" low="2" high="3" type="uint"/> + <bitfield name="BS_ALU_BYPASS" pos="1" type="uint"/> + <bitfield name="BS_BYPASS" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x4044" name="BS_ALU_CFG"> + <bitfield name="BS_ALU_OPERAND" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x4048" name="BS_MUL_CFG"> + <bitfield name="BS_MUL_OPERAND" low="16" high="31" type="uint"/> + <bitfield name="RESERVED_0" low="14" high="15" type="uint"/> + <bitfield name="BS_MUL_SHIFT_VALUE" low="8" high="13" type="uint"/> + <bitfield name="RESERVED_1" low="2" high="7" type="uint"/> + <bitfield name="BS_TRUNCATE_SRC" pos="1" type="uint"/> + <bitfield name="BS_MUL_SRC" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x404C" name="BS_RELUX_CMP_VALUE"> + <bitfield name="BS_RELUX_CMP_DAT" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x4050" name="BS_OW_CFG"> + <bitfield name="RGP_CNTER" low="28" high="31" type="uint"/> + <bitfield name="TP_ORG_EN" pos="27" type="uint"/> + <bitfield name="RESERVED_0" low="11" high="26" type="uint"/> + <bitfield name="SIZE_E_2" low="8" high="10" type="uint"/> + <bitfield name="SIZE_E_1" low="5" high="7" type="uint"/> + <bitfield name="SIZE_E_0" low="2" high="4" type="uint"/> + <bitfield name="OD_BYPASS" pos="1" type="uint"/> 
+ <bitfield name="OW_SRC" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x4054" name="BS_OW_OP"> + <bitfield name="RESERVED_0" low="16" high="31" type="uint"/> + <bitfield name="OW_OP" low="0" high="15" type="uint"/> + </reg32> + <reg32 offset="0x4058" name="WDMA_SIZE_0"> + <bitfield name="RESERVED_0" low="28" high="31" type="uint"/> + <bitfield name="TP_PRECISION" pos="27" type="uint"/> + <bitfield name="SIZE_C_WDMA" low="16" high="26" type="uint"/> + <bitfield name="RESERVED_1" low="13" high="15" type="uint"/> + <bitfield name="CHANNEL_WDMA" low="0" high="12" type="uint"/> + </reg32> + <reg32 offset="0x405C" name="WDMA_SIZE_1"> + <bitfield name="RESERVED_0" low="29" high="31" type="uint"/> + <bitfield name="HEIGHT_WDMA" low="16" high="28" type="uint"/> + <bitfield name="RESERVED_1" low="13" high="15" type="uint"/> + <bitfield name="WIDTH_WDMA" low="0" high="12" type="uint"/> + </reg32> + <reg32 offset="0x4060" name="BN_CFG"> + <bitfield name="RESERVED_0" low="20" high="31" type="uint"/> + <bitfield name="BN_ALU_ALGO" low="16" high="19" type="uint"/> + <bitfield name="RESERVED_1" low="9" high="15" type="uint"/> + <bitfield name="BN_ALU_SRC" pos="8" type="uint"/> + <bitfield name="BN_RELUX_EN" pos="7" type="uint"/> + <bitfield name="BN_RELU_BYPASS" pos="6" type="uint"/> + <bitfield name="BN_MUL_PRELU" pos="5" type="uint"/> + <bitfield name="BN_MUL_BYPASS" pos="4" type="uint"/> + <bitfield name="RESERVED_2" low="2" high="3" type="uint"/> + <bitfield name="BN_ALU_BYPASS" pos="1" type="uint"/> + <bitfield name="BN_BYPASS" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x4064" name="BN_ALU_CFG"> + <bitfield name="BN_ALU_OPERAND" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x4068" name="BN_MUL_CFG"> + <bitfield name="BN_MUL_OPERAND" low="16" high="31" type="uint"/> + <bitfield name="RESERVED_0" low="14" high="15" type="uint"/> + <bitfield name="BN_MUL_SHIFT_VALUE" low="8" high="13" type="uint"/> + <bitfield name="RESERVED_1" low="2" high="7" 
type="uint"/> + <bitfield name="BN_TRUNCATE_SRC" pos="1" type="uint"/> + <bitfield name="BN_MUL_SRC" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x406C" name="BN_RELUX_CMP_VALUE"> + <bitfield name="BN_RELUX_CMP_DAT" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x4070" name="EW_CFG"> + <bitfield name="EW_CVT_TYPE" pos="31" type="uint"/> + <bitfield name="EW_CVT_ROUND" pos="30" type="uint"/> + <bitfield name="EW_DATA_MODE" low="28" high="29" type="uint"/> + <bitfield name="RESERVED_0" low="24" high="27" type="uint"/> + <bitfield name="EDATA_SIZE" low="22" high="23" type="uint"/> + <bitfield name="EW_EQUAL_EN" pos="21" type="uint"/> + <bitfield name="EW_BINARY_EN" pos="20" type="uint"/> + <bitfield name="EW_ALU_ALGO" low="16" high="19" type="uint"/> + <bitfield name="RESERVED_1" low="11" high="15" type="uint"/> + <bitfield name="EW_RELUX_EN" pos="10" type="uint"/> + <bitfield name="EW_RELU_BYPASS" pos="9" type="uint"/> + <bitfield name="EW_OP_CVT_BYPASS" pos="8" type="uint"/> + <bitfield name="EW_LUT_BYPASS" pos="7" type="uint"/> + <bitfield name="EW_OP_SRC" pos="6" type="uint"/> + <bitfield name="EW_MUL_PRELU" pos="5" type="uint"/> + <bitfield name="RESERVED_2" low="3" high="4" type="uint"/> + <bitfield name="EW_OP_TYPE" pos="2" type="uint"/> + <bitfield name="EW_OP_BYPASS" pos="1" type="uint"/> + <bitfield name="EW_BYPASS" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x4074" name="EW_CVT_OFFSET_VALUE"> + <bitfield name="EW_OP_CVT_OFFSET" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x4078" name="EW_CVT_SCALE_VALUE"> + <bitfield name="EW_TRUNCATE" low="22" high="31" type="uint"/> + <bitfield name="EW_OP_CVT_SHIFT" low="16" high="21" type="uint"/> + <bitfield name="EW_OP_CVT_SCALE" low="0" high="15" type="uint"/> + </reg32> + <reg32 offset="0x407C" name="EW_RELUX_CMP_VALUE"> + <bitfield name="EW_RELUX_CMP_DAT" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x4080" name="OUT_CVT_OFFSET"> + <bitfield 
name="OUT_CVT_OFFSET" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x4084" name="OUT_CVT_SCALE"> + <bitfield name="RESERVED_0" low="17" high="31" type="uint"/> + <bitfield name="FP32TOFP16_EN" pos="16" type="uint"/> + <bitfield name="OUT_CVT_SCALE" low="0" high="15" type="uint"/> + </reg32> + <reg32 offset="0x4088" name="OUT_CVT_SHIFT"> + <bitfield name="CVT_TYPE" pos="31" type="uint"/> + <bitfield name="CVT_ROUND" pos="30" type="uint"/> + <bitfield name="RESERVED_0" low="20" high="29" type="uint"/> + <bitfield name="MINUS_EXP" low="12" high="19" type="uint"/> + <bitfield name="OUT_CVT_SHIFT" low="0" high="11" type="uint"/> + </reg32> + <reg32 offset="0x4090" name="EW_OP_VALUE_0"> + <bitfield name="EW_OPERAND_0" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x4094" name="EW_OP_VALUE_1"> + <bitfield name="EW_OPERAND_1" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x4098" name="EW_OP_VALUE_2"> + <bitfield name="EW_OPERAND_2" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x409C" name="EW_OP_VALUE_3"> + <bitfield name="EW_OPERAND_3" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x40A0" name="EW_OP_VALUE_4"> + <bitfield name="EW_OPERAND_4" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x40A4" name="EW_OP_VALUE_5"> + <bitfield name="EW_OPERAND_5" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x40A8" name="EW_OP_VALUE_6"> + <bitfield name="EW_OPERAND_6" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x40AC" name="EW_OP_VALUE_7"> + <bitfield name="EW_OPERAND_7" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x40C0" name="SURFACE_ADD"> + <bitfield name="SURF_ADD" low="4" high="31" type="uint"/> + <bitfield name="RESERVED_0" low="0" high="3" type="uint"/> + </reg32> + <reg32 offset="0x4100" name="LUT_ACCESS_CFG"> + <bitfield name="RESERVED_0" low="18" high="31" type="uint"/> + <bitfield name="LUT_ACCESS_TYPE" pos="17" type="uint"/> + <bitfield 
name="LUT_TABLE_ID" pos="16" type="uint"/> + <bitfield name="RESERVED_1" low="10" high="15" type="uint"/> + <bitfield name="LUT_ADDR" low="0" high="9" type="uint"/> + </reg32> + <reg32 offset="0x4104" name="LUT_ACCESS_DATA"> + <bitfield name="RESERVED_0" low="16" high="31" type="uint"/> + <bitfield name="LUT_ACCESS_DATA" low="0" high="15" type="uint"/> + </reg32> + <reg32 offset="0x4108" name="LUT_CFG"> + <bitfield name="RESERVED_0" low="8" high="31" type="uint"/> + <bitfield name="LUT_CAL_SEL" pos="7" type="uint"/> + <bitfield name="LUT_HYBRID_PRIORITY" pos="6" type="uint"/> + <bitfield name="LUT_OFLOW_PRIORITY" pos="5" type="uint"/> + <bitfield name="LUT_UFLOW_PRIORITY" pos="4" type="uint"/> + <bitfield name="LUT_LO_LE_MUX" low="2" high="3" type="uint"/> + <bitfield name="LUT_EXPAND_EN" pos="1" type="uint"/> + <bitfield name="LUT_ROAD_SEL" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x410C" name="LUT_INFO"> + <bitfield name="RESERVED_0" low="24" high="31" type="uint"/> + <bitfield name="LUT_LO_INDEX_SELECT" low="16" high="23" type="uint"/> + <bitfield name="LUT_LE_INDEX_SELECT" low="8" high="15" type="uint"/> + <bitfield name="RESERVED_1" low="0" high="7" type="uint"/> + </reg32> + <reg32 offset="0x4110" name="LUT_LE_START"> + <bitfield name="LUT_LE_START" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x4114" name="LUT_LE_END"> + <bitfield name="LUT_LE_END" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x4118" name="LUT_LO_START"> + <bitfield name="LUT_LO_START" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x411C" name="LUT_LO_END"> + <bitfield name="LUT_LO_END" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x4120" name="LUT_LE_SLOPE_SCALE"> + <bitfield name="LUT_LE_SLOPE_OFLOW_SCALE" low="16" high="31" type="uint"/> + <bitfield name="LUT_LE_SLOPE_UFLOW_SCALE" low="0" high="15" type="uint"/> + </reg32> + <reg32 offset="0x4124" name="LUT_LE_SLOPE_SHIFT"> + <bitfield name="RESERVED_0" low="10" high="31" 
type="uint"/> + <bitfield name="LUT_LE_SLOPE_OFLOW_SHIFT" low="5" high="9" type="uint"/> + <bitfield name="LUT_LE_SLOPE_UFLOW_SHIFT" low="0" high="4" type="uint"/> + </reg32> + <reg32 offset="0x4128" name="LUT_LO_SLOPE_SCALE"> + <bitfield name="LUT_LO_SLOPE_OFLOW_SCALE" low="16" high="31" type="uint"/> + <bitfield name="LUT_LO_SLOPE_UFLOW_SCALE" low="0" high="15" type="uint"/> + </reg32> + <reg32 offset="0x412C" name="LUT_LO_SLOPE_SHIFT"> + <bitfield name="RESERVED_0" low="10" high="31" type="uint"/> + <bitfield name="LUT_LO_SLOPE_OFLOW_SHIFT" low="5" high="9" type="uint"/> + <bitfield name="LUT_LO_SLOPE_UFLOW_SHIFT" low="0" high="4" type="uint"/> + </reg32> +</domain> +<domain name="DPU_RDMA" width="32"> + <reg32 offset="0x5000" name="RDMA_S_STATUS"> + <bitfield name="RESERVED_0" low="18" high="31" type="uint"/> + <bitfield name="STATUS_1" low="16" high="17" type="uint"/> + <bitfield name="RESERVED_1" low="2" high="15" type="uint"/> + <bitfield name="STATUS_0" low="0" high="1" type="uint"/> + </reg32> + <reg32 offset="0x5004" name="RDMA_S_POINTER"> + <bitfield name="RESERVED_0" low="17" high="31" type="uint"/> + <bitfield name="EXECUTER" pos="16" type="uint"/> + <bitfield name="RESERVED_1" low="6" high="15" type="uint"/> + <bitfield name="EXECUTER_PP_CLEAR" pos="5" type="uint"/> + <bitfield name="POINTER_PP_CLEAR" pos="4" type="uint"/> + <bitfield name="POINTER_PP_MODE" pos="3" type="uint"/> + <bitfield name="EXECUTER_PP_EN" pos="2" type="uint"/> + <bitfield name="POINTER_PP_EN" pos="1" type="uint"/> + <bitfield name="POINTER" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x5008" name="RDMA_OPERATION_ENABLE"> + <bitfield name="RESERVED_0" low="1" high="31" type="uint"/> + <bitfield name="OP_EN" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x500C" name="RDMA_DATA_CUBE_WIDTH"> + <bitfield name="RESERVED_0" low="13" high="31" type="uint"/> + <bitfield name="WIDTH" low="0" high="12" type="uint"/> + </reg32> + <reg32 offset="0x5010" 
name="RDMA_DATA_CUBE_HEIGHT"> + <bitfield name="RESERVED_0" low="29" high="31" type="uint"/> + <bitfield name="EW_LINE_NOTCH_ADDR" low="16" high="28" type="uint"/> + <bitfield name="RESERVED_1" low="13" high="15" type="uint"/> + <bitfield name="HEIGHT" low="0" high="12" type="uint"/> + </reg32> + <reg32 offset="0x5014" name="RDMA_DATA_CUBE_CHANNEL"> + <bitfield name="RESERVED_0" low="13" high="31" type="uint"/> + <bitfield name="CHANNEL" low="0" high="12" type="uint"/> + </reg32> + <reg32 offset="0x5018" name="RDMA_SRC_BASE_ADDR"> + <bitfield name="SRC_BASE_ADDR" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x501C" name="RDMA_BRDMA_CFG"> + <bitfield name="RESERVED_0" low="5" high="31" type="uint"/> + <bitfield name="BRDMA_DATA_USE" low="1" high="4" type="uint"/> + <bitfield name="RESERVED_1" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x5020" name="RDMA_BS_BASE_ADDR"> + <bitfield name="BS_BASE_ADDR" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x5028" name="RDMA_NRDMA_CFG"> + <bitfield name="RESERVED_0" low="5" high="31" type="uint"/> + <bitfield name="NRDMA_DATA_USE" low="1" high="4" type="uint"/> + <bitfield name="RESERVED_1" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x502C" name="RDMA_BN_BASE_ADDR"> + <bitfield name="BN_BASE_ADDR" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x5034" name="RDMA_ERDMA_CFG"> + <bitfield name="ERDMA_DATA_MODE" low="30" high="31" type="uint"/> + <bitfield name="ERDMA_SURF_MODE" pos="29" type="uint"/> + <bitfield name="ERDMA_NONALIGN" pos="28" type="uint"/> + <bitfield name="RESERVED_0" low="4" high="27" type="uint"/> + <bitfield name="ERDMA_DATA_SIZE" low="2" high="3" type="uint"/> + <bitfield name="OV4K_BYPASS" pos="1" type="uint"/> + <bitfield name="ERDMA_DISABLE" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x5038" name="RDMA_EW_BASE_ADDR"> + <bitfield name="EW_BASE_ADDR" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x5040" name="RDMA_EW_SURF_STRIDE"> + 
<bitfield name="EW_SURF_STRIDE" low="4" high="31" type="uint"/> + <bitfield name="RESERVED_0" low="0" high="3" type="uint"/> + </reg32> + <reg32 offset="0x5044" name="RDMA_FEATURE_MODE_CFG"> + <bitfield name="RESERVED_0" low="18" high="31" type="uint"/> + <bitfield name="IN_PRECISION" low="15" high="17" type="uint"/> + <bitfield name="BURST_LEN" low="11" high="14" type="uint"/> + <bitfield name="COMB_USE" low="8" high="10" type="uint"/> + <bitfield name="PROC_PRECISION" low="5" high="7" type="uint"/> + <bitfield name="MRDMA_DISABLE" pos="4" type="uint"/> + <bitfield name="MRDMA_FP16TOFP32_EN" pos="3" type="uint"/> + <bitfield name="CONV_MODE" low="1" high="2" type="uint"/> + <bitfield name="FLYING_MODE" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x5048" name="RDMA_SRC_DMA_CFG"> + <bitfield name="LINE_NOTCH_ADDR" low="19" high="31" type="uint"/> + <bitfield name="RESERVED_0" low="14" high="18" type="uint"/> + <bitfield name="POOLING_METHOD" pos="13" type="uint"/> + <bitfield name="UNPOOLING_EN" pos="12" type="uint"/> + <bitfield name="KERNEL_STRIDE_HEIGHT" low="9" high="11" type="uint"/> + <bitfield name="KERNEL_STRIDE_WIDTH" low="6" high="8" type="uint"/> + <bitfield name="KERNEL_HEIGHT" low="3" high="5" type="uint"/> + <bitfield name="KERNEL_WIDTH" low="0" high="2" type="uint"/> + </reg32> + <reg32 offset="0x504C" name="RDMA_SURF_NOTCH"> + <bitfield name="SURF_NOTCH_ADDR" low="4" high="31" type="uint"/> + <bitfield name="RESERVED_0" low="0" high="3" type="uint"/> + </reg32> + <reg32 offset="0x5064" name="RDMA_PAD_CFG"> + <bitfield name="PAD_VALUE" low="16" high="31" type="uint"/> + <bitfield name="RESERVED_0" low="7" high="15" type="uint"/> + <bitfield name="PAD_TOP" low="4" high="6" type="uint"/> + <bitfield name="RESERVED_1" pos="3" type="uint"/> + <bitfield name="PAD_LEFT" low="0" high="2" type="uint"/> + </reg32> + <reg32 offset="0x5068" name="RDMA_WEIGHT"> + <bitfield name="E_WEIGHT" low="24" high="31" type="uint"/> + <bitfield name="N_WEIGHT" low="16" 
high="23" type="uint"/> + <bitfield name="B_WEIGHT" low="8" high="15" type="uint"/> + <bitfield name="M_WEIGHT" low="0" high="7" type="uint"/> + </reg32> + <reg32 offset="0x506C" name="RDMA_EW_SURF_NOTCH"> + <bitfield name="EW_SURF_NOTCH" low="4" high="31" type="uint"/> + <bitfield name="RESERVED_0" low="0" high="3" type="uint"/> + </reg32> +</domain> +<domain name="PPU" width="32"> + <reg32 offset="0x6000" name="S_STATUS"> + <bitfield name="RESERVED_0" low="18" high="31" type="uint"/> + <bitfield name="STATUS_1" low="16" high="17" type="uint"/> + <bitfield name="RESERVED_1" low="2" high="15" type="uint"/> + <bitfield name="STATUS_0" low="0" high="1" type="uint"/> + </reg32> + <reg32 offset="0x6004" name="S_POINTER"> + <bitfield name="RESERVED_0" low="17" high="31" type="uint"/> + <bitfield name="EXECUTER" pos="16" type="uint"/> + <bitfield name="RESERVED_1" low="6" high="15" type="uint"/> + <bitfield name="EXECUTER_PP_CLEAR" pos="5" type="uint"/> + <bitfield name="POINTER_PP_CLEAR" pos="4" type="uint"/> + <bitfield name="POINTER_PP_MODE" pos="3" type="uint"/> + <bitfield name="EXECUTER_PP_EN" pos="2" type="uint"/> + <bitfield name="POINTER_PP_EN" pos="1" type="uint"/> + <bitfield name="POINTER" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x6008" name="OPERATION_ENABLE"> + <bitfield name="RESERVED_0" low="1" high="31" type="uint"/> + <bitfield name="OP_EN" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x600C" name="DATA_CUBE_IN_WIDTH"> + <bitfield name="RESERVED_0" low="13" high="31" type="uint"/> + <bitfield name="CUBE_IN_WIDTH" low="0" high="12" type="uint"/> + </reg32> + <reg32 offset="0x6010" name="DATA_CUBE_IN_HEIGHT"> + <bitfield name="RESERVED_0" low="13" high="31" type="uint"/> + <bitfield name="CUBE_IN_HEIGHT" low="0" high="12" type="uint"/> + </reg32> + <reg32 offset="0x6014" name="DATA_CUBE_IN_CHANNEL"> + <bitfield name="RESERVED_0" low="13" high="31" type="uint"/> + <bitfield name="CUBE_IN_CHANNEL" low="0" high="12" type="uint"/> + </reg32> + 
<reg32 offset="0x6018" name="DATA_CUBE_OUT_WIDTH"> + <bitfield name="RESERVED_0" low="13" high="31" type="uint"/> + <bitfield name="CUBE_OUT_WIDTH" low="0" high="12" type="uint"/> + </reg32> + <reg32 offset="0x601C" name="DATA_CUBE_OUT_HEIGHT"> + <bitfield name="RESERVED_0" low="13" high="31" type="uint"/> + <bitfield name="CUBE_OUT_HEIGHT" low="0" high="12" type="uint"/> + </reg32> + <reg32 offset="0x6020" name="DATA_CUBE_OUT_CHANNEL"> + <bitfield name="RESERVED_0" low="13" high="31" type="uint"/> + <bitfield name="CUBE_OUT_CHANNEL" low="0" high="12" type="uint"/> + </reg32> + <reg32 offset="0x6024" name="OPERATION_MODE_CFG"> + <bitfield name="RESERVED_0" pos="31" type="uint"/> + <bitfield name="INDEX_EN" pos="30" type="uint"/> + <bitfield name="RESERVED_1" pos="29" type="uint"/> + <bitfield name="NOTCH_ADDR" low="16" high="28" type="uint"/> + <bitfield name="RESERVED_2" low="8" high="15" type="uint"/> + <bitfield name="USE_CNT" low="5" high="7" type="uint"/> + <bitfield name="FLYING_MODE" pos="4" type="uint"/> + <bitfield name="RESERVED_3" low="2" high="3" type="uint"/> + <bitfield name="POOLING_METHOD" low="0" high="1" type="uint"/> + </reg32> + <reg32 offset="0x6034" name="POOLING_KERNEL_CFG"> + <bitfield name="RESERVED_0" low="24" high="31" type="uint"/> + <bitfield name="KERNEL_STRIDE_HEIGHT" low="20" high="23" type="uint"/> + <bitfield name="KERNEL_STRIDE_WIDTH" low="16" high="19" type="uint"/> + <bitfield name="RESERVED_1" low="12" high="15" type="uint"/> + <bitfield name="KERNEL_HEIGHT" low="8" high="11" type="uint"/> + <bitfield name="RESERVED_2" low="4" high="7" type="uint"/> + <bitfield name="KERNEL_WIDTH" low="0" high="3" type="uint"/> + </reg32> + <reg32 offset="0x6038" name="RECIP_KERNEL_WIDTH"> + <bitfield name="RESERVED_0" low="17" high="31" type="uint"/> + <bitfield name="RECIP_KERNEL_WIDTH" low="0" high="16" type="uint"/> + </reg32> + <reg32 offset="0x603C" name="RECIP_KERNEL_HEIGHT"> + <bitfield name="RESERVED_0" low="17" high="31" type="uint"/> 
+ <bitfield name="RECIP_KERNEL_HEIGHT" low="0" high="16" type="uint"/> + </reg32> + <reg32 offset="0x6040" name="POOLING_PADDING_CFG"> + <bitfield name="RESERVED_0" low="15" high="31" type="uint"/> + <bitfield name="PAD_BOTTOM" low="12" high="14" type="uint"/> + <bitfield name="RESERVED_1" pos="11" type="uint"/> + <bitfield name="PAD_RIGHT" low="8" high="10" type="uint"/> + <bitfield name="RESERVED_2" pos="7" type="uint"/> + <bitfield name="PAD_TOP" low="4" high="6" type="uint"/> + <bitfield name="RESERVED_3" pos="3" type="uint"/> + <bitfield name="PAD_LEFT" low="0" high="2" type="uint"/> + </reg32> + <reg32 offset="0x6044" name="PADDING_VALUE_1_CFG"> + <bitfield name="PAD_VALUE_0" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x6048" name="PADDING_VALUE_2_CFG"> + <bitfield name="RESERVED_0" low="3" high="31" type="uint"/> + <bitfield name="PAD_VALUE_1" low="0" high="2" type="uint"/> + </reg32> + <reg32 offset="0x6070" name="DST_BASE_ADDR"> + <bitfield name="DST_BASE_ADDR" low="4" high="31" type="uint"/> + <bitfield name="RESERVED_0" low="0" high="3" type="uint"/> + </reg32> + <reg32 offset="0x607C" name="DST_SURF_STRIDE"> + <bitfield name="DST_SURF_STRIDE" low="4" high="31" type="uint"/> + <bitfield name="RESERVED_0" low="0" high="3" type="uint"/> + </reg32> + <reg32 offset="0x6084" name="DATA_FORMAT"> + <bitfield name="INDEX_ADD" low="4" high="31" type="uint"/> + <bitfield name="DPU_FLYIN" pos="3" type="uint"/> + <bitfield name="PROC_PRECISION" low="0" high="2" type="uint"/> + </reg32> + <reg32 offset="0x60DC" name="MISC_CTRL"> + <bitfield name="SURF_LEN" low="16" high="31" type="uint"/> + <bitfield name="RESERVED_0" low="9" high="15" type="uint"/> + <bitfield name="MC_SURF_OUT" pos="8" type="uint"/> + <bitfield name="NONALIGN" pos="7" type="uint"/> + <bitfield name="RESERVED_1" low="4" high="6" type="uint"/> + <bitfield name="BURST_LEN" low="0" high="3" type="uint"/> + </reg32> +</domain> +<domain name="PPU_RDMA" width="32"> + <reg32 
offset="0x7000" name="RDMA_S_STATUS"> + <bitfield name="RESERVED_0" low="18" high="31" type="uint"/> + <bitfield name="STATUS_1" low="16" high="17" type="uint"/> + <bitfield name="RESERVED_1" low="2" high="15" type="uint"/> + <bitfield name="STATUS_0" low="0" high="1" type="uint"/> + </reg32> + <reg32 offset="0x7004" name="RDMA_S_POINTER"> + <bitfield name="RESERVED_0" low="17" high="31" type="uint"/> + <bitfield name="EXECUTER" pos="16" type="uint"/> + <bitfield name="RESERVED_1" low="6" high="15" type="uint"/> + <bitfield name="EXECUTER_PP_CLEAR" pos="5" type="uint"/> + <bitfield name="POINTER_PP_CLEAR" pos="4" type="uint"/> + <bitfield name="POINTER_PP_MODE" pos="3" type="uint"/> + <bitfield name="EXECUTER_PP_EN" pos="2" type="uint"/> + <bitfield name="POINTER_PP_EN" pos="1" type="uint"/> + <bitfield name="POINTER" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x7008" name="RDMA_OPERATION_ENABLE"> + <bitfield name="RESERVED_0" low="1" high="31" type="uint"/> + <bitfield name="OP_EN" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x700C" name="RDMA_CUBE_IN_WIDTH"> + <bitfield name="RESERVED_0" low="13" high="31" type="uint"/> + <bitfield name="CUBE_IN_WIDTH" low="0" high="12" type="uint"/> + </reg32> + <reg32 offset="0x7010" name="RDMA_CUBE_IN_HEIGHT"> + <bitfield name="RESERVED_0" low="13" high="31" type="uint"/> + <bitfield name="CUBE_IN_HEIGHT" low="0" high="12" type="uint"/> + </reg32> + <reg32 offset="0x7014" name="RDMA_CUBE_IN_CHANNEL"> + <bitfield name="RESERVED_0" low="13" high="31" type="uint"/> + <bitfield name="CUBE_IN_CHANNEL" low="0" high="12" type="uint"/> + </reg32> + <reg32 offset="0x701C" name="RDMA_SRC_BASE_ADDR"> + <bitfield name="SRC_BASE_ADDR" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x7024" name="RDMA_SRC_LINE_STRIDE"> + <bitfield name="SRC_LINE_STRIDE" low="4" high="31" type="uint"/> + <bitfield name="RESERVED_0" low="0" high="3" type="uint"/> + </reg32> + <reg32 offset="0x7028" name="RDMA_SRC_SURF_STRIDE"> + 
<bitfield name="SRC_SURF_STRIDE" low="4" high="31" type="uint"/> + <bitfield name="RESERVED_0" low="0" high="3" type="uint"/> + </reg32> + <reg32 offset="0x7030" name="RDMA_DATA_FORMAT"> + <bitfield name="RESERVED_0" low="2" high="31" type="uint"/> + <bitfield name="IN_PRECISION" low="0" high="1" type="uint"/> + </reg32> +</domain> +<domain name="DDMA" width="32"> + <reg32 offset="0x8000" name="CFG_OUTSTANDING"> + <bitfield name="RESERVED_0" low="16" high="31" type="uint"/> + <bitfield name="WR_OS_CNT" low="8" high="15" type="uint"/> + <bitfield name="RD_OS_CNT" low="0" high="7" type="uint"/> + </reg32> + <reg32 offset="0x8004" name="RD_WEIGHT_0"> + <bitfield name="RD_WEIGHT_PDP" low="24" high="31" type="uint"/> + <bitfield name="RD_WEIGHT_DPU" low="16" high="23" type="uint"/> + <bitfield name="RD_WEIGHT_KERNEL" low="8" high="15" type="uint"/> + <bitfield name="RD_WEIGHT_FEATURE" low="0" high="7" type="uint"/> + </reg32> + <reg32 offset="0x8008" name="WR_WEIGHT_0"> + <bitfield name="RESERVED_0" low="16" high="31" type="uint"/> + <bitfield name="WR_WEIGHT_PDP" low="8" high="15" type="uint"/> + <bitfield name="WR_WEIGHT_DPU" low="0" high="7" type="uint"/> + </reg32> + <reg32 offset="0x800C" name="CFG_ID_ERROR"> + <bitfield name="RESERVED_0" low="10" high="31" type="uint"/> + <bitfield name="WR_RESP_ID" low="6" high="9" type="uint"/> + <bitfield name="RESERVED_1" pos="5" type="uint"/> + <bitfield name="RD_RESP_ID" low="0" high="4" type="uint"/> + </reg32> + <reg32 offset="0x8010" name="RD_WEIGHT_1"> + <bitfield name="RESERVED_0" low="8" high="31" type="uint"/> + <bitfield name="RD_WEIGHT_PC" low="0" high="7" type="uint"/> + </reg32> + <reg32 offset="0x8014" name="CFG_DMA_FIFO_CLR"> + <bitfield name="RESERVED_0" low="1" high="31" type="uint"/> + <bitfield name="DMA_FIFO_CLR" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x8018" name="CFG_DMA_ARB"> + <bitfield name="RESERVED_0" low="10" high="31" type="uint"/> + <bitfield name="WR_ARBIT_MODEL" pos="9" type="uint"/> 
+ <bitfield name="RD_ARBIT_MODEL" pos="8" type="uint"/> + <bitfield name="RESERVED_1" pos="7" type="uint"/> + <bitfield name="WR_FIX_ARB" low="4" high="6" type="uint"/> + <bitfield name="RESERVED_2" pos="3" type="uint"/> + <bitfield name="RD_FIX_ARB" low="0" high="2" type="uint"/> + </reg32> + <reg32 offset="0x8020" name="CFG_DMA_RD_QOS"> + <bitfield name="RESERVED_0" low="10" high="31" type="uint"/> + <bitfield name="RD_PC_QOS" low="8" high="9" type="uint"/> + <bitfield name="RD_PPU_QOS" low="6" high="7" type="uint"/> + <bitfield name="RD_DPU_QOS" low="4" high="5" type="uint"/> + <bitfield name="RD_KERNEL_QOS" low="2" high="3" type="uint"/> + <bitfield name="RD_FEATURE_QOS" low="0" high="1" type="uint"/> + </reg32> + <reg32 offset="0x8024" name="CFG_DMA_RD_CFG"> + <bitfield name="RESERVED_0" low="13" high="31" type="uint"/> + <bitfield name="RD_ARLOCK" pos="12" type="uint"/> + <bitfield name="RD_ARCACHE" low="8" high="11" type="uint"/> + <bitfield name="RD_ARPROT" low="5" high="7" type="uint"/> + <bitfield name="RD_ARBURST" low="3" high="4" type="uint"/> + <bitfield name="RD_ARSIZE" low="0" high="2" type="uint"/> + </reg32> + <reg32 offset="0x8028" name="CFG_DMA_WR_CFG"> + <bitfield name="RESERVED_0" low="13" high="31" type="uint"/> + <bitfield name="WR_AWLOCK" pos="12" type="uint"/> + <bitfield name="WR_AWCACHE" low="8" high="11" type="uint"/> + <bitfield name="WR_AWPROT" low="5" high="7" type="uint"/> + <bitfield name="WR_AWBURST" low="3" high="4" type="uint"/> + <bitfield name="WR_AWSIZE" low="0" high="2" type="uint"/> + </reg32> + <reg32 offset="0x802C" name="CFG_DMA_WSTRB"> + <bitfield name="WR_WSTRB" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x8030" name="CFG_STATUS"> + <bitfield name="RESERVED_0" low="9" high="31" type="uint"/> + <bitfield name="IDEL" pos="8" type="uint"/> + <bitfield name="RESERVED_1" low="0" high="7" type="uint"/> + </reg32> +</domain> +<domain name="SDMA" width="32"> + <reg32 offset="0x9000" name="CFG_OUTSTANDING"> + 
<bitfield name="RESERVED_0" low="16" high="31" type="uint"/> + <bitfield name="WR_OS_CNT" low="8" high="15" type="uint"/> + <bitfield name="RD_OS_CNT" low="0" high="7" type="uint"/> + </reg32> + <reg32 offset="0x9004" name="RD_WEIGHT_0"> + <bitfield name="RD_WEIGHT_PDP" low="24" high="31" type="uint"/> + <bitfield name="RD_WEIGHT_DPU" low="16" high="23" type="uint"/> + <bitfield name="RD_WEIGHT_KERNEL" low="8" high="15" type="uint"/> + <bitfield name="RD_WEIGHT_FEATURE" low="0" high="7" type="uint"/> + </reg32> + <reg32 offset="0x9008" name="WR_WEIGHT_0"> + <bitfield name="RESERVED_0" low="16" high="31" type="uint"/> + <bitfield name="WR_WEIGHT_PDP" low="8" high="15" type="uint"/> + <bitfield name="WR_WEIGHT_DPU" low="0" high="7" type="uint"/> + </reg32> + <reg32 offset="0x900C" name="CFG_ID_ERROR"> + <bitfield name="RESERVED_0" low="10" high="31" type="uint"/> + <bitfield name="WR_RESP_ID" low="6" high="9" type="uint"/> + <bitfield name="RESERVED_1" pos="5" type="uint"/> + <bitfield name="RD_RESP_ID" low="0" high="4" type="uint"/> + </reg32> + <reg32 offset="0x9010" name="RD_WEIGHT_1"> + <bitfield name="RESERVED_0" low="8" high="31" type="uint"/> + <bitfield name="RD_WEIGHT_PC" low="0" high="7" type="uint"/> + </reg32> + <reg32 offset="0x9014" name="CFG_DMA_FIFO_CLR"> + <bitfield name="RESERVED_0" low="1" high="31" type="uint"/> + <bitfield name="DMA_FIFO_CLR" pos="0" type="uint"/> + </reg32> + <reg32 offset="0x9018" name="CFG_DMA_ARB"> + <bitfield name="RESERVED_0" low="10" high="31" type="uint"/> + <bitfield name="WR_ARBIT_MODEL" pos="9" type="uint"/> + <bitfield name="RD_ARBIT_MODEL" pos="8" type="uint"/> + <bitfield name="RESERVED_1" pos="7" type="uint"/> + <bitfield name="WR_FIX_ARB" low="4" high="6" type="uint"/> + <bitfield name="RESERVED_2" pos="3" type="uint"/> + <bitfield name="RD_FIX_ARB" low="0" high="2" type="uint"/> + </reg32> + <reg32 offset="0x9020" name="CFG_DMA_RD_QOS"> + <bitfield name="RESERVED_0" low="10" high="31" type="uint"/> + <bitfield 
name="RD_PC_QOS" low="8" high="9" type="uint"/> + <bitfield name="RD_PPU_QOS" low="6" high="7" type="uint"/> + <bitfield name="RD_DPU_QOS" low="4" high="5" type="uint"/> + <bitfield name="RD_KERNEL_QOS" low="2" high="3" type="uint"/> + <bitfield name="RD_FEATURE_QOS" low="0" high="1" type="uint"/> + </reg32> + <reg32 offset="0x9024" name="CFG_DMA_RD_CFG"> + <bitfield name="RESERVED_0" low="13" high="31" type="uint"/> + <bitfield name="RD_ARLOCK" pos="12" type="uint"/> + <bitfield name="RD_ARCACHE" low="8" high="11" type="uint"/> + <bitfield name="RD_ARPROT" low="5" high="7" type="uint"/> + <bitfield name="RD_ARBURST" low="3" high="4" type="uint"/> + <bitfield name="RD_ARSIZE" low="0" high="2" type="uint"/> + </reg32> + <reg32 offset="0x9028" name="CFG_DMA_WR_CFG"> + <bitfield name="RESERVED_0" low="13" high="31" type="uint"/> + <bitfield name="WR_AWLOCK" pos="12" type="uint"/> + <bitfield name="WR_AWCACHE" low="8" high="11" type="uint"/> + <bitfield name="WR_AWPROT" low="5" high="7" type="uint"/> + <bitfield name="WR_AWBURST" low="3" high="4" type="uint"/> + <bitfield name="WR_AWSIZE" low="0" high="2" type="uint"/> + </reg32> + <reg32 offset="0x902C" name="CFG_DMA_WSTRB"> + <bitfield name="WR_WSTRB" low="0" high="31" type="uint"/> + </reg32> + <reg32 offset="0x9030" name="CFG_STATUS"> + <bitfield name="RESERVED_0" low="9" high="31" type="uint"/> + <bitfield name="IDEL" pos="8" type="uint"/> + <bitfield name="RESERVED_1" low="0" high="7" type="uint"/> + </reg32> +</domain> +<domain name="GLOBAL" width="32"> + <reg32 offset="0xF008" name="OPERATION_ENABLE"> + <bitfield name="RESERVED_0" low="7" high="31" type="uint"/> + <bitfield name="PPU_RDMA_OP_EN" pos="6" type="uint"/> + <bitfield name="PPU_OP_EN" pos="5" type="uint"/> + <bitfield name="DPU_RDMA_OP_EN" pos="4" type="uint"/> + <bitfield name="DPU_OP_EN" pos="3" type="uint"/> + <bitfield name="CORE_OP_EN" pos="2" type="uint"/> + <bitfield name="RESERVED_1" pos="1" type="uint"/> + <bitfield name="CNA_OP_EN" pos="0" 
type="uint"/> + </reg32> +</domain> + +</database> diff --git a/src/gallium/drivers/rocket/rkt_coefs.c b/src/gallium/drivers/rocket/rkt_coefs.c new file mode 100644 index 00000000000..82258e70aaa --- /dev/null +++ b/src/gallium/drivers/rocket/rkt_coefs.c @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net> + * SPDX-License-Identifier: MIT + */ + +#include "util/u_inlines.h" + +#include "rkt_coefs.h" +#include "rkt_ml.h" + /* rkt_fill_weights: builds a new buffer holding the convolution weights of poperation in a rearranged order and returns it. The source weight tensor is mapped for reading and indexed as [output channel][x][y][input channel]; the destination is filled sequentially (index n) in the loop order oc1, ic1, x, y, oc2, ic2. Every copied weight has 0x80 subtracted. Slots whose input channel is past the real input channel count receive zero_point - 0x80, and only when ic2 < 16 or input_channels_real % 32 > 16; slots whose output channel is past the real output channel count receive 0. Both mappings are unmapped before returning. NOTE(review): the results of pipe_buffer_map and pipe_buffer_create are used without a NULL check - confirm this is acceptable for callers. */ +struct pipe_resource * +rkt_fill_weights(struct rkt_ml_subgraph *subgraph, + const struct pipe_ml_operation *poperation) +{ + struct pipe_context *pcontext = subgraph->base.context; + unsigned weights_width = poperation->conv.weight_tensor->dims[1]; + unsigned weights_height = poperation->conv.weight_tensor->dims[2]; + unsigned input_channels = poperation->input_tensors[0]->dims[3]; + unsigned input_channels_real = poperation->input_tensors[0]->dims[3]; + unsigned output_channels = poperation->output_tensors[0]->dims[3]; + unsigned output_channels_real = poperation->output_tensors[0]->dims[3]; + unsigned weights_size; + uint8_t zero_point = poperation->conv.weight_tensor->zero_point; + struct pipe_transfer *transfer_in, *transfer_out; + void *map = + pipe_buffer_map(pcontext, poperation->conv.weight_tensor->resource, + PIPE_MAP_READ, &transfer_in); + uint8_t(*weights_in)[weights_width][weights_height][input_channels] = map; + struct pipe_resource *rsc; + uint8_t *weights_out; + /* From here on input_channels / output_channels are the adjusted counts used for sizing and iteration; the *_real variables keep the tensor dimensions, and the weights_in array type above was declared with the unadjusted input channel count. */ + input_channels = MAX2(input_channels, FEATURE_ATOMIC_SIZE); + + output_channels = ALIGN(output_channels, 2); + if (rkt_is_depthwise(poperation)) + output_channels = 1; /* depthwise: the output channel loops below run over a single channel */ + + weights_size = weights_width * weights_height * output_channels * + ALIGN(input_channels, WEIGHT_ATOMIC_SIZE) * 2; + + rsc = + pipe_buffer_create(pcontext->screen, 0, PIPE_USAGE_DEFAULT, weights_size); + weights_out = pipe_buffer_map(pcontext, rsc, PIPE_MAP_WRITE, &transfer_out); + + unsigned input_channel_groups = WEIGHT_ATOMIC_SIZE; + if (rkt_is_depthwise(poperation)) + 
input_channel_groups *= 2; /* depthwise operations use input channel groups twice as large */ + + unsigned input_channels_1 = + DIV_ROUND_UP(input_channels, input_channel_groups); + unsigned input_channels_2 = MIN2(input_channels, input_channel_groups); + + unsigned n = 0; /* n is the next write position in weights_out */ + for (int oc1 = 0; oc1 < DIV_ROUND_UP(output_channels, WEIGHT_ATOMIC_SIZE); + oc1++) { + for (int ic1 = 0; ic1 < input_channels_1; ic1++) { + for (int x = 0; x < weights_width; x++) { + for (int y = 0; y < weights_height; y++) { + for (int oc2 = 0; oc2 < MIN2(output_channels, WEIGHT_ATOMIC_SIZE); + oc2++) { + for (int ic2 = 0; ic2 < input_channels_2; ic2++) { + unsigned oc = oc1 * WEIGHT_ATOMIC_SIZE + oc2; + unsigned ic = ic1 * input_channel_groups + ic2; + if (output_channels_real > 2 && + oc >= ALIGN(output_channels_real, 2)) + continue; /* nothing is written for these output channels, so n does not advance */ + + if (oc >= output_channels_real) + weights_out[n++] = 0x0; + else if (ic >= input_channels_real) { + if (ic2 < 16 || (input_channels_real % 32) > 16) + weights_out[n++] = + zero_point - 0x80; /* TODO: Why is the blob converting to + signed? It should be unsigned. */ + } else + weights_out[n++] = weights_in[oc][x][y][ic] - + 0x80; /* TODO: Why is the blob converting to + signed? It should be unsigned. 
*/ + } + } + } + } + } + } + + if (DBG_ENABLED(ROCKET_DBG_DUMP_BOS)) { + static int task = 0; + rkt_dump_buffer(weights_out, "weights", 0, task++, 0, weights_size); + } + + pipe_buffer_unmap(pcontext, transfer_out); + + pipe_buffer_unmap(pcontext, transfer_in); + + return rsc; +} + /* calculate_bias_correction: returns the sum, over every weight that feeds output channel oc, of (weight - weight_zero_point) * (input_zero_point - 0x80). map is the mapped weight tensor, viewed as [..][weights_width][weights_height][input_channels]. For depthwise operations the sum runs over x and y only and oc indexes the last dimension of entry 0; otherwise it runs over x, y and all input channels of entry oc. subgraph is not used. Both zero points are held in unsigned variables, so each product is evaluated in unsigned arithmetic before being added to the int32_t accumulator. */ +static int32_t +calculate_bias_correction(struct rkt_ml_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + unsigned oc, void *map) +{ + unsigned input_channels = poperation->input_tensors[0]->dims[3]; + unsigned input_zero_point = poperation->input_tensors[0]->zero_point; + unsigned weights_width = poperation->conv.weight_tensor->dims[1]; + unsigned weights_height = poperation->conv.weight_tensor->dims[2]; + unsigned weight_zero_point = poperation->conv.weight_tensor->zero_point; + uint8_t(*weights)[weights_width][weights_height][input_channels] = map; + + int32_t correction = 0; + if (rkt_is_depthwise(poperation)) { + for (unsigned x = 0; x < weights_width; x++) { + for (unsigned y = 0; y < weights_height; y++) { + correction += (weights[0][x][y][oc] - weight_zero_point) * + (input_zero_point - 0x80); + } + } + } else { + for (unsigned x = 0; x < weights_width; x++) { + for (unsigned y = 0; y < weights_height; y++) { + for (unsigned ic = 0; ic < input_channels; ic++) { + correction += (weights[oc][x][y][ic] - weight_zero_point) * + (input_zero_point - 0x80); + } + } + } + } + + return correction; +} + +struct pipe_resource * +rkt_fill_biases(struct rkt_ml_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + unsigned *truncate_bits) +{ + struct pipe_context *pcontext = subgraph->base.context; + unsigned output_channels = poperation->output_tensors[0]->dims[3]; + unsigned weights_size = poperation->conv.weight_tensor->dims[1]; + struct pipe_transfer *transfer_in, *transfer_out, *transfer_weights; + int32_t *biases_in = + pipe_buffer_map(pcontext, poperation->conv.bias_tensor->resource, + PIPE_MAP_READ, &transfer_in); + void *weights = + 
pipe_buffer_map(pcontext, poperation->conv.weight_tensor->resource, + PIPE_MAP_READ, &transfer_weights); + struct pipe_resource *rsc; + uint32_t *biases; + + rsc = pipe_buffer_create(pcontext->screen, 0, PIPE_USAGE_DEFAULT, + output_channels * sizeof(uint32_t)); + biases = pipe_buffer_map(pcontext, rsc, PIPE_MAP_WRITE, &transfer_out); + + // DBG("weight_scale %x\n", + // fui(poperation->conv.weight_tensor->scale)); + /* TODO: Figure out when exactly we need to truncate */ + /* From + * http://nvdla.org/hw/v1/ias/unit_description.html#convolution-accumulator : + * + * The final result of accumulator in CACC is 48bits for INT16 and 34bits for + * INT8. The bit width between CACC and SDP is 32. For precisions INT8 and + * INT16, there is a round and saturation operation before sending the result + * to SDP. The precision of rounding is configured by field CLIP_TRUNCATE in + * register D_CLIP_CFG. For FP16, the value is just converted from FP48 to + * FP32. + */ + if (fui(poperation->conv.weight_tensor->scale) == 0x3a88323f || + fui(poperation->conv.weight_tensor->scale) == 0x3c0060de || + fui(poperation->conv.weight_tensor->scale) == 0x3c06022d || + fui(poperation->conv.weight_tensor->scale) == 0x3c1642e3 || + fui(poperation->conv.weight_tensor->scale) == 0x3c1e3f51 || + fui(poperation->conv.weight_tensor->scale) == 0x3c5c8aa8 || + fui(poperation->conv.weight_tensor->scale) == 0x3c615e93 || + fui(poperation->conv.weight_tensor->scale) == 0x3c7326a2 || + fui(poperation->conv.weight_tensor->scale) == 0x3c783013 || + fui(poperation->conv.weight_tensor->scale) == 0x3d1748e6 || + fui(poperation->conv.weight_tensor->scale) == 0x3d282992 || + fui(poperation->conv.weight_tensor->scale) == 0x3d2e87ae || + fui(poperation->conv.weight_tensor->scale) == 0x3d77f5f6 || + fui(poperation->conv.weight_tensor->scale) == 0x3a9a5956 || + fui(poperation->conv.weight_tensor->scale) == 0x3caebc56) + *truncate_bits = 1; + else + *truncate_bits = 0; + + int32_t max_bias = 0; + int32_t 
max_corr = 0; + unsigned max_num_bits = 0; + bool retry = true; + while (retry) { + for (int oc = 0; oc < output_channels; oc++) { + int32_t corr = + calculate_bias_correction(subgraph, poperation, oc, weights); + biases[oc] = (biases_in[oc] - corr) / (1 << *truncate_bits); + + int64_t max_val = + (biases_in[oc] - corr + 255 * 255 * weights_size * weights_size) / + (1 << *truncate_bits); + unsigned num_bits = ceil(log(abs((int32_t)max_val)) / log(2)) + 1; + max_bias = MAX2(max_bias, biases[oc]); + max_corr = MAX2(max_corr, corr); + max_num_bits = MAX2(max_num_bits, num_bits); + + /* TODO: This doesn't actually work, num_bits doesn't go above 19, and the + * blob sometimes truncates way below */ + if (num_bits > 32) { + (*truncate_bits)++; + retry = true; + } else + retry = false; + } + } + + if (DBG_ENABLED(ROCKET_DBG_DUMP_BOS)) { + static int task = 0; + rkt_dump_buffer((uint8_t *)biases, "biases", 0, task++, 0, + output_channels * sizeof(uint32_t)); + } + + pipe_buffer_unmap(pcontext, transfer_out); + + pipe_buffer_unmap(pcontext, transfer_weights); + + pipe_buffer_unmap(pcontext, transfer_in); + + return rsc; +} diff --git a/src/gallium/drivers/rocket/rkt_coefs.h b/src/gallium/drivers/rocket/rkt_coefs.h new file mode 100644 index 00000000000..d670cecfe3d --- /dev/null +++ b/src/gallium/drivers/rocket/rkt_coefs.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net> + * SPDX-License-Identifier: MIT + */ + +#ifndef RKT_COEFS_H +#define RKT_COEFS_H + +#include "rkt_ml.h" + +struct pipe_resource * +rkt_fill_weights(struct rkt_ml_subgraph *subgraph, + const struct pipe_ml_operation *poperation); + +struct pipe_resource * +rkt_fill_biases(struct rkt_ml_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + unsigned *truncate_bits); + +#endif /* RKT_COEFS_H */
\ No newline at end of file
diff --git a/src/gallium/drivers/rocket/rkt_device.c b/src/gallium/drivers/rocket/rkt_device.c
new file mode 100644
index 00000000000..9c2da6a2cd2
--- /dev/null
+++ b/src/gallium/drivers/rocket/rkt_device.c
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "rkt_device.h"
+#include "rkt_ml.h"
+
+#include "drm-uapi/rocket_accel.h"
+
+#include <xf86drm.h>
+#include "util/os_mman.h"
+#include "util/ralloc.h"
+#include "util/u_inlines.h"
+#include "util/u_surface.h"
+#include "util/u_transfer.h"
+
+static const struct debug_named_value rocket_debug_options[] = {
+   {"dbg_msgs", ROCKET_DBG_MSGS, "Print debug messages"},
+   {"dump_bos", ROCKET_DBG_DUMP_BOS, "Dump buffers for analysis"},
+   {"zero_bos", ROCKET_DBG_ZERO, "Zero buffers for debugging"},
+   DEBUG_NAMED_VALUE_END};
+
+DEBUG_GET_ONCE_FLAGS_OPTION(rocket_debug, "ROCKET_DEBUG", rocket_debug_options, 0)
+int rocket_debug = 0;
+
+/* Transfer that remembers its CPU mapping so that unmap can release it. */
+struct rkt_transfer {
+   struct pipe_transfer base;
+
+   void *map;
+   size_t map_size;
+};
+
+static void
+rkt_destroy_screen(struct pipe_screen *pscreen)
+{
+   struct rkt_screen *screen = rkt_screen(pscreen);
+
+   if (screen->ro)
+      screen->ro->destroy(screen->ro);
+
+   ralloc_free(screen);
+}
+
+static void
+rkt_destroy_context(struct pipe_context *pctx)
+{
+   struct rkt_context *ctx = rkt_context(pctx);
+
+   ralloc_free(ctx);
+}
+
+/*
+ * Map a buffer for CPU access.
+ *
+ * Issues DRM_IOCTL_ROCKET_PREP_BO on the buffer and then maps the whole of it;
+ * the returned pointer is offset by box->x. Returns NULL on failure, in which
+ * case *out_transfer is not written.
+ */
+static void *
+rkt_buffer_map(struct pipe_context *pctx,
+               struct pipe_resource *prsc, unsigned level,
+               unsigned usage, const struct pipe_box *box,
+               struct pipe_transfer **out_transfer)
+{
+   struct rkt_screen *screen = rkt_screen(pctx->screen);
+   struct rkt_resource *rsc = rkt_resource(prsc);
+   struct drm_rocket_prep_bo arg = {0};
+
+   assert(level == 0);
+   assert(prsc->target == PIPE_BUFFER);
+   assert(box->y == 0);
+   assert(box->z == 0);
+   assert(box->height == 1);
+   assert(box->depth == 1);
+
+   struct rkt_transfer *transfer = rzalloc(NULL, struct rkt_transfer);
+   if (!transfer)
+      return NULL;
+
+   transfer->base.level = level;
+   transfer->base.usage = usage;
+   transfer->base.box = *box;
+
+   pipe_resource_reference(&transfer->base.resource, prsc);
+
+   arg.handle = rsc->handle;
+   arg.timeout_ns = INT64_MAX;
+
+   if (drmIoctl(screen->fd, DRM_IOCTL_ROCKET_PREP_BO, &arg) == -1)
+      goto fail;
+
+   transfer->map_size = prsc->width0;
+   transfer->map = os_mmap(NULL, transfer->map_size, PROT_READ | PROT_WRITE,
+                           MAP_SHARED, screen->fd, rsc->fake_offset);
+   if (transfer->map == MAP_FAILED)
+      goto fail;
+
+   *out_transfer = &transfer->base;
+
+   return (uint8_t *)transfer->map + box->x;
+
+fail:
+   pipe_resource_reference(&transfer->base.resource, NULL);
+   ralloc_free(transfer);
+   return NULL;
+}
+
+/*
+ * Release a mapping created by rkt_buffer_map(). Buffers that were mapped for
+ * writing get DRM_IOCTL_ROCKET_FINI_BO issued on them first.
+ */
+static void
+rkt_buffer_unmap(struct pipe_context *pctx,
+                 struct pipe_transfer *ptransfer)
+{
+   struct rkt_screen *screen = rkt_screen(pctx->screen);
+   struct rkt_transfer *transfer = (struct rkt_transfer *)ptransfer;
+   struct rkt_resource *rsrc = rkt_resource(ptransfer->resource);
+   struct drm_rocket_fini_bo arg = {0};
+   int ret;
+
+   arg.handle = rsrc->handle;
+
+   /* usage is a bitmask, writes usually come with other flags set (such as
+    * PIPE_MAP_DISCARD_RANGE), so test the bit rather than the whole value. */
+   if (ptransfer->usage & PIPE_MAP_WRITE) {
+      ret = drmIoctl(screen->fd, DRM_IOCTL_ROCKET_FINI_BO, &arg);
+      assert(ret >= 0);
+      (void)ret;
+   }
+
+   os_munmap(transfer->map, transfer->map_size);
+
+   pipe_resource_reference(&ptransfer->resource, NULL);
+   ralloc_free(transfer);
+}
+
+static struct pipe_context *
+rkt_create_context(struct pipe_screen *screen,
+                   void *priv, unsigned flags)
+{
+   struct rkt_context *ctx = rzalloc(NULL, struct rkt_context);
+   if (!ctx)
+      return NULL;
+
+   struct pipe_context *pctx = &ctx->base;
+
+   pctx->screen = screen;
+   pctx->priv = priv;
+
+   pctx->destroy = rkt_destroy_context;
+
+   pctx->buffer_map = rkt_buffer_map;
+   pctx->buffer_unmap = rkt_buffer_unmap;
+   pctx->resource_copy_region = util_resource_copy_region;
+   pctx->buffer_subdata = u_default_buffer_subdata;
+   pctx->clear_buffer = u_default_clear_buffer;
+
+   pctx->ml_operation_supported = rkt_ml_operation_supported;
+   pctx->ml_subgraph_create = rkt_ml_subgraph_create;
+   pctx->ml_subgraph_invoke = rkt_ml_subgraph_invoke;
+   pctx->ml_subgraph_read_output = rkt_ml_subgraph_read_outputs;
+   pctx->ml_subgraph_destroy = rkt_ml_subgraph_destroy;
+
+   return pctx;
+}
+
+/*
+ * Create a buffer backed by a BO allocated with DRM_IOCTL_ROCKET_CREATE_BO.
+ * Only PIPE_BUFFER resources are supported. Returns NULL on failure.
+ */
+static struct pipe_resource *
+rkt_resource_create(struct pipe_screen *pscreen,
+                    const struct pipe_resource *templat)
+{
+   struct rkt_screen *screen = rkt_screen(pscreen);
+   struct drm_rocket_create_bo arg = {0};
+   struct rkt_resource *rsc;
+   int ret;
+
+   assert(templat->target == PIPE_BUFFER);
+   assert(templat->height0 == 1);
+   assert(templat->depth0 == 1);
+   assert(templat->array_size == 1);
+
+   rsc = rzalloc(NULL, struct rkt_resource);
+   if (!rsc)
+      return NULL;
+
+   rsc->base = *templat;
+   rsc->base.screen = pscreen;
+   rsc->base.nr_samples = templat->nr_samples;
+   pipe_reference_init(&rsc->base.reference, 1);
+
+   rsc->bo_size = templat->width0;
+
+   arg.size = templat->width0;
+
+   ret = drmIoctl(screen->fd, DRM_IOCTL_ROCKET_CREATE_BO, &arg);
+   if (ret < 0)
+      goto free_rsc;
+
+   rsc->handle = arg.handle;
+   rsc->phys_addr = arg.dma_address;
+   rsc->fake_offset = arg.offset;
+
+   if (DBG_ENABLED(ROCKET_DBG_ZERO)) {
+      void *map = os_mmap(NULL, arg.size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                          screen->fd, rsc->fake_offset);
+
+      /* Zeroing is only a debugging aid: skip it if the mapping failed, and
+       * don't leave the temporary mapping behind. */
+      if (map != MAP_FAILED) {
+         memset(map, 0, arg.size);
+         os_munmap(map, arg.size);
+      }
+   }
+
+   return &rsc->base;
+
+free_rsc:
+   ralloc_free(rsc);
+   return NULL;
+}
+
+static void
+rkt_resource_destroy(struct pipe_screen *pscreen,
+                     struct pipe_resource *prsc)
+{
+   struct rkt_resource *rsc = rkt_resource(prsc);
+   struct rkt_screen *screen = rkt_screen(pscreen);
+   struct drm_gem_close arg = {0};
+   int ret;
+
+   arg.handle = rsc->handle;
+
+   ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &arg);
+   assert(ret >= 0);
+   (void)ret;
+
+   ralloc_free(rsc);
+}
+
+static int
+rkt_screen_get_fd(struct pipe_screen *pscreen)
+{
+   return rkt_screen(pscreen)->fd;
+}
+
+struct pipe_screen *
+rkt_screen_create(int fd,
+                  const struct pipe_screen_config *config,
+                  struct renderonly *ro)
+{
+   struct rkt_screen *rkt_screen;
+   struct pipe_screen *screen;
+
+   rkt_screen = rzalloc(NULL, struct rkt_screen);
+   if (!rkt_screen)
+      return NULL;
+
+   screen = &rkt_screen->pscreen;
+
+   rocket_debug = debug_get_option_rocket_debug();
+
+   rkt_screen->fd = fd;
+
+   screen->get_screen_fd = rkt_screen_get_fd;
+   screen->destroy = rkt_destroy_screen;
+   screen->context_create = rkt_create_context;
+   screen->resource_create = rkt_resource_create;
+   screen->resource_destroy = rkt_resource_destroy;
+
+   return screen;
+}
\ No newline at end of file
diff --git a/src/gallium/drivers/rocket/rkt_device.h b/src/gallium/drivers/rocket/rkt_device.h
new file mode 100644
index 00000000000..0425a4260d9
--- /dev/null
+++ b/src/gallium/drivers/rocket/rkt_device.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "renderonly/renderonly.h"
+#include "util/log.h"
+
+#ifndef RKT_SCREEN_H
+#define RKT_SCREEN_H
+
+enum rkt_dbg {
+   ROCKET_DBG_MSGS = BITFIELD_BIT(0),
+   ROCKET_DBG_DUMP_BOS = BITFIELD_BIT(1),
+   ROCKET_DBG_ZERO = BITFIELD_BIT(2),
+};
+
+extern int rocket_debug;
+
+#define DBG_ENABLED(flag) unlikely(rocket_debug &(flag))
+
+#define DBG(fmt, ...)                                  \
+   do {                                                \
+      if (DBG_ENABLED(ROCKET_DBG_MSGS))                \
+         mesa_logd("%s:%d: " fmt, __func__, __LINE__,  \
+                   ##__VA_ARGS__);                     \
+   } while (0)
+
+struct rkt_screen {
+   struct pipe_screen pscreen;
+
+   int fd;
+   struct renderonly *ro;
+};
+
+static inline struct rkt_screen *
+rkt_screen(struct pipe_screen *p)
+{
+   return (struct rkt_screen *)p;
+}
+
+struct rkt_context {
+   struct pipe_context base;
+};
+
+static inline struct rkt_context *
+rkt_context(struct pipe_context *pctx)
+{
+   return (struct rkt_context *)pctx;
+}
+
+struct rkt_resource {
+   struct pipe_resource base;
+
+   uint32_t handle;
+   uint64_t phys_addr;
+   uint64_t obj_addr;
+   uint64_t fake_offset;
+   uint64_t bo_size;
+};
+
+static inline struct rkt_resource *
+rkt_resource(struct pipe_resource *p)
+{
+   return (struct rkt_resource *)p;
+}
+
+struct pipe_screen *rkt_screen_create(int fd,
+                                      const struct pipe_screen_config *config,
+                                      struct renderonly *ro);
+
+#endif /* RKT_SCREEN_H */
diff --git a/src/gallium/drivers/rocket/rkt_ml.c b/src/gallium/drivers/rocket/rkt_ml.c
new file mode 100644
index 00000000000..129f76f7a43
--- /dev/null
+++ b/src/gallium/drivers/rocket/rkt_ml.c
@@ -0,0 +1,703 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "pipe/p_state.h"
+#include "util/macros.h"
+#include "util/u_dynarray.h"
+#include "util/u_inlines.h"
+
+#include <xf86drm.h>
+
+#include "drm-uapi/rocket_accel.h"
+
+#include "rkt_coefs.h"
+#include "rkt_ml.h"
+#include "rkt_regcmd.h"
+#include "rkt_task.h"
+
+/*
+ * Debugging helper: write `size` bytes starting at `ptr + offset` to
+ * mesa-<name>-<operation_nr>-<suboperation_nr>.bin in the current directory.
+ */
+void
+rkt_dump_buffer(const uint8_t *ptr, char *name, int operation_nr,
+                int suboperation_nr, int offset, unsigned size)
+{
+   char buffer[255];
+
+   snprintf(buffer, sizeof(buffer), "mesa-%s-%03u-%03u.bin", name, operation_nr,
+            suboperation_nr);
+
+   FILE *f = fopen(buffer, "wb");
+   if (!f) {
+      /* Dumps are a debugging aid, failing to write one is not fatal. */
+      DBG("Error opening %s: %s\n", buffer, strerror(errno));
+      return;
+   }
+
+   fwrite(ptr + offset, 1, size, f);
+   if (ferror(f)) {
+      DBG("Error in writing to file: %s\n", strerror(errno));
+   }
+   fflush(f);
+   fclose(f);
+}
+
+/*
+ * Make sure the tensor at `idx` is backed by a buffer of `size` bytes,
+ * creating it if it doesn't exist yet.
+ */
+static void
+create_tensor(struct rkt_ml_subgraph *subgraph, unsigned idx,
+              unsigned size)
+{
+   struct pipe_context *context = subgraph->base.context;
+   struct pipe_resource **tensors = util_dynarray_begin(&subgraph->tensors);
+
+   assert(idx < util_dynarray_num_elements(&subgraph->tensors,
+                                           struct pipe_resource *));
+
+   struct pipe_resource *res = tensors[idx];
+
+   if (res != NULL) {
+      assert(size == pipe_buffer_size(res));
+      return;
+   }
+
+   res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, size);
+   tensors[idx] = res;
+}
+
+struct rkt_resource *
+rkt_get_tensor(struct rkt_ml_subgraph *subgraph,
+               unsigned idx)
+{
+   return rkt_resource(
+      *util_dynarray_element(&subgraph->tensors, struct pipe_resource *, idx));
+}
+
+bool
+rkt_is_depthwise(const struct pipe_ml_operation *poperation)
+{
+   unsigned input_channels = poperation->input_tensors[0]->dims[3];
+   unsigned output_channels = poperation->output_tensors[0]->dims[3];
+
+   return poperation->conv.depthwise && input_channels > 1 &&
+          output_channels > 1;
+}
+
+static unsigned
+calc_raw_output_size(struct rkt_operation *operation)
+{
+   unsigned output_channels_1 =
+      DIV_ROUND_UP(operation->output_channels, FEATURE_ATOMIC_SIZE) * 2;
+   unsigned output_channels_2 = FEATURE_ATOMIC_SIZE;
+
+   return operation->output_width * operation->output_height *
+          output_channels_1 * output_channels_2;
+}
+
+/*
+ * Generate the register command stream of every task of `operation` and
+ * upload them, back to back and each aligned to 64 bytes, into
+ * operation->regcmd. Every task but the last is patched to point to the
+ * command stream of the next one.
+ */
+static void
+compile_operation(struct rkt_ml_subgraph *subgraph,
+                  struct rkt_operation *operation)
+{
+   struct pipe_context *pcontext = subgraph->base.context;
+   unsigned regcfg_total_size = 0;
+   struct util_dynarray *regcfgs;
+   struct pipe_transfer *transfer = NULL;
+   unsigned num_tasks =
+      util_dynarray_num_elements(&operation->tasks, struct split_task);
+
+   regcfgs = calloc(num_tasks, sizeof(struct util_dynarray));
+
+   for (int i = 0; i < num_tasks; i++) {
+      util_dynarray_init(&regcfgs[i], NULL);
+      rkt_fill_regcmd(subgraph, operation, &regcfgs[i], i);
+
+      unsigned size =
+         util_dynarray_num_elements(&regcfgs[i], uint64_t) * sizeof(uint64_t);
+      regcfg_total_size += ALIGN(size, 64);
+   }
+
+   operation->regcmd = pipe_buffer_create(pcontext->screen, 0,
+                                          PIPE_USAGE_DEFAULT, regcfg_total_size);
+   uint8_t *regcmd =
+      pipe_buffer_map(pcontext, operation->regcmd, PIPE_MAP_WRITE, &transfer);
+
+   unsigned regcmd_offset = 0;
+   for (int i = 0; i < num_tasks; i++) {
+      unsigned size = util_dynarray_num_elements(&regcfgs[i], uint64_t);
+      struct split_task *task =
+         util_dynarray_element(&operation->tasks, struct split_task, i);
+
+      if (i < num_tasks - 1) {
+         /* Patch next address and amount of regs to fetch, positions are relative
+          * to end */
+         unsigned reg_count = util_dynarray_num_elements(&regcfgs[i], uint64_t);
+         uint64_t *next_address_reg =
+            util_dynarray_element(&regcfgs[i], uint64_t, reg_count - 4);
+         uint64_t *reg_count_reg =
+            util_dynarray_element(&regcfgs[i], uint64_t, reg_count - 3);
+
+         uint64_t addr = rkt_resource(operation->regcmd)->phys_addr +
+                         regcmd_offset + ALIGN(size * sizeof(uint64_t), 64);
+         *next_address_reg |= addr << 16;
+
+         unsigned regs_to_fetch =
+            util_dynarray_num_elements(&regcfgs[i + 1], uint64_t);
+         regs_to_fetch -= 4;
+         regs_to_fetch = ALIGN(regs_to_fetch / 2, 2);
+         *reg_count_reg |= regs_to_fetch << 16;
+      }
+
+      memcpy(regcmd + regcmd_offset, util_dynarray_begin(&regcfgs[i]),
+             size * sizeof(uint64_t));
+      util_dynarray_fini(&regcfgs[i]);
+
+      task->regcfg_amount = size;
+      task->regcfg_addr =
+         rkt_resource(operation->regcmd)->phys_addr + regcmd_offset;
+
+      if (DBG_ENABLED(ROCKET_DBG_DUMP_BOS)) {
+         /* Don't read past the end of the buffer when dumping the last task. */
+         unsigned dump_size = MIN2((size + 4) * sizeof(uint64_t),
+                                   regcfg_total_size - regcmd_offset);
+         rkt_dump_buffer(regcmd, "regcmd", 0, i, regcmd_offset, dump_size);
+      }
+
+      regcmd_offset += ALIGN(size * sizeof(uint64_t), 64);
+   }
+
+   pipe_buffer_unmap(pcontext, transfer);
+
+   free(regcfgs);
+}
+
+static void
+lower_convolution(struct rkt_ml_subgraph *subgraph,
+                  const struct pipe_ml_operation *poperation,
+                  struct rkt_operation *operation)
+{
+   util_dynarray_init(&operation->tasks, NULL);
+
+   operation->depthwise = rkt_is_depthwise(poperation);
+   operation->padding_same = poperation->conv.padding_same;
+   operation->stride = poperation->conv.stride_x;
+
+   operation->input_index = poperation->input_tensors[0]->index;
+   operation->input_width = poperation->input_tensors[0]->dims[1];
+   operation->input_height = poperation->input_tensors[0]->dims[2];
+   operation->input_channels = poperation->input_tensors[0]->dims[3];
+   operation->input_zero_point = poperation->input_tensors[0]->zero_point;
+   operation->input_scale = poperation->input_tensors[0]->scale;
+
+   operation->output_index = poperation->output_tensors[0]->index;
+   operation->output_width = poperation->output_tensors[0]->dims[1];
+   operation->output_height = poperation->output_tensors[0]->dims[2];
+   operation->output_channels = poperation->output_tensors[0]->dims[3];
+   operation->output_zero_point = poperation->output_tensors[0]->zero_point;
+   operation->output_scale = poperation->output_tensors[0]->scale;
+
+   operation->weights_width = poperation->conv.weight_tensor->dims[1];
+   operation->weights_height = poperation->conv.weight_tensor->dims[2];
+   operation->weights_zero_point = poperation->conv.weight_tensor->zero_point;
+   operation->weights_scale = poperation->conv.weight_tensor->scale;
+
+   operation->weights = rkt_fill_weights(subgraph, poperation);
+   operation->biases =
+      rkt_fill_biases(subgraph, poperation, &operation->truncate_bits);
+}
+
+static struct rkt_operation *
+find_first_consumer(struct rkt_ml_subgraph *subgraph, unsigned tensor_index)
+{
+   util_dynarray_foreach (&subgraph->operations, struct rkt_operation,
+                          operation) {
+      if (operation->input_index == tensor_index)
+         return operation;
+   }
+
+   return NULL;
+}
+
+static struct rkt_operation *
+find_producer(struct rkt_ml_subgraph *subgraph,
+              unsigned tensor_index)
+{
+   util_dynarray_foreach (&subgraph->operations, struct rkt_operation,
+                          operation) {
+      if (operation->output_index == tensor_index)
+         return operation;
+   }
+
+   return NULL;
+}
+
+/* Returns the highest tensor index referenced by the operations, plus one. */
+static unsigned
+count_tensors(const struct pipe_ml_operation *poperations,
+              unsigned count)
+{
+   unsigned tensor_count = 0;
+
+   for (unsigned i = 0; i < count; i++) {
+      const struct pipe_ml_operation *poperation = &poperations[i];
+      tensor_count = MAX2(tensor_count, poperation->input_tensors[0]->index);
+      tensor_count = MAX2(tensor_count, poperation->output_tensors[0]->index);
+      switch (poperation->type) {
+      case PIPE_ML_OPERATION_TYPE_CONVOLUTION:
+         tensor_count = MAX2(tensor_count, poperation->conv.weight_tensor->index);
+         tensor_count = MAX2(tensor_count, poperation->conv.bias_tensor->index);
+         break;
+      case PIPE_ML_OPERATION_TYPE_ADD:
+         tensor_count = MAX2(tensor_count, poperation->input_tensors[1]->index);
+         break;
+      default:
+         DBG("poperation->type %d\n", poperation->type);
+         unreachable("Unsupported ML operation type");
+      }
+   }
+
+   return tensor_count + 1;
+}
+
+static bool
+tensor_quantization_supported(struct pipe_tensor *tensor)
+{
+   /*
+    * Per-axis quantization not supported, for details see:
+    * https://ai.google.dev/edge/litert/models/quantization_spec#per-axis_vs_per-tensor
+    */
+   return tensor->scales == NULL && tensor->zero_points == NULL;
+}
+
+bool
+rkt_ml_operation_supported(struct pipe_context *pcontext,
+                           const struct pipe_ml_operation *operation)
+{
+   bool supported = false;
+
+   switch (operation->type) {
+   case PIPE_ML_OPERATION_TYPE_CONVOLUTION: {
+      struct pipe_tensor *input_tensor = operation->input_tensors[0];
+      struct pipe_tensor *weight_tensor = operation->conv.weight_tensor;
+      struct pipe_tensor *bias_tensor = operation->conv.bias_tensor;
+      struct pipe_tensor *output_tensor = operation->output_tensors[0];
+
+      // Dilation and per-axis quantization not yet implemented
+      if (tensor_quantization_supported(input_tensor) &&
+          tensor_quantization_supported(weight_tensor) &&
+          tensor_quantization_supported(bias_tensor) &&
+          tensor_quantization_supported(output_tensor) &&
+          operation->conv.dilation_width_factor == 1 &&
+          operation->conv.dilation_height_factor == 1)
+         supported = true;
+
+      break;
+   }
+   case PIPE_ML_OPERATION_TYPE_ADD:
+      supported = operation->input_tensors[0]->resource == NULL &&
+                  operation->input_tensors[1]->resource == NULL;
+      break;
+   default:
+      supported = false;
+   }
+
+   return supported;
+}
+
+/*
+ * Lower a list of operations into a subgraph: convolutions become
+ * rkt_operations and additions are fused into the convolution that produces
+ * their first input. Then the tensors are allocated and every operation is
+ * split into tasks and compiled.
+ *
+ * Returns NULL if the subgraph itself cannot be allocated.
+ */
+struct pipe_ml_subgraph *
+rkt_ml_subgraph_create(struct pipe_context *pcontext,
+                       const struct pipe_ml_operation *poperations,
+                       unsigned count)
+{
+   struct rkt_ml_subgraph *subgraph;
+   unsigned tensor_count;
+
+   subgraph = calloc(1, sizeof(*subgraph));
+   if (!subgraph)
+      return NULL;
+
+   subgraph->base.context = pcontext;
+
+   tensor_count = count_tensors(poperations, count);
+   util_dynarray_init(&subgraph->tensors, NULL);
+   util_dynarray_init(&subgraph->operations, NULL);
+   if (!util_dynarray_resize(&subgraph->tensors, struct pipe_resource *,
+                             tensor_count)) {
+      free(subgraph);
+      return NULL;
+   }
+   memset(util_dynarray_begin(&subgraph->tensors), 0, subgraph->tensors.size);
+
+   /* Lower */
+   for (int i = 0; i < count; i++) {
+      struct rkt_operation operation = {0};
+      operation.add_tensor = -1;
+
+      switch (poperations[i].type) {
+      case PIPE_ML_OPERATION_TYPE_CONVOLUTION:
+         lower_convolution(subgraph, &poperations[i], &operation);
+         util_dynarray_append(&subgraph->operations, struct rkt_operation,
+                              operation);
+         break;
+      case PIPE_ML_OPERATION_TYPE_ADD: {
+         /* Fuse tensor addition into convolution*/
+         struct rkt_operation *input_op_1 =
+            find_producer(subgraph, poperations[i].input_tensors[1]->index);
+         struct rkt_operation *input_op_2 =
+            find_producer(subgraph, poperations[i].input_tensors[0]->index);
+
+         /* NOTE(review): with the assert on input_op_1, the graph input
+          * branch below can only be taken in release builds. Confirm which
+          * of the two is intended. */
+         assert(input_op_1);
+         assert(input_op_2);
+
+         if (input_op_1 == NULL) {
+            /* Graph input */
+            input_op_2->add_tensor = poperations[i].input_tensors[1]->index;
+         } else {
+            input_op_1->addition_input = true;
+            input_op_2->add_tensor = input_op_1->output_index;
+         }
+
+         input_op_2->output_index = poperations[i].output_tensors[0]->index;
+         input_op_2->addition_offset =
+            0x80 - poperations[i].input_tensors[1]->zero_point;
+         input_op_2->addition_scale = poperations[i].input_tensors[1]->scale;
+
+         break;
+      }
+      default:
+         DBG("poperation->type %d\n", poperations[i].type);
+         unreachable("Unsupported ML operation type");
+      }
+   }
+
+   /* Create input tensors */
+   util_dynarray_foreach (&subgraph->operations, struct rkt_operation,
+                          operation) {
+      unsigned input_channels_1 =
+         DIV_ROUND_UP(operation->input_channels, FEATURE_ATOMIC_SIZE) * 2;
+      unsigned input_channels_2 = FEATURE_ATOMIC_SIZE;
+      unsigned input_size = operation->input_width * operation->input_height *
+                            input_channels_1 * input_channels_2;
+
+      create_tensor(subgraph, operation->input_index, input_size);
+   }
+
+   /* Create output tensors */
+   util_dynarray_foreach (&subgraph->operations, struct rkt_operation,
+                          operation) {
+      struct rkt_resource *res =
+         rkt_get_tensor(subgraph, operation->output_index);
+      if (res != NULL)
+         continue;
+
+      create_tensor(subgraph, operation->output_index,
+                    calc_raw_output_size(operation));
+   }
+
+   /* Compile */
+   util_dynarray_foreach (&subgraph->operations, struct rkt_operation,
+                          operation) {
+      rkt_split_tasks(subgraph, operation);
+      compile_operation(subgraph, operation);
+   }
+
+   return &subgraph->base;
+}
+
+/*
+ * Upload the inputs into their tensors, converting them to the layout the
+ * hardware reads (groups of FEATURE_ATOMIC_SIZE channels), and submit one or
+ * more jobs per operation to the kernel.
+ */
+void
+rkt_ml_subgraph_invoke(struct pipe_context *pcontext,
+                       struct pipe_ml_subgraph *psubgraph,
+                       unsigned inputs_count, unsigned input_idxs[],
+                       void *inputs[], bool is_signed[])
+{
+   struct rkt_screen *screen = rkt_screen(pcontext->screen);
+   struct rkt_ml_subgraph *subgraph = (struct rkt_ml_subgraph *)(psubgraph);
+   int ret;
+
+   DBG("Processing input\n");
+
+   for (int i = 0; i < inputs_count; i++) {
+      struct rkt_operation *operation =
+         find_first_consumer(subgraph, input_idxs[i]);
+      struct pipe_resource *input =
+         &rkt_get_tensor(subgraph, input_idxs[i])->base;
+      unsigned input_channels = operation->input_channels;
+      unsigned output_channels = operation->output_channels;
+
+      struct rkt_resource *input_tensor =
+         rkt_get_tensor(subgraph, operation->input_index);
+      if (output_channels == 1 && input_channels == 1 &&
+          !operation->addition_input && (operation->add_tensor == -1)) {
+         /* NOTE(review): input and input_tensor are looked up with the same
+          * tensor index, so this copies the buffer onto itself and never
+          * reads inputs[i]. Confirm whether that is intended. */
+         pipe_buffer_copy(pcontext, &input_tensor->base, input, 0, 0,
+                          pipe_buffer_size(input));
+      } else {
+         unsigned input_width = operation->input_width;
+         unsigned input_height = operation->input_height;
+         unsigned zero_point = operation->input_zero_point;
+         struct pipe_transfer *transfer_out;
+         uint8_t(*input_in)[input_height][input_channels] = inputs[i];
+         uint8_t *map = pipe_buffer_map(pcontext, &input_tensor->base,
+                                        PIPE_MAP_WRITE, &transfer_out);
+
+         DBG("Converting data\n");
+
+         /*
+          * From the NVDLA docs: "For int8, one element of data refers to an 8-bit
+          * signed integer." But only when transposing do we seem to need to
+          * convert to signed. The DMA unit seems to be able to convert from
+          * unsigned to signed though.
+          */
+         if (input_channels == 1) {
+            unsigned n = 0;
+            for (int x = 0; x < input_width; x++) {
+               for (int y = 0; y < MAX2(input_height, FEATURE_ATOMIC_SIZE); y++) {
+                  if (y < input_height)
+                     map[n++] = input_in[x][y][0];
+                  else
+                     map[n++] = zero_point;
+               }
+            }
+         } else {
+            unsigned n = 0;
+            for (int u = 0; u < DIV_ROUND_UP(input_channels, FEATURE_ATOMIC_SIZE);
+                 u++) {
+               for (int x = 0; x < input_width; x++) {
+                  for (int y = 0; y < input_height; y++) {
+                     for (int c = 0; c < FEATURE_ATOMIC_SIZE; c++) {
+                        unsigned input_channel = c + u * FEATURE_ATOMIC_SIZE;
+                        if (input_channel < input_channels)
+                           map[n++] = input_in[x][y][input_channel] - 0x80;
+                        else
+                           map[n++] = zero_point - 0x80;
+                     }
+                  }
+               }
+            }
+         }
+
+         if (DBG_ENABLED(ROCKET_DBG_DUMP_BOS))
+            rkt_dump_buffer(map, "input", 0, 0, 0,
+                            rkt_get_tensor(subgraph, input_idxs[i])->bo_size);
+
+         DBG("Converted data\n");
+
+         pipe_buffer_unmap(pcontext, transfer_out);
+      }
+   }
+   DBG("Processed input\n");
+
+   DBG("Submitting graph\n");
+
+   /* Jobs of the same operation share their BO handle arrays, so every heap
+    * block the jobs point to is tracked here once, to be freed exactly once. */
+   struct util_dynarray jobs = {0};
+   struct util_dynarray allocations = {0};
+   struct drm_rocket_submit submit = {0};
+   util_dynarray_init(&jobs, NULL);
+   util_dynarray_init(&allocations, NULL);
+
+   util_dynarray_foreach (&subgraph->operations, struct rkt_operation,
+                          operation) {
+      unsigned num_inputs = operation->add_tensor != -1 ? 2 : 1;
+      uint32_t *in_bo_handles = calloc(num_inputs, sizeof(uint32_t));
+      uint32_t *out_bo_handles = malloc(sizeof(uint32_t));
+
+      util_dynarray_append(&allocations, void *, in_bo_handles);
+      util_dynarray_append(&allocations, void *, out_bo_handles);
+      if (!in_bo_handles || !out_bo_handles)
+         goto out_of_memory;
+
+      in_bo_handles[0] = rkt_get_tensor(subgraph, operation->input_index)->handle;
+
+      if (operation->add_tensor != -1)
+         in_bo_handles[1] =
+            rkt_get_tensor(subgraph, operation->add_tensor)->handle;
+
+      out_bo_handles[0] =
+         rkt_get_tensor(subgraph, operation->output_index)->handle;
+
+      if (operation->reuse_weights_cbuf) {
+         /* Submit all tasks to the same core, so weights can be reused */
+         unsigned num_tasks =
+            util_dynarray_num_elements(&operation->tasks, struct split_task);
+         struct drm_rocket_task *tasks = calloc(num_tasks, sizeof(*tasks));
+         unsigned task_count = 0;
+
+         util_dynarray_append(&allocations, void *, tasks);
+         if (!tasks && num_tasks > 0)
+            goto out_of_memory;
+
+         util_dynarray_foreach (&operation->tasks, struct split_task, task) {
+            tasks[task_count].regcmd = task->regcfg_addr;
+            tasks[task_count].regcmd_count = task->regcfg_amount;
+            task_count++;
+         }
+         struct drm_rocket_job job = {0};
+         job.task_struct_size = sizeof(struct drm_rocket_task);
+         job.in_bo_handles = (uint64_t)(uintptr_t)in_bo_handles;
+         job.in_bo_handle_count = num_inputs;
+         job.out_bo_handles = (uint64_t)(uintptr_t)out_bo_handles;
+         job.out_bo_handle_count = 1;
+         job.tasks = (uint64_t)(uintptr_t)tasks;
+         job.task_count = task_count;
+         util_dynarray_append(&jobs, struct drm_rocket_job, job);
+      } else {
+         /* Spread tasks among cores, for parallelism */
+         util_dynarray_foreach (&operation->tasks, struct split_task, task) {
+            struct drm_rocket_task *ktask = calloc(1, sizeof(*ktask));
+
+            util_dynarray_append(&allocations, void *, ktask);
+            if (!ktask)
+               goto out_of_memory;
+
+            ktask->regcmd = task->regcfg_addr;
+            ktask->regcmd_count = task->regcfg_amount;
+
+            struct drm_rocket_job job = {0};
+            job.task_struct_size = sizeof(struct drm_rocket_task);
+            job.in_bo_handles = (uint64_t)(uintptr_t)in_bo_handles;
+            job.in_bo_handle_count = num_inputs;
+            job.out_bo_handles = (uint64_t)(uintptr_t)out_bo_handles;
+            job.out_bo_handle_count = 1;
+            job.tasks = (uint64_t)(uintptr_t)ktask;
+            job.task_count = 1;
+            util_dynarray_append(&jobs, struct drm_rocket_job, job);
+         }
+      }
+   }
+
+   submit.job_struct_size = sizeof(struct drm_rocket_job);
+   submit.jobs = (uint64_t)(uintptr_t)util_dynarray_begin(&jobs);
+   submit.job_count = util_dynarray_num_elements(&jobs, struct drm_rocket_job);
+
+   ret = drmIoctl(screen->fd, DRM_IOCTL_ROCKET_SUBMIT, &submit);
+   if (ret != 0)
+      mesa_loge("DRM_IOCTL_ROCKET_SUBMIT failed: %s", strerror(errno));
+   assert(ret == 0);
+
+   DBG("Submitted graph\n");
+   goto cleanup;
+
+out_of_memory:
+   mesa_loge("Out of memory while building the jobs, nothing was submitted");
+
+cleanup:
+   util_dynarray_foreach (&allocations, void *, allocation)
+      free(*allocation);
+   util_dynarray_fini(&allocations);
+   util_dynarray_fini(&jobs);
+}
+
+/*
+ * Copy the outputs out of their tensors, converting from the hardware layout
+ * (groups of FEATURE_ATOMIC_SIZE channels) to [y][x][channel] and adding 0x80
+ * to every value.
+ */
+void
+rkt_ml_subgraph_read_outputs(struct pipe_context *pcontext,
+                             struct pipe_ml_subgraph *psubgraph,
+                             unsigned outputs_count,
+                             unsigned output_idxs[], void *outputs[],
+                             bool is_signed[])
+{
+   struct rkt_ml_subgraph *subgraph = (struct rkt_ml_subgraph *)(psubgraph);
+
+   DBG("Processing output\n");
+
+   for (int i = 0; i < outputs_count; i++) {
+
+      struct rkt_operation *operation = find_producer(subgraph, output_idxs[i]);
+      struct rkt_resource *output_tensor =
+         rkt_get_tensor(subgraph, output_idxs[i]);
+      struct pipe_transfer *transfer = NULL;
+      uint8_t *raw_output;
+      uint8_t(*output_in)[operation->output_height][operation->output_width]
+                         [FEATURE_ATOMIC_SIZE];
+      uint8_t(*output_out)[operation->output_width][operation->output_channels];
+
+      DBG("Before pipe_buffer_map\n");
+      raw_output = pipe_buffer_map(pcontext, &output_tensor->base, PIPE_MAP_READ,
+                                   &transfer);
+      DBG("After pipe_buffer_map\n");
+
+      DBG("Converting data\n");
+
+      output_in = (void *)raw_output;
+      output_out = (void *)outputs[i];
+
+      if (DBG_ENABLED(ROCKET_DBG_DUMP_BOS))
+         rkt_dump_buffer(raw_output, "output", 0, 0, 0, output_tensor->bo_size);
+
+      for (int oc = 0; oc < operation->output_channels; oc++) {
+         for (int x = 0; x < operation->output_width; x++) {
+            for (int y = 0; y < operation->output_height; y++) {
+               unsigned c = oc % FEATURE_ATOMIC_SIZE;
+               unsigned g = oc / FEATURE_ATOMIC_SIZE;
+               output_out[y][x][oc] = output_in[g][y][x][c] + 0x80;
+            }
+         }
+      }
+
+      DBG("Converted data\n");
+
+      pipe_buffer_unmap(pcontext, transfer);
+   }
+
+   DBG("Processed output\n");
+}
+
+static void
+free_operation(struct rkt_operation *operation)
+{
+   util_dynarray_fini(&operation->tasks);
+   pipe_resource_reference(&operation->regcmd, NULL);
+   pipe_resource_reference(&operation->weights, NULL);
+   pipe_resource_reference(&operation->biases, NULL);
+}
+
+void
+rkt_ml_subgraph_destroy(struct pipe_context *context,
+                        struct pipe_ml_subgraph *psubgraph)
+{
+   struct rkt_ml_subgraph *subgraph = (struct rkt_ml_subgraph *)(psubgraph);
+
+   util_dynarray_foreach (&subgraph->operations, struct rkt_operation, operation)
+      free_operation(operation);
+   util_dynarray_fini(&subgraph->operations);
+
+   util_dynarray_foreach (&subgraph->tensors, struct pipe_resource *, tensor)
+      if (tensor)
+         pipe_resource_reference(tensor, NULL);
+   util_dynarray_fini(&subgraph->tensors);
+
+   free(subgraph);
+}
diff --git a/src/gallium/drivers/rocket/rkt_ml.h b/src/gallium/drivers/rocket/rkt_ml.h
new file mode 100644
index 00000000000..04dea3d1475
--- /dev/null
+++ b/src/gallium/drivers/rocket/rkt_ml.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef RKT_ML_H
+#define RKT_ML_H
+
+#include <util/u_dynarray.h>
+
+#include "rkt_device.h"
+
+// http://nvdla.org/hw/v1/ias/unit_description.html#convolution-buffer
+#define CBUF_BANK_SIZE 32768
+#define CBUF_BANKS 12
+#define CBUF_ENTRIES_PER_BANK 256
+#define CBUF_ENTRY_SIZE (CBUF_BANK_SIZE / CBUF_ENTRIES_PER_BANK)
+#define FEATURE_ATOMIC_SIZE 16
+#define WEIGHT_ATOMIC_SIZE 32
+#define ATOMIC_K_SIZE 16
+
+struct split_task {
+   unsigned num;
+
+   unsigned top_slice;
+   unsigned bottom_slice;
+   unsigned num_overlap_slices;
+   unsigned num_retain_slices;
+   unsigned convolutions;
+
+   unsigned pad_top;
+   unsigned pad_bottom;
+   unsigned pad_left;
+   unsigned pad_right;
+
+   unsigned stride_x;
+   unsigned stride_y;
+
+   unsigned input_width;
+   unsigned input_height;
+ unsigned input_channels; + unsigned input_channels_real; + unsigned input_zero_point; + float input_scale; + unsigned input_data_entries; + int input_line_stride; + int input_surface_stride; + unsigned input_offset; + + unsigned output_width; + unsigned output_height; + unsigned output_channels; + unsigned output_channels_real; + unsigned output_zero_point; + float output_scale; + int output_surface_stride; + unsigned output_offset; + + unsigned weights_width; + unsigned weights_height; + unsigned weights_kernels; + unsigned weights_zero_point; + float weights_scale; + + unsigned input_banks; + unsigned weights_banks; + + unsigned atomic_count; + unsigned surfaces_per_row; + + unsigned regcfg_amount; + uint32_t regcfg_addr; +}; + +struct rkt_operation { + struct pipe_resource *regcmd; + struct pipe_resource *weights; + struct pipe_resource *biases; + + bool depthwise; + bool reuse_weights_cbuf; + unsigned truncate_bits; + bool padding_same; + unsigned stride; + + bool addition_input; + int addition_offset; + float addition_scale; + + unsigned input_index; + unsigned input_width; + unsigned input_height; + unsigned input_channels; + uint8_t input_zero_point; + float input_scale; + + unsigned output_index; + unsigned output_width; + unsigned output_height; + unsigned output_channels; + uint8_t output_zero_point; + float output_scale; + + unsigned weights_width; + unsigned weights_height; + uint8_t weights_zero_point; + float weights_scale; + + int add_tensor; + + struct util_dynarray tasks; /* struct split_task */ +}; + +struct rkt_ml_subgraph { + struct pipe_ml_subgraph base; + + struct util_dynarray operations; /* rkt_operation */ + struct util_dynarray tensors; /* pipe_resource* */ +}; + +bool +rkt_ml_operation_supported(struct pipe_context *pcontext, const struct pipe_ml_operation *operation); + +struct pipe_ml_subgraph * +rkt_ml_subgraph_create(struct pipe_context *pcontext, + const struct pipe_ml_operation *poperations, + unsigned count); + +void 
rkt_ml_subgraph_invoke(struct pipe_context *pcontext,
+                            struct pipe_ml_subgraph *psubgraph,
+                            unsigned inputs_count, unsigned input_idxs[],
+                            void *inputs[], bool is_signed[]);
+
+void rkt_ml_subgraph_read_outputs(struct pipe_context *pcontext,
+                                  struct pipe_ml_subgraph *psubgraph,
+                                  unsigned outputs_count,
+                                  unsigned output_idxs[], void *outputs[],
+                                  bool is_signed[]);
+
+void rkt_ml_subgraph_destroy(struct pipe_context *context,
+                             struct pipe_ml_subgraph *psubgraph);
+
+struct rkt_resource *rkt_get_tensor(struct rkt_ml_subgraph *subgraph,
+                                    unsigned idx);
+
+bool rkt_is_depthwise(const struct pipe_ml_operation *poperation);
+
+void rkt_dump_buffer(const uint8_t *ptr, char *name, int operation_nr,
+                     int suboperation_nr, int offset, unsigned size);
+
+#endif /* RKT_ML_H */
diff --git a/src/gallium/drivers/rocket/rkt_regcmd.c b/src/gallium/drivers/rocket/rkt_regcmd.c
new file mode 100644
index 00000000000..be992fd5069
--- /dev/null
+++ b/src/gallium/drivers/rocket/rkt_regcmd.c
@@ -0,0 +1,544 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "rkt_regcmd.h"
+#include "rkt_ml.h"
+#include "rkt_registers.h"
+
+/*
+ * Append one 64-bit command word to regs: target shifted left by 48, value
+ * shifted left by 16 and the register offset OR-ed into the low bits.
+ */
+static void
+emit_raw(struct util_dynarray *regs, uint32_t target, uint32_t reg,
+         uint32_t value)
+{
+   uint64_t packed_value = 0;
+   packed_value = ((uint64_t)target) << 48;
+   packed_value |= ((uint64_t)value) << 16;
+   packed_value |= (uint64_t)reg;
+
+   util_dynarray_append(regs, uint64_t, packed_value);
+}
+
+/* Emit a write of value to reg, using rkt_get_target(reg) + 0x1 as target. */
+static void
+emit(struct util_dynarray *regs, uint32_t reg, uint32_t value)
+{
+   uint32_t target = rkt_get_target(reg) + 0x1;
+   emit_raw(regs, target, reg, value);
+}
+
+/* Shorthand for emit() on the local `regs` array. The expansion deliberately
+ * has no trailing semicolon: the call sites supply their own, so EMIT(...)
+ * stays a single statement, also inside an unbraced if/else. */
+#define EMIT(offset, value) emit(regs, offset, value)
+
+static void
+fill_first_regcmd(struct rkt_ml_subgraph *subgraph,
+                  const struct rkt_operation *operation,
+                  struct util_dynarray *regs, unsigned task_num)
+{
+   struct split_task *task =
+      util_dynarray_element(&operation->tasks, struct split_task, task_num);
+   unsigned
num_tasks = + util_dynarray_num_elements(&operation->tasks, struct split_task); + unsigned output_zero_point = task->output_zero_point; + unsigned weights_zero_point = task->weights_zero_point; + unsigned offset = output_zero_point - 0x80; + + uint32_t con0 = CNA_CBUF_CON0_WEIGHT_BANK(task->weights_banks) | + CNA_CBUF_CON0_DATA_BANK(task->input_banks); + if (task_num > 0 && operation->reuse_weights_cbuf) + con0 |= CNA_CBUF_CON0_WEIGHT_REUSE(1); + + EMIT(REG_CNA_CBUF_CON0, con0); + + EMIT(REG_CNA_DCOMP_REGNUM, 0); + EMIT(REG_CNA_DCOMP_CTRL, 0); + + uint32_t con1 = 0x0; + if (task->input_channels_real == 1) { + con1 |= CNA_CONV_CON1_NONALIGN_DMA(1) | CNA_CONV_CON1_GROUP_LINE_OFF(1) | + CNA_CONV_CON1_ARGB_IN(8); + } + + if (operation->depthwise) + con1 |= CNA_CONV_CON1_CONV_MODE(3); + + EMIT(REG_CNA_CONV_CON1, con1); + + EMIT(REG_DPU_S_POINTER, DPU_S_POINTER_POINTER_PP_MODE(1) | + DPU_S_POINTER_EXECUTER_PP_EN(1) | + DPU_S_POINTER_POINTER_PP_EN(1)); + EMIT(REG_DPU_RDMA_RDMA_S_POINTER, + DPU_RDMA_RDMA_S_POINTER_POINTER_PP_MODE(1) | + DPU_RDMA_RDMA_S_POINTER_EXECUTER_PP_EN(1) | + DPU_RDMA_RDMA_S_POINTER_POINTER_PP_EN(1)); + EMIT(REG_CNA_CONV_CON1, con1); + EMIT(REG_CNA_CONV_CON2, + CNA_CONV_CON2_FEATURE_GRAINS( + 50 + task->stride_y + 1)); /* Magic: Seems to pass the most tests */ + EMIT(REG_CNA_CONV_CON3, CNA_CONV_CON3_CONV_X_STRIDE(task->stride_x) | + CNA_CONV_CON3_CONV_Y_STRIDE(task->stride_y)); + EMIT(REG_CNA_DATA_SIZE0, + CNA_DATA_SIZE0_DATAIN_WIDTH(task->input_width) | + CNA_DATA_SIZE0_DATAIN_HEIGHT(task->input_height)); + + EMIT(REG_CNA_DATA_SIZE1, + CNA_DATA_SIZE1_DATAIN_CHANNEL_REAL(task->input_channels_real - 1) | + CNA_DATA_SIZE1_DATAIN_CHANNEL(task->input_channels)); + + EMIT(REG_CNA_DATA_SIZE2, CNA_DATA_SIZE2_DATAOUT_WIDTH(task->output_width)); + EMIT(REG_CNA_DATA_SIZE3, CNA_DATA_SIZE3_DATAOUT_ATOMICS(task->atomic_count)); + EMIT(REG_CNA_WEIGHT_SIZE0, task->weights_width * task->weights_height * + task->input_channels * task->weights_kernels); + 
EMIT(REG_CNA_WEIGHT_SIZE1, + task->weights_width * task->weights_height * task->input_channels); + EMIT(REG_CNA_WEIGHT_SIZE2, + CNA_WEIGHT_SIZE2_WEIGHT_WIDTH(task->weights_width) | + CNA_WEIGHT_SIZE2_WEIGHT_HEIGHT(task->weights_height) | + CNA_WEIGHT_SIZE2_WEIGHT_KERNELS(task->weights_kernels)); + + EMIT(REG_CNA_CBUF_CON0, con0); + + EMIT(REG_CNA_CBUF_CON1, CNA_CBUF_CON1_DATA_ENTRIES(task->input_data_entries)); + + if (task->input_channels_real == 1) { + unsigned truncate = 14; + unsigned scale = 16384; + unsigned offset = 65408; + + if (operation->addition_input || operation->add_tensor != -1) { + truncate = 15; + scale = 32388; + } + + EMIT(REG_CNA_CVT_CON0, CNA_CVT_CON0_CVT_TRUNCATE_3(truncate) | + CNA_CVT_CON0_CVT_TRUNCATE_2(truncate) | + CNA_CVT_CON0_CVT_TRUNCATE_1(truncate) | + CNA_CVT_CON0_CVT_TRUNCATE_0(truncate)); + EMIT(REG_CNA_CVT_CON1, + CNA_CVT_CON1_CVT_SCALE0(scale) | CNA_CVT_CON1_CVT_OFFSET0(offset)); + EMIT(REG_CNA_CVT_CON2, + CNA_CVT_CON2_CVT_SCALE1(scale) | CNA_CVT_CON2_CVT_OFFSET1(offset)); + EMIT(REG_CNA_CVT_CON3, + CNA_CVT_CON3_CVT_SCALE2(scale) | CNA_CVT_CON3_CVT_OFFSET2(offset)); + EMIT(REG_CNA_CVT_CON4, + CNA_CVT_CON4_CVT_SCALE3(scale) | CNA_CVT_CON4_CVT_OFFSET3(offset)); + } else { + EMIT(REG_CNA_CVT_CON0, CNA_CVT_CON0_DATA_SIGN(1) | + CNA_CVT_CON0_CVT_TYPE(1) | + CNA_CVT_CON0_CVT_BYPASS(1)); + EMIT(REG_CNA_CVT_CON1, CNA_CVT_CON1_CVT_SCALE0(1)); + EMIT(REG_CNA_CVT_CON2, CNA_CVT_CON2_CVT_SCALE1(1)); + EMIT(REG_CNA_CVT_CON3, CNA_CVT_CON3_CVT_SCALE2(1)); + EMIT(REG_CNA_CVT_CON4, CNA_CVT_CON4_CVT_SCALE3(1)); + } + + EMIT(REG_CNA_FC_CON0, 0); + EMIT(REG_CNA_FC_CON1, 0); + EMIT(REG_CNA_PAD_CON0, CNA_PAD_CON0_PAD_LEFT(task->pad_left) | + CNA_PAD_CON0_PAD_TOP(task->pad_top)); + EMIT(REG_CNA_FEATURE_DATA_ADDR, + rkt_get_tensor(subgraph, operation->input_index)->phys_addr + + task->input_offset); + EMIT(REG_CNA_FC_CON2, 0); + EMIT(REG_CNA_DMA_CON0, + CNA_DMA_CON0_WEIGHT_BURST_LEN(15) | CNA_DMA_CON0_DATA_BURST_LEN(15)); + EMIT(REG_CNA_DMA_CON1, 
CNA_DMA_CON1_LINE_STRIDE(task->input_line_stride)); + EMIT(REG_CNA_DMA_CON2, CNA_DMA_CON2_SURF_STRIDE(task->input_surface_stride)); + + EMIT(REG_CNA_FC_DATA_SIZE0, + CNA_FC_DATA_SIZE0_DMA_WIDTH(operation->input_width) | + CNA_FC_DATA_SIZE0_DMA_HEIGHT(task->input_height)); + + EMIT(REG_CNA_FC_DATA_SIZE1, + CNA_FC_DATA_SIZE1_DMA_CHANNEL(task->input_channels)); + EMIT(REG_CNA_DCOMP_CTRL, 0); + EMIT(REG_CNA_DCOMP_REGNUM, 0); + EMIT(REG_CNA_DCOMP_ADDR0, rkt_resource(operation->weights)->phys_addr); + EMIT(REG_CNA_DCOMP_AMOUNT0, 0); + EMIT(REG_CNA_DCOMP_AMOUNT1, 0); + EMIT(REG_CNA_DCOMP_AMOUNT2, 0); + EMIT(REG_CNA_DCOMP_AMOUNT3, 0); + EMIT(REG_CNA_DCOMP_AMOUNT4, 0); + EMIT(REG_CNA_DCOMP_AMOUNT5, 0); + EMIT(REG_CNA_DCOMP_AMOUNT6, 0); + EMIT(REG_CNA_DCOMP_AMOUNT7, 0); + EMIT(REG_CNA_DCOMP_AMOUNT8, 0); + EMIT(REG_CNA_DCOMP_AMOUNT9, 0); + EMIT(REG_CNA_DCOMP_AMOUNT10, 0); + EMIT(REG_CNA_DCOMP_AMOUNT11, 0); + EMIT(REG_CNA_DCOMP_AMOUNT12, 0); + EMIT(REG_CNA_DCOMP_AMOUNT13, 0); + EMIT(REG_CNA_DCOMP_AMOUNT14, 0); + EMIT(REG_CNA_DCOMP_AMOUNT15, 0); + + if (task->input_channels_real == 1) { + EMIT(REG_CNA_CVT_CON5, 65535); + } else { + EMIT(REG_CNA_CVT_CON5, 0); + } + + int32_t pad_con1; + if (task->weights_width >= 3 && task->input_zero_point == 0x0) + pad_con1 = 0xffff8080; + else + pad_con1 = task->input_zero_point - 0x80; + + if (operation->addition_input || operation->add_tensor != -1) + pad_con1 = 0xffffff80; + + if (operation->depthwise && task->input_zero_point == 0x8b) + pad_con1 = 0x0b0b; + + EMIT(REG_CNA_PAD_CON1, pad_con1); + + uint32_t misc_cfg = CORE_MISC_CFG_QD_EN(1); + if (operation->depthwise) + misc_cfg |= CORE_MISC_CFG_DW_EN(1); + + EMIT(REG_CORE_MISC_CFG, misc_cfg); + EMIT(REG_CORE_DATAOUT_SIZE_0, + CORE_DATAOUT_SIZE_0_DATAOUT_HEIGHT(task->output_height - 1) | + CORE_DATAOUT_SIZE_0_DATAOUT_WIDTH(task->output_width - 1)); + EMIT(REG_CORE_DATAOUT_SIZE_1, + CORE_DATAOUT_SIZE_1_DATAOUT_CHANNEL(task->output_channels - 1)); + EMIT(REG_CORE_CLIP_TRUNCATE, + 
CORE_CLIP_TRUNCATE_CLIP_TRUNCATE(operation->truncate_bits)); + emit_raw(regs, CORE | 0x1, 0x3030, 0); + + uint32_t feat_mode_cfg = + DPU_FEATURE_MODE_CFG_BURST_LEN(15) | DPU_FEATURE_MODE_CFG_OUTPUT_MODE(2); + if (operation->depthwise) + feat_mode_cfg |= DPU_FEATURE_MODE_CFG_CONV_MODE(3); + + EMIT(REG_DPU_FEATURE_MODE_CFG, feat_mode_cfg); + EMIT(REG_DPU_DATA_FORMAT, 0); + EMIT(REG_DPU_OFFSET_PEND, 0); + EMIT(REG_DPU_DST_BASE_ADDR, + rkt_get_tensor(subgraph, operation->output_index)->phys_addr + + task->output_offset); + EMIT(REG_DPU_DST_SURF_STRIDE, + DPU_DST_SURF_STRIDE_DST_SURF_STRIDE(task->output_surface_stride)); + EMIT(REG_DPU_DATA_CUBE_WIDTH, + DPU_DATA_CUBE_WIDTH_WIDTH(task->output_width - 1)); + EMIT(REG_DPU_DATA_CUBE_HEIGHT, + DPU_DATA_CUBE_HEIGHT_HEIGHT(task->output_height - 1)); + EMIT(REG_DPU_DATA_CUBE_NOTCH_ADDR, 0); + EMIT(REG_DPU_DATA_CUBE_CHANNEL, + DPU_DATA_CUBE_CHANNEL_ORIG_CHANNEL(task->output_channels_real - 1) | + DPU_DATA_CUBE_CHANNEL_CHANNEL(task->output_channels - 1)); + EMIT(REG_DPU_BS_CFG, DPU_BS_CFG_BS_ALU_ALGO(2) | DPU_BS_CFG_BS_ALU_SRC(1) | + DPU_BS_CFG_BS_RELU_BYPASS(1) | + DPU_BS_CFG_BS_MUL_BYPASS(1)); + EMIT(REG_DPU_BS_ALU_CFG, 0); + EMIT(REG_DPU_BS_MUL_CFG, 0); + EMIT(REG_DPU_BS_RELUX_CMP_VALUE, 0); + + if (operation->depthwise) { + EMIT(REG_DPU_BS_OW_CFG, DPU_BS_OW_CFG_SIZE_E_2(3) | + DPU_BS_OW_CFG_SIZE_E_1(3) | + DPU_BS_OW_CFG_SIZE_E_0(3)); + } else { + EMIT(REG_DPU_BS_OW_CFG, DPU_BS_OW_CFG_SIZE_E_2(1) | + DPU_BS_OW_CFG_SIZE_E_1(1) | + DPU_BS_OW_CFG_SIZE_E_0(1)); + } + + EMIT(REG_DPU_BS_OW_OP, DPU_BS_OW_OP_OW_OP(0x80 - weights_zero_point)); + + EMIT(REG_DPU_WDMA_SIZE_0, + DPU_WDMA_SIZE_0_CHANNEL_WDMA(task->output_channels - 1)); + EMIT(REG_DPU_WDMA_SIZE_1, + DPU_WDMA_SIZE_1_HEIGHT_WDMA(task->output_height - 1) | + DPU_WDMA_SIZE_1_WIDTH_WDMA(task->output_width - 1)); + EMIT(REG_DPU_BN_CFG, + DPU_BN_CFG_BN_RELU_BYPASS(1) | DPU_BN_CFG_BN_MUL_BYPASS(1) | + DPU_BN_CFG_BN_ALU_BYPASS(1) | DPU_BN_CFG_BN_BYPASS(1)); + 
EMIT(REG_DPU_BN_ALU_CFG, 0); + EMIT(REG_DPU_BN_MUL_CFG, 0); + EMIT(REG_DPU_BN_RELUX_CMP_VALUE, 0); + + if (operation->add_tensor != -1) { + EMIT(REG_DPU_EW_CFG, + DPU_EW_CFG_EW_CVT_TYPE(1) | DPU_EW_CFG_EW_DATA_MODE(1) | + DPU_EW_CFG_EDATA_SIZE(1) | DPU_EW_CFG_EW_ALU_ALGO(2) | + DPU_EW_CFG_EW_RELU_BYPASS(1) | DPU_EW_CFG_EW_LUT_BYPASS(1) | + DPU_EW_CFG_EW_OP_SRC(1)); + + /* See http://nvdla.org/hw/v1/ias/precision.html#element-wise */ + EMIT(REG_DPU_EW_CVT_OFFSET_VALUE, operation->addition_offset); + + float add_scale = 0.0; + if (fabs(operation->addition_scale - 0.090192) < 0.00001) { + add_scale = 299.671889248; + } else if (fabs(operation->addition_scale - 0.399250) < 0.00001) { + add_scale = 1326.499209406; + } else if (fabs(operation->addition_scale - 0.364902) < 0.00001) { + add_scale = 780.34375; + } else if (fabs(operation->addition_scale - 0.422037) < 0.00001) { + add_scale = 715.5625; + } else if (fabs(operation->addition_scale - 0.213016) < 0.00001) { + add_scale = 564.6875; + } else if (fabs(operation->addition_scale - 0.244231) < 0.00001) { + add_scale = 499.796875; + } else if (fabs(operation->addition_scale - 0.283416) < 0.00001) { + add_scale = 488.203125; + } else if (fabs(operation->addition_scale - 0.171151) < 0.00001) { + add_scale = 602.90625; + } else if (fabs(operation->addition_scale - 0.164588) < 0.00001) { + add_scale = 271.921875; + } else if (fabs(operation->addition_scale - 0.204098) < 0.00001) { + add_scale = 262.90625; + } else if (fabs(operation->addition_scale - 0.116532) < 0.00001) { + add_scale = 450.140625; + } else if (fabs(operation->addition_scale - 0.134499) < 0.00001) { + add_scale = 212.1953125; + } else if (fabs(operation->addition_scale - 0.220141) < 0.00001) { + add_scale = 368.28125; + } else if (fabs(operation->addition_scale - 0.094560) < 0.00001) { + add_scale = 416.421875; + } else if (fabs(operation->addition_scale - 0.093230) < 0.00001) { + add_scale = 305.421875; + } else if (fabs(operation->addition_scale - 
0.100618) < 0.00001) { + add_scale = 313.671875; + } else { + add_scale = 0.0; + } + + uint32_t add_scale_bits = fui(add_scale); + /* Taken from + * https://github.com/pytorch/QNNPACK/blob/master/src/qnnpack/requantization.h#L130 + */ + unsigned add_shift = 127 + 31 - 32 - (add_scale_bits >> 23) + 16; + + unsigned scale = ((add_scale_bits >> 9) & 0x7fff); + if (scale < 1 << 14) + scale |= 1 << 14; + + EMIT(REG_DPU_EW_CVT_SCALE_VALUE, + DPU_EW_CVT_SCALE_VALUE_EW_OP_CVT_SHIFT(add_shift - 1) | + DPU_EW_CVT_SCALE_VALUE_EW_OP_CVT_SCALE(scale)); + + EMIT(REG_DPU_EW_RELUX_CMP_VALUE, 0x0); + + if (fabs(operation->addition_scale - 0.213016) < 0.00001) { + EMIT(REG_DPU_OUT_CVT_OFFSET, 0x4); + EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(25914)); + EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24)); + } else if (fabs(operation->addition_scale - 0.244231) < 0.00001) { + EMIT(REG_DPU_OUT_CVT_OFFSET, 0x1); + EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(28927)); + EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24)); + } else if (fabs(operation->addition_scale - 0.283416) < 0.00001) { + EMIT(REG_DPU_OUT_CVT_OFFSET, 0x6); + EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(26050)); + EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24)); + } else if (fabs(operation->addition_scale - 0.171151) < 0.00001) { + EMIT(REG_DPU_OUT_CVT_OFFSET, 0xfffffffd); + EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(28937)); + EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24)); + } else if (fabs(operation->addition_scale - 0.164588) < 0.00001) { + EMIT(REG_DPU_OUT_CVT_OFFSET, 0x1); + EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(24877)); + EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(23)); + } else if (fabs(operation->addition_scale - 0.204098) < 0.00001) { + EMIT(REG_DPU_OUT_CVT_OFFSET, 0x0); + EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(23272)); + 
EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(23)); + } else if (fabs(operation->addition_scale - 0.116532) < 0.00001) { + EMIT(REG_DPU_OUT_CVT_OFFSET, 0xfffffff8); + EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(32292)); + EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24)); + } else if (fabs(operation->addition_scale - 0.134499) < 0.00001) { + EMIT(REG_DPU_OUT_CVT_OFFSET, 0xfffffffb); + EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(24153)); + EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(23)); + } else if (fabs(operation->addition_scale - 0.220141) < 0.00001) { + EMIT(REG_DPU_OUT_CVT_OFFSET, 0xb); + EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(27655)); + EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24)); + } else if (fabs(operation->addition_scale - 0.094560) < 0.00001) { + EMIT(REG_DPU_OUT_CVT_OFFSET, 0x5); + EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(20432)); + EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(23)); + } else if (fabs(operation->addition_scale - 0.093230) < 0.00001) { + EMIT(REG_DPU_OUT_CVT_OFFSET, 0xffffffff); + EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(25449)); + EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(23)); + } else if (fabs(operation->addition_scale - 0.100618) < 0.00001) { + EMIT(REG_DPU_OUT_CVT_OFFSET, offset); + EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(16874)); + EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(23)); + } else if (fabs(operation->addition_scale - 0.422037) < 0.00001) { + EMIT(REG_DPU_OUT_CVT_OFFSET, 0x1); + EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(22559)); + EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24)); + } else if (fabs(operation->addition_scale - 0.364902) < 0.00001) { + EMIT(REG_DPU_OUT_CVT_OFFSET, 0x4); + EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(18589)); + 
EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24)); + } else { + EMIT(REG_DPU_OUT_CVT_OFFSET, 0x6); + EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(27676)); + EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(25)); + } + } else { + EMIT(REG_DPU_EW_CFG, + DPU_EW_CFG_EW_RELU_BYPASS(1) | DPU_EW_CFG_EW_OP_CVT_BYPASS(1) | + DPU_EW_CFG_EW_LUT_BYPASS(1) | DPU_EW_CFG_EW_OP_BYPASS(1) | + DPU_EW_CFG_EW_BYPASS(1)); + EMIT(REG_DPU_EW_CVT_OFFSET_VALUE, 0); + EMIT(REG_DPU_EW_CVT_SCALE_VALUE, DPU_EW_CVT_SCALE_VALUE_EW_OP_CVT_SCALE(1)); + EMIT(REG_DPU_EW_RELUX_CMP_VALUE, 0); + EMIT(REG_DPU_OUT_CVT_OFFSET, offset); + + float conv_scale = + (task->input_scale * task->weights_scale) / task->output_scale; + // DBG("conv_scale %f\n", conv_scale); + uint32_t scale_bits = fui(conv_scale); + /* Taken from + * https://github.com/pytorch/QNNPACK/blob/master/src/qnnpack/requantization.h#L130 + */ + unsigned shift = 127 + 31 - 32 - (scale_bits >> 23) + 16; + + if (operation->truncate_bits > 0) + shift--; + + unsigned scale = ((scale_bits >> 9) & 0x7fff) + 1; + if (scale < 1 << 14) + scale |= 1 << 14; + + EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(scale)); + EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(shift - 1)); + } + + EMIT(REG_DPU_EW_OP_VALUE_0, 0); + EMIT(REG_DPU_EW_OP_VALUE_1, 0); + EMIT(REG_DPU_EW_OP_VALUE_2, 0); + EMIT(REG_DPU_EW_OP_VALUE_3, 0); + EMIT(REG_DPU_EW_OP_VALUE_4, 0); + EMIT(REG_DPU_EW_OP_VALUE_5, 0); + EMIT(REG_DPU_EW_OP_VALUE_6, 0); + EMIT(REG_DPU_EW_OP_VALUE_7, 0); + EMIT(REG_DPU_SURFACE_ADD, DPU_SURFACE_ADD_SURF_ADD(task->surfaces_per_row)); + emit_raw(regs, DPU | 0x1, 0x40c4, 0); + EMIT(REG_DPU_LUT_ACCESS_CFG, 0); + EMIT(REG_DPU_LUT_ACCESS_DATA, 0); + EMIT(REG_DPU_LUT_CFG, 0); + EMIT(REG_DPU_LUT_INFO, 0); + EMIT(REG_DPU_LUT_LE_START, 0); + EMIT(REG_DPU_LUT_LE_END, 0); + EMIT(REG_DPU_LUT_LO_START, 0); + EMIT(REG_DPU_LUT_LO_END, 0); + EMIT(REG_DPU_LUT_LE_SLOPE_SCALE, 0); + 
EMIT(REG_DPU_LUT_LE_SLOPE_SHIFT, 0); + EMIT(REG_DPU_LUT_LO_SLOPE_SCALE, 0); + EMIT(REG_DPU_LUT_LO_SLOPE_SHIFT, 0); + EMIT(REG_DPU_RDMA_RDMA_DATA_CUBE_WIDTH, + DPU_RDMA_RDMA_DATA_CUBE_WIDTH_WIDTH(task->output_width - 1)); + EMIT(REG_DPU_RDMA_RDMA_DATA_CUBE_HEIGHT, + DPU_RDMA_RDMA_DATA_CUBE_HEIGHT_HEIGHT(task->output_height - 1)); + EMIT(REG_DPU_RDMA_RDMA_DATA_CUBE_CHANNEL, + DPU_RDMA_RDMA_DATA_CUBE_CHANNEL_CHANNEL(task->output_channels - 1)); + + if (operation->add_tensor != -1) { + EMIT(REG_DPU_RDMA_RDMA_SRC_BASE_ADDR, + rkt_get_tensor(subgraph, operation->add_tensor)->phys_addr + + task->output_offset); + } else { + EMIT(REG_DPU_RDMA_RDMA_SRC_BASE_ADDR, 0); + } + + EMIT(REG_DPU_RDMA_RDMA_BRDMA_CFG, DPU_RDMA_RDMA_BRDMA_CFG_BRDMA_DATA_USE(1)); + EMIT(REG_DPU_RDMA_RDMA_BS_BASE_ADDR, + rkt_resource(operation->biases)->phys_addr); + EMIT(REG_DPU_RDMA_RDMA_NRDMA_CFG, 0); + EMIT(REG_DPU_RDMA_RDMA_BN_BASE_ADDR, 0); + + unsigned ew_stride = + MAX2(operation->output_width * operation->output_height, 12); + + if (operation->add_tensor != -1) { + EMIT(REG_DPU_RDMA_RDMA_ERDMA_CFG, + DPU_RDMA_RDMA_ERDMA_CFG_ERDMA_DATA_MODE(1) | + DPU_RDMA_RDMA_ERDMA_CFG_ERDMA_DATA_SIZE(1)); + unsigned ew_base_offset = + operation->output_width * operation->output_height * ATOMIC_K_SIZE; + EMIT(REG_DPU_RDMA_RDMA_EW_BASE_ADDR, + rkt_get_tensor(subgraph, operation->add_tensor)->phys_addr + + task->output_offset + ew_base_offset); + EMIT(REG_DPU_RDMA_RDMA_EW_SURF_STRIDE, + DPU_RDMA_RDMA_EW_SURF_STRIDE_EW_SURF_STRIDE(ew_stride)); + } else { + EMIT(REG_DPU_RDMA_RDMA_ERDMA_CFG, DPU_RDMA_RDMA_ERDMA_CFG_ERDMA_DISABLE(1)); + EMIT(REG_DPU_RDMA_RDMA_EW_BASE_ADDR, 0); + EMIT(REG_DPU_RDMA_RDMA_EW_SURF_STRIDE, 0); + } + + uint32_t rdma_feat_mode_cfg = 0x0; + + if (operation->add_tensor != -1) { + rdma_feat_mode_cfg |= DPU_RDMA_RDMA_FEATURE_MODE_CFG_BURST_LEN(15) | + DPU_RDMA_RDMA_FEATURE_MODE_CFG_COMB_USE(5); + } else { + rdma_feat_mode_cfg |= DPU_RDMA_RDMA_FEATURE_MODE_CFG_BURST_LEN(15) | + 
DPU_RDMA_RDMA_FEATURE_MODE_CFG_MRDMA_DISABLE(1); + } + + if (operation->depthwise) + rdma_feat_mode_cfg |= DPU_RDMA_RDMA_FEATURE_MODE_CFG_CONV_MODE(3); + + EMIT(REG_DPU_RDMA_RDMA_FEATURE_MODE_CFG, rdma_feat_mode_cfg); + EMIT(REG_DPU_RDMA_RDMA_SRC_DMA_CFG, 0); + + unsigned surf_notch = + ew_stride + + task->output_width * (operation->output_height - task->output_height); + + if (operation->input_width == 3) { + surf_notch = 15; + } + + if (operation->add_tensor != -1) { + EMIT(REG_DPU_RDMA_RDMA_SURF_NOTCH, + DPU_RDMA_RDMA_SURF_NOTCH_SURF_NOTCH_ADDR(surf_notch)); + } else { + EMIT(REG_DPU_RDMA_RDMA_SURF_NOTCH, 0); + } + + EMIT(REG_DPU_RDMA_RDMA_PAD_CFG, 0); + EMIT(REG_DPU_RDMA_RDMA_WEIGHT, + DPU_RDMA_RDMA_WEIGHT_E_WEIGHT(1) | DPU_RDMA_RDMA_WEIGHT_N_WEIGHT(1) | + DPU_RDMA_RDMA_WEIGHT_B_WEIGHT(1) | DPU_RDMA_RDMA_WEIGHT_M_WEIGHT(1)); + + if (operation->add_tensor != -1) { + EMIT(REG_DPU_RDMA_RDMA_EW_SURF_NOTCH, + DPU_RDMA_RDMA_EW_SURF_NOTCH_EW_SURF_NOTCH(surf_notch)); + } else { + EMIT(REG_DPU_RDMA_RDMA_EW_SURF_NOTCH, 0x0); + } + + if (num_tasks == 1) + util_dynarray_append(regs, uint64_t, 0x0); + else + EMIT(REG_PC_BASE_ADDRESS, 0); + + EMIT(REG_PC_REGISTER_AMOUNTS, 0); + + /* TRM: before op_en, 64'h0041_xxxx_xxxx_xxxx must be set. */ + util_dynarray_append(regs, uint64_t, 0x0041000000000000); + + /* TRM: 64'h0081_0000_007f_0008 will set each block's op_en(CNA, CORE, ..., + * PPU_RDMA). */ + emit_raw(regs, 0x81, REG_PC_OPERATION_ENABLE, + PC_OPERATION_ENABLE_RESERVED_0(14) | PC_OPERATION_ENABLE_OP_EN(1)); +} + +void +rkt_fill_regcmd(struct rkt_ml_subgraph *subgraph, + const struct rkt_operation *operation, + struct util_dynarray *regs, unsigned task_num) +{ + /* + * TODO: We should only need to set all the registers on the regcmd for the first + * task in an operation, but for now set them all to be sure. + */ + fill_first_regcmd(subgraph, operation, regs, task_num); +}
\ No newline at end of file diff --git a/src/gallium/drivers/rocket/rkt_regcmd.h b/src/gallium/drivers/rocket/rkt_regcmd.h new file mode 100644 index 00000000000..ee755e78a97 --- /dev/null +++ b/src/gallium/drivers/rocket/rkt_regcmd.h @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net> + * SPDX-License-Identifier: MIT + */ + +#ifndef RKT_REGCMD_H +#define RKT_REGCMD_H + +#include "rkt_ml.h" + +void rkt_fill_regcmd(struct rkt_ml_subgraph *subgraph, + const struct rkt_operation *operation, + struct util_dynarray *regs, unsigned task_num); + +#endif /* RKT_REGCMD_H */ diff --git a/src/gallium/drivers/rocket/rkt_task.c b/src/gallium/drivers/rocket/rkt_task.c new file mode 100644 index 00000000000..6dbb2784f40 --- /dev/null +++ b/src/gallium/drivers/rocket/rkt_task.c @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net> + * SPDX-License-Identifier: MIT + */ + +#include "rkt_task.h" +#include "rkt_ml.h" + +static unsigned +calc_entries_per_slice(struct rkt_operation *operation) +{ + unsigned bpe = sizeof(uint8_t); + unsigned atomics_per_entry = CBUF_ENTRY_SIZE / FEATURE_ATOMIC_SIZE; + unsigned total_c_atomics = + DIV_ROUND_UP(operation->input_channels * bpe, FEATURE_ATOMIC_SIZE); + unsigned last_c_atomics = total_c_atomics % atomics_per_entry; + unsigned int_c_entries = + (total_c_atomics / atomics_per_entry) * operation->input_width; + unsigned frac_c_entries = + (last_c_atomics == 3) + ? 
operation->input_width
+         : DIV_ROUND_UP(last_c_atomics * operation->input_width,
+                        atomics_per_entry);
+
+   return int_c_entries + frac_c_entries;
+}
+
+/* Number of CBUF banks needed for the whole input: the entries of one slice
+ * times the input height, rounded up to whole banks. */
+static unsigned
+calc_input_banks(struct rkt_operation *operation)
+{
+   unsigned entries_per_slice = calc_entries_per_slice(operation);
+   return DIV_ROUND_UP(entries_per_slice * operation->input_height,
+                       CBUF_ENTRIES_PER_BANK);
+}
+
+/* Number of CBUF banks needed for the weights: kernel width * height * input
+ * channels bytes (times the output channels unless depthwise), rounded up to
+ * entries and then to banks, plus one extra bank. */
+static unsigned
+calc_weights_banks(struct rkt_operation *operation)
+{
+   unsigned bpe = sizeof(uint8_t);
+   unsigned bytes = operation->weights_width * operation->weights_height *
+                    operation->input_channels * bpe;
+   unsigned entries;
+   unsigned banks;
+
+   if (!operation->depthwise)
+      bytes *= operation->output_channels;
+   entries = DIV_ROUND_UP(bytes, CBUF_ENTRY_SIZE);
+   banks = DIV_ROUND_UP(entries, CBUF_ENTRIES_PER_BANK);
+
+   /* Why do we need an extra bank? The calc above might be wrong on this HW */
+   banks++;
+
+   return banks;
+}
+
+/* Returns width * ATOMIC_K_SIZE, in bytes. */
+static unsigned
+calc_line_stride(unsigned width)
+{
+   return width * ATOMIC_K_SIZE * sizeof(uint8_t);
+}
+
+/*
+ * Compute the explicit padding of an operation.
+ *
+ * With "same" padding and a kernel wider than one element, the total padding
+ * along each axis is (output - 1) * stride + kernel - input, clamped at
+ * zero; the left/top side gets half of it (rounded down) and the
+ * right/bottom side the remainder. In every other case all four values are
+ * set to zero.
+ */
+static void
+calc_explicit_padding(const struct rkt_operation *operation,
+                      unsigned *pad_top, unsigned *pad_bottom,
+                      unsigned *pad_left, unsigned *pad_right)
+{
+   if (operation->padding_same && operation->weights_width > 1) {
+      /* Convert from implicit to explicit padding. The operands are
+       * unsigned, so the subtraction is done in signed arithmetic; otherwise
+       * a negative total would wrap around and the clamp to zero could never
+       * take effect. */
+      int pad_along_width =
+         MAX2((int)((operation->output_width - 1) * operation->stride +
+                    operation->weights_width) -
+                 (int)operation->input_width,
+              0);
+      int pad_along_height =
+         MAX2((int)((operation->output_height - 1) * operation->stride +
+                    operation->weights_height) -
+                 (int)operation->input_height,
+              0);
+
+      /* Horizontal padding comes from the width, vertical from the height */
+      *pad_left = pad_along_width / 2;
+      *pad_right = pad_along_width - *pad_left;
+      *pad_top = pad_along_height / 2;
+      *pad_bottom = pad_along_height - *pad_top;
+   } else {
+      *pad_left = 0;
+      *pad_right = 0;
+      *pad_top = 0;
+      *pad_bottom = 0;
+   }
+}
+
+static void
+fill_task(struct rkt_ml_subgraph *subgraph,
+          struct rkt_operation *operation,
+          struct split_task *task)
+{
+
task->stride_x = operation->stride; + task->stride_y = operation->stride; + + task->input_width = operation->input_width; + if (task->input_width == 8 && + (operation->addition_input || operation->add_tensor != -1)) + task->input_width *= 2; + + task->input_height = operation->input_height; + task->input_channels = + ALIGN(MAX2(operation->input_channels, FEATURE_ATOMIC_SIZE), + FEATURE_ATOMIC_SIZE); + task->input_channels_real = operation->input_channels; + task->input_zero_point = operation->input_zero_point; + task->input_scale = operation->input_scale; + + task->output_width = operation->output_width; + task->output_height = operation->output_height; + + task->output_channels_real = operation->output_channels; + task->output_channels = ALIGN(MAX2(operation->output_channels, 32), 32); + if (operation->depthwise) { + if (task->output_channels_real <= 32) + task->output_channels *= 2; + task->output_channels = ALIGN(task->output_channels, 64); + } + + task->output_zero_point = operation->output_zero_point; + task->output_scale = operation->output_scale; + + if (task->input_channels_real == 1 && + (task->output_channels_real > 1 || + (operation->addition_input || operation->add_tensor != -1))) { + task->input_width = MAX2(task->input_width, FEATURE_ATOMIC_SIZE); + task->input_line_stride = + MAX2(calc_line_stride(operation->input_width) / FEATURE_ATOMIC_SIZE, + FEATURE_ATOMIC_SIZE); + + if (operation->input_channels == 32 && operation->input_width == 80) { + task->input_line_stride *= 4; + task->input_surface_stride = (float)task->input_line_stride * + (((float)task->input_height / 4) - 1); + } else + task->input_surface_stride = + (float)task->input_line_stride * (((float)task->input_height) - 1); + } else { + task->input_line_stride = calc_line_stride(operation->input_width) / 4; + task->input_surface_stride = + (float)task->input_line_stride * (((float)task->input_height / 4) - 1); + } + + if (task->input_width == 8 && + (operation->addition_input || 
operation->add_tensor != -1)) {
+      task->input_line_stride /= 2;
+      task->input_surface_stride = 112;
+   }
+
+   int output_line_stride = calc_line_stride(operation->output_width);
+   task->output_surface_stride = output_line_stride * task->output_height;
+   task->output_surface_stride /= FEATURE_ATOMIC_SIZE;
+
+   if (task->input_channels_real == 1)
+      task->input_data_entries = task->input_width * task->input_height;
+   else if (task->input_width == 40 && task->input_channels_real == 40)
+      task->input_data_entries = 40;
+   else
+      task->input_data_entries = DIV_ROUND_UP(
+         task->input_width * 2 *
+            DIV_ROUND_UP(task->input_channels_real, FEATURE_ATOMIC_SIZE),
+         8);
+
+   task->weights_width = operation->weights_width;
+   task->weights_height = operation->weights_height;
+   task->weights_zero_point = operation->weights_zero_point;
+   task->weights_scale = operation->weights_scale;
+
+   if (operation->depthwise)
+      task->weights_kernels = 1;
+   else
+      task->weights_kernels = ALIGN(operation->output_channels, 2);
+
+   task->surfaces_per_row = task->output_width * task->output_height * 2;
+   if (operation->depthwise)
+      task->surfaces_per_row *= 2;
+}
+
+/**
+ * Split an operation into the tasks that get appended to operation->tasks.
+ *
+ * The CBUF banks are divided between weights and input data. When the input
+ * fits in the banks left over for it, a single task covering the full input
+ * height is emitted. Otherwise the input is cut along its height into chunks
+ * of at most available_slices slices, one task per chunk. Consecutive chunks
+ * may overlap; the overlap is recorded in num_overlap_slices and
+ * num_retain_slices. A final pass derives each task's geometry, convolution
+ * count, buffer offsets and bank assignment.
+ *
+ * Also sets operation->reuse_weights_cbuf.
+ *
+ * Function mostly taken from NVDLA.
+ *
+ * \param subgraph   only forwarded to fill_task()
+ * \param operation  operation to split; its tasks array is appended to
+ */
+void
+rkt_split_tasks(struct rkt_ml_subgraph *subgraph,
+                struct rkt_operation *operation)
+{
+   unsigned entries_per_slice = calc_entries_per_slice(operation);
+   unsigned input_banks_required = calc_input_banks(operation);
+   unsigned weights_banks_required = calc_weights_banks(operation);
+   unsigned available_weights_banks = weights_banks_required;
+   unsigned available_input_banks = CBUF_BANKS - weights_banks_required;
+   unsigned pad_top;
+   unsigned pad_bottom;
+   unsigned pad_left;
+   unsigned pad_right;
+
+   calc_explicit_padding(operation, &pad_top, &pad_bottom, &pad_left,
+                         &pad_right);
+
+   if (weights_banks_required + 1 < CBUF_BANKS) {
+      /* Full weights, partial input */
+      operation->reuse_weights_cbuf = true;
+   } else {
+      /* Partial weights, partial input */
+      operation->reuse_weights_cbuf = false;
+      /* NOTE(review): 7 is hard-coded; presumably the input share of
+       * CBUF_BANKS when the weights do not fit - confirm. */
+      available_input_banks = 7;
+      available_weights_banks = CBUF_BANKS - available_input_banks;
+   }
+
+   if (input_banks_required <= available_input_banks) {
+      /* Full weights, full input */
+
+      struct split_task task = {0};
+
+      task.num = 0;
+      fill_task(subgraph, operation, &task);
+      task.input_banks = input_banks_required;
+      task.weights_banks = CBUF_BANKS - task.input_banks;
+      task.input_height = operation->input_height;
+
+      task.pad_top = pad_top;
+      task.pad_bottom = pad_bottom;
+      task.pad_left = pad_left;
+      task.pad_right = pad_right;
+
+      task.atomic_count = task.output_width * task.output_height;
+
+      util_dynarray_append(&operation->tasks, struct split_task, task);
+
+      return;
+   }
+
+   /* The input does not fit: emit the first chunk, which starts at slice 0
+    * and is the only one that carries the top padding.
+    * NOTE(review): assumes entries_per_slice is non-zero and that at least
+    * one slice fits in the available input banks - confirm. */
+   struct split_task task = {0};
+   unsigned available_slices =
+      (CBUF_ENTRIES_PER_BANK * available_input_banks) / entries_per_slice;
+
+   task.num = 0;
+   fill_task(subgraph, operation, &task);
+   task.input_banks = available_input_banks;
+   task.weights_banks = available_weights_banks;
+
+   task.top_slice = 0;
+   task.bottom_slice = available_slices - 1;
+
+   task.pad_top = pad_top;
+   task.pad_left = pad_left;
+   task.pad_right = pad_right;
+
+   util_dynarray_append(&operation->tasks, struct split_task, task);
+
+   /* Emit the remaining chunks, each positioned relative to the previous
+    * task's last slice. */
+   for (unsigned slice = operation->weights_height - pad_top - 1;
+        slice < operation->input_height;) {
+      memset(&task, 0, sizeof(task));
+
+      struct split_task *prev_task = util_dynarray_element(
+         &operation->tasks, struct split_task,
+         util_dynarray_num_elements(&operation->tasks, struct split_task) - 1);
+
+      /* Step in stride increments until just past the previous task's last
+       * slice, then back up one stride. The loop only exits once
+       * slice > prev_task->bottom_slice, so the step back is unconditional. */
+      while (slice <= prev_task->bottom_slice)
+         slice += operation->stride;
+      slice -= operation->stride;
+
+      task.num = util_dynarray_num_elements(&operation->tasks, struct split_task);
+      fill_task(subgraph, operation, &task);
+      task.top_slice = MIN2(slice, prev_task->bottom_slice) -
+                       (operation->weights_height - 1) + operation->stride;
+      task.bottom_slice = task.top_slice + available_slices - 1;
+      task.pad_left = pad_left;
+      task.pad_right = pad_right;
+
+      /* Check if current task is the last one: it is clamped to the end of
+       * the input and is the only one that carries the bottom padding. */
+      if (task.bottom_slice >= operation->input_height - 1) {
+         task.bottom_slice = operation->input_height - 1;
+         task.pad_bottom = pad_bottom;
+         util_dynarray_append(&operation->tasks, struct split_task, task);
+         break;
+      }
+
+      slice = task.top_slice + operation->weights_height - 1;
+      util_dynarray_append(&operation->tasks, struct split_task, task);
+   }
+
+   /* Drop the last task if it starts at or past the end of the input, or
+    * ends past the bottom padding. */
+   struct split_task *last_task = util_dynarray_element(
+      &operation->tasks, struct split_task,
+      util_dynarray_num_elements(&operation->tasks, struct split_task) - 1);
+   if (last_task->top_slice >= operation->input_height ||
+       last_task->bottom_slice >= (operation->input_height + pad_bottom)) {
+      (void)util_dynarray_pop_ptr(&operation->tasks, struct split_task);
+   }
+
+   /* The task list is final from here on, so the count can be read once. */
+   unsigned num_tasks =
+      util_dynarray_num_elements(&operation->tasks, struct split_task);
+
+   /* Determine overlap slices between 2 split chunks */
+   for (unsigned i = 1; i < num_tasks; i++) {
+      struct split_task *prev_task =
+         util_dynarray_element(&operation->tasks, struct split_task, i - 1);
+      struct split_task *cur_task =
+         util_dynarray_element(&operation->tasks, struct split_task, i);
+
+      if (prev_task->bottom_slice >= cur_task->top_slice) {
+         cur_task->num_overlap_slices =
+            prev_task->bottom_slice - cur_task->top_slice + 1;
+         prev_task->num_retain_slices = cur_task->num_overlap_slices;
+      } else {
+         cur_task->num_overlap_slices = 0;
+         prev_task->num_retain_slices = 0;
+      }
+   }
+
+   /* Derive per-task geometry, offsets and bank assignment. */
+   unsigned output_height_processed = 0;
+   for (unsigned i = 0; i < num_tasks; i++) {
+      struct split_task *cur_task =
+         util_dynarray_element(&operation->tasks, struct split_task, i);
+
+      unsigned slice = cur_task->top_slice + (operation->weights_height - 1) -
+                       cur_task->pad_top;
+
+      /* Count the stride steps that fit in this chunk. This uses
+       * bottom_slice before it is clamped below. */
+      while (slice <= cur_task->bottom_slice + cur_task->pad_bottom) {
+         slice += operation->stride;
+         cur_task->convolutions++;
+      }
+
+      cur_task->bottom_slice =
+         MIN2(cur_task->bottom_slice, operation->input_height - 1);
+
+      cur_task->input_height = cur_task->bottom_slice - cur_task->top_slice + 1;
+
+      cur_task->output_width = (cur_task->input_width + cur_task->pad_left +
+                                cur_task->pad_right - operation->weights_width) /
+                                  operation->stride +
+                               1;
+      cur_task->output_height =
+         (cur_task->input_height + cur_task->pad_top + cur_task->pad_bottom -
+          operation->weights_height) /
+            operation->stride +
+         1;
+      cur_task->atomic_count = cur_task->output_width * cur_task->output_height;
+
+      /* Offsets are in units of calc_line_stride(): the input offset follows
+       * the chunk's first slice, the output offset the output rows produced
+       * by the preceding tasks. */
+      cur_task->input_offset =
+         calc_line_stride(operation->input_width) * cur_task->top_slice;
+      cur_task->output_offset =
+         calc_line_stride(operation->output_width) * output_height_processed;
+
+      cur_task->input_banks = available_input_banks;
+      cur_task->weights_banks = available_weights_banks;
+
+      output_height_processed += cur_task->output_height;
+   }
+}
diff --git a/src/gallium/drivers/rocket/rkt_task.h b/src/gallium/drivers/rocket/rkt_task.h
new file mode 100644
index 00000000000..84bb9aa577e
--- /dev/null
+++ b/src/gallium/drivers/rocket/rkt_task.h
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef RKT_TASK_H
+#define RKT_TASK_H
+
+#include "rkt_ml.h"
+
+void rkt_split_tasks(struct rkt_ml_subgraph *subgraph,
+                     struct rkt_operation *operation);
+
+#endif /* RKT_TASK_H */
\ No newline at end of file diff --git a/src/gallium/drivers/rocket/rules-ng.xsd b/src/gallium/drivers/rocket/rules-ng.xsd new file mode 100644 index 00000000000..414dee1d746 --- /dev/null +++ b/src/gallium/drivers/rocket/rules-ng.xsd @@ -0,0 +1,457 @@ +<?xml version="1.0" encoding="UTF-8"?> +<schema xmlns="http://www.w3.org/2001/XMLSchema" + targetNamespace="http://nouveau.freedesktop.org/" + xmlns:rng="http://nouveau.freedesktop.org/" + elementFormDefault="qualified"> + + <annotation> + <documentation> + An updated version of the old rules.xml file from the + RivaTV project. Specifications by Pekka Paalanen, + preliminary attempt by KoalaBR, + first working version by Jakob Bornecrantz. + For specifications, see the file rules-ng-format.txt + in Nouveau CVS module 'rules-ng'. + </documentation> + <documentation>Version 0.1</documentation> + </annotation> + + + <!-- Elements --> + + <element name="database" type="rng:databaseType" /> + <element name="import" type="rng:importType" /> + <element name="copyright" type="rng:copyrightType" /> + <element name="domain" type="rng:domainType" /> + <element name="group" type="rng:groupType" /> + <element name="use-group" type="rng:refType" /> + <element name="array" type="rng:arrayType" /> + <element name="stripe" type="rng:stripeType" /> + <element name="reg64" type="rng:registerType" /> + <element name="reg32" type="rng:registerType" /> + <element name="reg16" type="rng:registerType" /> + <element name="reg8" type="rng:registerType" /> + <element name="bitset" type="rng:bitsetType" /> + <element name="bitfield" type="rng:bitfieldType" /> + <element name="enum" type="rng:enumType" /> + <element name="value" type="rng:valueType" /> + + <!-- Copyright elements --> + <element name="author" type="rng:authorType" /> + <element name="nick" type="rng:nickType" /> + <element name="license" type="rng:docType" /> + + <!-- Documentation elements --> + + <!-- FIXME: allowed only one per parent element --> + <element name="brief" 
type="rng:briefType" /> + + <element name="doc" type="rng:docType" /> + <element name="b" type="rng:textformatType" /> + <element name="i" type="rng:textformatType" /> + <element name="u" type="rng:textformatType" /> + <element name="code" type="rng:textcodeType" /> + <element name="ul" type="rng:listType" /> + <element name="ol" type="rng:listType" /> + <element name="li" type="rng:listitemType" /> + + <!-- Copyright element types --> + + <complexType name="authorType" mixed="true"> + <annotation> + <documentation> + register database author + </documentation> + </annotation> + <choice minOccurs="0" maxOccurs="unbounded"> + <element ref="rng:nick" /> + </choice> + <attribute name="name" type="string" use="required" /> + <attribute name="email" type="string" use="required" /> + </complexType> + + <complexType name="nickType"> + <annotation> + <documentation>nickType</documentation> + </annotation> + <attribute name="name" type="string" use="required" /> + </complexType> + + <!-- Database element types --> + + <complexType name="databaseType"> + <annotation> + <documentation>databaseType</documentation> + </annotation> + <choice minOccurs="0" maxOccurs="unbounded"> + <group ref="rng:docGroup" /> + <group ref="rng:topGroup" /> + </choice> + </complexType> + + <complexType name="importType"> + <annotation> + <documentation>importType</documentation> + </annotation> + <attribute name="file" type="string" use="required" /> + </complexType> + + <complexType name="copyrightType"> + <annotation> + <documentation>copyrightType</documentation> + </annotation> + <choice minOccurs="0" maxOccurs="unbounded"> + <group ref="rng:docGroup" /> + <group ref="rng:topGroup" /> + <element ref="rng:author" /> + <element ref="rng:license" /> + </choice> + <attribute name="year" type="nonNegativeInteger" use="optional" /> + </complexType> + + <complexType name="domainType"> + <annotation> + <documentation>domainType</documentation> + </annotation> + <choice minOccurs="0" 
maxOccurs="unbounded"> + <group ref="rng:docGroup" /> + <group ref="rng:topGroup" /> + <group ref="rng:regarrayGroup" /> + </choice> + <attribute name="name" type="NMTOKEN" use="required" /> + <attribute name="bare" type="rng:Boolean" use="optional" /> + <attribute name="prefix" type="NMTOKENS" use="optional" /> + <attribute name="width" type="rng:DomainWidth" use="optional" /> + <attribute name="size" type="rng:HexOrNumber" use="optional" /> + <attribute name="varset" type="NMTOKEN" use="optional" /> + <attribute name="variants" type="string" use="optional" /> + </complexType> + + <complexType name="groupType"> + <annotation> + <documentation>groupType</documentation> + </annotation> + <choice minOccurs="0" maxOccurs="unbounded"> + <group ref="rng:docGroup" /> + <group ref="rng:topGroup" /> + <group ref="rng:regarrayGroup" /> + </choice> + <attribute name="name" type="NMTOKEN" use="required" /> + </complexType> + + <complexType name="arrayType"> + <annotation> + <documentation>arrayType</documentation> + </annotation> + <choice minOccurs="0" maxOccurs="unbounded"> + <group ref="rng:docGroup" /> + <group ref="rng:topGroup" /> + <group ref="rng:regarrayGroup" /> + </choice> + <attribute name="name" type="NMTOKEN" use="optional" /> + <attribute name="offset" type="rng:HexOrNumber" use="optional" /> + <attribute name="offsets" type="string" use="optional"/> + <attribute name="doffsets" type="string" use="optional"/> + <attribute name="index" type="NMTOKENS" use="optional"/> + <attribute name="stride" type="rng:HexOrNumber" use="required" /> + <attribute name="length" type="rng:HexOrNumber" use="required" /> + <attribute name="varset" type="NMTOKEN" use="optional" /> + <attribute name="variants" type="string" use="optional" /> + <attribute name="usage" type="string" use="optional" /> + </complexType> + + <complexType name="stripeType"> + <annotation> + <documentation>stripeType</documentation> + </annotation> + <choice minOccurs="0" maxOccurs="unbounded"> + <group 
ref="rng:docGroup" /> + <group ref="rng:topGroup" /> + <group ref="rng:regarrayGroup" minOccurs="0" /> + </choice> + <attribute name="name" type="NMTOKEN" use="optional" /> + <attribute name="offset" type="rng:HexOrNumber" use="optional" /> + <attribute name="stride" type="rng:HexOrNumber" use="optional" /> + <attribute name="length" type="rng:HexOrNumber" use="optional" /> + <attribute name="varset" type="NMTOKEN" use="optional" /> + <attribute name="variants" type="string" use="optional" /> + <attribute name="prefix" type="NMTOKENS" use="optional" /> + </complexType> + + <complexType name="registerType"> + <annotation> + <documentation> + registerType used by reg8, reg16, reg32, reg64 + </documentation> + </annotation> + <choice minOccurs="0" maxOccurs="unbounded"> + <group ref="rng:docGroup" /> + <group ref="rng:topGroup" /> + <element ref="rng:value" /> + <element ref="rng:bitfield" /> + </choice> + <attribute name="name" type="NMTOKEN" use="required" /> + <attribute name="offset" type="rng:HexOrNumber" use="required" /> + <attribute name="access" type="rng:Access" default="rw" use="optional" /> + <attribute name="type" type="NMTOKENS" use="optional" /> + <attribute name="shr" type="nonNegativeInteger" use="optional" /> + <attribute name="varset" type="NMTOKEN" use="optional" /> + <attribute name="variants" type="string" use="optional" /> + <attribute name="stride" type="rng:HexOrNumber" use="optional" /> + <attribute name="length" type="rng:HexOrNumber" use="optional" /> + <attribute name="high" type="nonNegativeInteger" use="optional" /> + <attribute name="low" type="nonNegativeInteger" use="optional" /> + <attribute name="pos" type="nonNegativeInteger" use="optional" /> + <attribute name="align" type="nonNegativeInteger" use="optional" /> + <attribute name="radix" type="nonNegativeInteger" use="optional" /> + <attribute name="usage" type="string" use="optional" /> + </complexType> + + <complexType name="bitsetType"> + <annotation> + 
<documentation>bitsetType</documentation> + </annotation> + <choice maxOccurs="unbounded"> + <element ref="rng:bitfield" /> + <group ref="rng:docGroup" /> + <group ref="rng:topGroup" /> + </choice> + <attribute name="name" type="NMTOKEN" use="required" /> + <attribute name="inline" type="rng:Boolean" use="optional" /> + <attribute name="bare" type="rng:Boolean" use="optional" /> + <attribute name="prefix" type="NMTOKENS" use="optional" /> + <attribute name="varset" type="NMTOKEN" use="optional" /> + </complexType> + + <complexType name="bitfieldType"> + <annotation> + <documentation>bitfieldType</documentation> + </annotation> + <choice minOccurs="0" maxOccurs="unbounded"> + <element ref="rng:value" maxOccurs="unbounded" /> + <group ref="rng:docGroup" /> + <group ref="rng:topGroup" /> + </choice> + <attribute name="name" type="NMTOKEN" use="required" /> + <attribute name="high" type="nonNegativeInteger" use="optional" /> + <attribute name="low" type="nonNegativeInteger" use="optional" /> + <attribute name="pos" type="nonNegativeInteger" use="optional" /> + <attribute name="radix" type="nonNegativeInteger" use="optional" /> + <attribute name="align" type="nonNegativeInteger" use="optional" /> + <attribute name="type" type="NMTOKENS" use="optional" /> + <attribute name="varset" type="NMTOKEN" use="optional" /> + <attribute name="variants" type="string" use="optional" /> + <attribute name="addvariant" type="rng:Boolean" use="optional" /> + <attribute name="shr" type="nonNegativeInteger" use="optional" /> + </complexType> + + <complexType name="enumType"> + <annotation> + <documentation>enumType</documentation> + </annotation> + <choice maxOccurs="unbounded"> + <element ref="rng:value" /> + <group ref="rng:docGroup" /> + <group ref="rng:topGroup" /> + </choice> + <attribute name="name" type="NMTOKEN" use="required" /> + <attribute name="inline" type="rng:Boolean" use="optional" /> + <attribute name="bare" type="rng:Boolean" use="optional" /> + <attribute name="prefix" 
type="NMTOKENS" use="optional" /> + <attribute name="varset" type="NMTOKEN" use="optional" /> + </complexType> + + <complexType name="valueType"> + <annotation> + <documentation>valueType</documentation> + </annotation> + <choice minOccurs="0" maxOccurs="unbounded"> + <group ref="rng:docGroup" /> + <group ref="rng:topGroup" /> + </choice> + <attribute name="name" type="NMTOKEN" use="required" /> + <attribute name="value" type="string" use="optional" /> + <attribute name="varset" type="NMTOKEN" use="optional" /> + <attribute name="variants" type="string" use="optional" /> + </complexType> + + <complexType name="refType"> + <annotation> + <documentation>refType</documentation> + </annotation> + <attribute name="ref" type="NMTOKEN" use="required" /> + </complexType> + + + <!-- Documentation element types --> + + <complexType name="briefType"> + <annotation> + <documentation> + brief documentation, no markup + </documentation> + </annotation> + <simpleContent> + <extension base="string" /> + </simpleContent> + </complexType> + + <complexType name="docType" mixed="true"> + <annotation> + <documentation> + root element of documentation sub-tree + </documentation> + </annotation> + <choice minOccurs="0" maxOccurs="unbounded"> + <group ref="rng:textformatGroup" /> + <group ref="rng:listGroup" /> + <element ref="rng:code" /> + </choice> + </complexType> + + <complexType name="textformatType" mixed="true"> + <annotation> + <documentation> + for bold, underline, italics + </documentation> + </annotation> + <choice minOccurs="0" maxOccurs="unbounded"> + <group ref="rng:textformatGroup" /> + </choice> + </complexType> + + <complexType name="textcodeType"> + <simpleContent> + <extension base="string"> + <attribute name="title" type="string" /> + </extension> + </simpleContent> + </complexType> + + <complexType name="listType"> + <annotation> + <documentation> + definition of a list, ordered or unordered + </documentation> + </annotation> + <choice minOccurs="0" 
maxOccurs="unbounded"> + <element ref="rng:li" /> + </choice> + </complexType> + + <complexType name="listitemType" mixed="true"> + <annotation> + <documentation> + items of a list + </documentation> + </annotation> + <choice minOccurs="0" maxOccurs="unbounded"> + <group ref="rng:textformatGroup" /> + <group ref="rng:listGroup" /> + <element ref="rng:code" /> + </choice> + </complexType> + + + + <!-- Attribute value types --> + + <simpleType name="Hexadecimal"> + <restriction base="string"> + <pattern value="0x[0-9a-f]+" /> + <pattern value="0x[0-9A-F]+" /> + <pattern value="[0-9]" /> + </restriction> + </simpleType> + + <simpleType name="HexOrNumber"> + <annotation> + <documentation>HexOrNumber</documentation> + </annotation> + <union memberTypes="rng:Hexadecimal nonNegativeInteger" /> + </simpleType> + + <simpleType name="Boolean"> + <restriction base="string"> + <enumeration value="true" /> + <enumeration value="1" /> + <enumeration value="yes" /> + <enumeration value="false" /> + <enumeration value="0" /> + <enumeration value="no" /> + </restriction> + </simpleType> + + <simpleType name="Access"> + <annotation> + <documentation>Access</documentation> + </annotation> + <restriction base="string"> + <enumeration value="r" /> + <enumeration value="w" /> + <enumeration value="rw" /> + </restriction> + </simpleType> + + <simpleType name="DomainWidth"> + <annotation> + <documentation>DomainWidth</documentation> + </annotation> + <restriction base="string"> + <enumeration value="8" /> + <enumeration value="16" /> + <enumeration value="32" /> + <enumeration value="64" /> + </restriction> + </simpleType> + + + + <!-- Element groups --> + + <group name="topGroup"> + <choice> + <element ref="rng:copyright" /> + <element ref="rng:domain" /> + <element ref="rng:enum" /> + <element ref="rng:group" /> + <element ref="rng:bitset" /> + <element ref="rng:import" /> + </choice> + </group> + + <group name="regarrayGroup"> + <choice> + <element ref="rng:reg64" /> + <element 
ref="rng:reg32" /> + <element ref="rng:reg16" /> + <element ref="rng:reg8" /> + <element ref="rng:array" /> + <element ref="rng:stripe" /> + <element ref="rng:use-group" /> + </choice> + </group> + + <group name="docGroup"> + <choice> + <element ref="rng:brief" /> + <element ref="rng:doc" /> + </choice> + </group> + + <group name="textformatGroup"> + <choice> + <element ref="rng:b" /> + <element ref="rng:i" /> + <element ref="rng:u" /> + </choice> + </group> + + <group name="listGroup"> + <choice> + <element ref="rng:ul" /> + <element ref="rng:ol" /> + </choice> + </group> + +</schema> diff --git a/src/gallium/meson.build b/src/gallium/meson.build index c26e98e6f54..6ba60851984 100644 --- a/src/gallium/meson.build +++ b/src/gallium/meson.build @@ -185,6 +185,12 @@ if with_gallium_lima else driver_lima = declare_dependency() endif +if with_gallium_rocket + subdir('winsys/rocket/drm') + subdir('drivers/rocket') +else + driver_rocket = declare_dependency() +endif if with_gallium_zink subdir('drivers/zink') else diff --git a/src/gallium/targets/dri/meson.build b/src/gallium/targets/dri/meson.build index 5ecdc7be6e7..6d3ccafc63e 100644 --- a/src/gallium/targets/dri/meson.build +++ b/src/gallium/targets/dri/meson.build @@ -62,7 +62,7 @@ libgallium_dri = shared_library( driver_kmsro, driver_v3d, driver_vc4, driver_freedreno, driver_etnaviv, driver_tegra, driver_i915, driver_svga, driver_virgl, driver_panfrost, driver_iris, driver_lima, driver_zink, driver_d3d12, - driver_asahi, driver_crocus + driver_asahi, driver_crocus, driver_rocket ], install : true, name_suffix : libname_suffix, diff --git a/src/gallium/targets/dril/meson.build b/src/gallium/targets/dril/meson.build index 556047c2638..80cbef3f039 100644 --- a/src/gallium/targets/dril/meson.build +++ b/src/gallium/targets/dril/meson.build @@ -124,7 +124,8 @@ foreach d : [[with_gallium_kmsro, [ [with_gallium_lima, 'lima_dri.so'], [with_gallium_d3d12, 'd3d12_dri.so'], [with_gallium_zink, 'zink_dri.so'], - 
[with_gallium_asahi, 'asahi_dri.so']]
+             [with_gallium_asahi, 'asahi_dri.so'],
+             [with_gallium_rocket, 'rocket_dri.so']]
   if d[0]
     dril_drivers += d[1]
   endif
diff --git a/src/gallium/winsys/rocket/drm/meson.build b/src/gallium/winsys/rocket/drm/meson.build
new file mode 100644
index 00000000000..55f65803810
--- /dev/null
+++ b/src/gallium/winsys/rocket/drm/meson.build
@@ -0,0 +1,13 @@
+# Copyright 2017 Broadcom
+# SPDX-License-Identifier: MIT
+
+librocketwinsys = static_library(
+  'rocketwinsys',
+  files('rkt_drm_winsys.c'),
+  include_directories : [
+    inc_src, inc_include,
+    inc_gallium, inc_gallium_aux, inc_gallium_drivers,
+  ],
+  gnu_symbol_visibility : 'hidden',
+  dependencies: [idep_mesautil],
+)
diff --git a/src/gallium/winsys/rocket/drm/rkt_drm_public.h b/src/gallium/winsys/rocket/drm/rkt_drm_public.h
new file mode 100644
index 00000000000..5138801758f
--- /dev/null
+++ b/src/gallium/winsys/rocket/drm/rkt_drm_public.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright 2014 Broadcom
+ * Copyright 2018 Alyssa Rosenzweig
+ * Copyright 2025 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef __RKT_DRM_PUBLIC_H__
+#define __RKT_DRM_PUBLIC_H__
+
+struct pipe_screen;
+struct pipe_screen_config;
+
+struct pipe_screen *
+rkt_drm_screen_create(int drmFD, const struct pipe_screen_config *config);
+
+#endif /* __RKT_DRM_PUBLIC_H__ */
diff --git a/src/gallium/winsys/rocket/drm/rkt_drm_winsys.c b/src/gallium/winsys/rocket/drm/rkt_drm_winsys.c
new file mode 100644
index 00000000000..cbba3534e1a
--- /dev/null
+++ b/src/gallium/winsys/rocket/drm/rkt_drm_winsys.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2014 Broadcom
+ * Copyright 2018 Alyssa Rosenzweig
+ * Copyright 2025 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "util/os_file.h"
+#include "util/u_screen.h"
+
+#include "rocket/rkt_device.h"
+#include "rkt_drm_public.h"
+
+/**
+ * Create, or look up, the pipe_screen for a Rocket DRM device.
+ *
+ * A close-on-exec duplicate of fd, not fd itself, is handed to
+ * u_pipe_screen_lookup_or_create() together with rkt_screen_create.
+ *
+ * Returns NULL when the descriptor cannot be duplicated, otherwise whatever
+ * u_pipe_screen_lookup_or_create() returns.
+ */
+struct pipe_screen *
+rkt_drm_screen_create(int fd, const struct pipe_screen_config *config)
+{
+   int dup_fd = os_dupfd_cloexec(fd);
+
+   /* os_dupfd_cloexec() failed: don't pass an invalid descriptor on. */
+   if (dup_fd < 0)
+      return NULL;
+
+   return u_pipe_screen_lookup_or_create(dup_fd, config, NULL,
+                                         rkt_screen_create);
+}