
[DM/FEATURE] Support NVME (#9591)

* [DM/FEATURE] Support NVME

1. Support PRP and SGL (NVME >= v1.1) transports.
2. Support MSI/MSI-X for IO queues.
3. Support NVME on PCI.

Signed-off-by: GuEe-GUI <2991707448@qq.com>
GUI 5 months ago
parent commit 945114fd59

+ 1 - 0
components/drivers/Kconfig

@@ -22,6 +22,7 @@ rsource "graphic/Kconfig"
 rsource "hwcrypto/Kconfig"
 rsource "wlan/Kconfig"
 rsource "block/Kconfig"
+rsource "nvme/Kconfig"
 rsource "scsi/Kconfig"
 rsource "virtio/Kconfig"
 rsource "dma/Kconfig"

+ 899 - 0
components/drivers/include/drivers/nvme.h

@@ -0,0 +1,899 @@
+/*
+ * Copyright (c) 2006-2023, RT-Thread Development Team
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Change Logs:
+ * Date           Author       Notes
+ * 2023-02-25     GuEe-GUI     the first version
+ */
+
+#ifndef __NVME_H__
+#define __NVME_H__
+
+#include <rthw.h>
+#include <rtthread.h>
+#include <drivers/blk.h>
+
+#define NVME_RSVD(offset, bytes_size)   rt_uint8_t __rsvd##offset[bytes_size]
+
+enum
+{
+    /*
+     * Generic Command Status:
+     */
+    RT_NVME_SC_SUCCESS                          = 0x0,
+    RT_NVME_SC_INVALID_OPCODE                   = 0x1,
+    RT_NVME_SC_INVALID_FIELD                    = 0x2,
+    RT_NVME_SC_CMDID_CONFLICT                   = 0x3,
+    RT_NVME_SC_DATA_XFER_ERROR                  = 0x4,
+    RT_NVME_SC_POWER_LOSS                       = 0x5,
+    RT_NVME_SC_INTERNAL                         = 0x6,
+    RT_NVME_SC_ABORT_REQ                        = 0x7,
+    RT_NVME_SC_ABORT_QUEUE                      = 0x8,
+    RT_NVME_SC_FUSED_FAIL                       = 0x9,
+    RT_NVME_SC_FUSED_MISSING                    = 0xa,
+    RT_NVME_SC_INVALID_NS                       = 0xb,
+    RT_NVME_SC_CMD_SEQ_ERROR                    = 0xc,
+    RT_NVME_SC_SGL_INVALID_LAST                 = 0xd,
+    RT_NVME_SC_SGL_INVALID_COUNT                = 0xe,
+    RT_NVME_SC_SGL_INVALID_DATA                 = 0xf,
+    RT_NVME_SC_SGL_INVALID_METADATA             = 0x10,
+    RT_NVME_SC_SGL_INVALID_TYPE                 = 0x11,
+    RT_NVME_SC_CMB_INVALID_USE                  = 0x12,
+    RT_NVME_SC_PRP_INVALID_OFFSET               = 0x13,
+    RT_NVME_SC_ATOMIC_WU_EXCEEDED               = 0x14,
+    RT_NVME_SC_OP_DENIED                        = 0x15,
+    RT_NVME_SC_SGL_INVALID_OFFSET               = 0x16,
+    RT_NVME_SC_RESERVED                         = 0x17,
+    RT_NVME_SC_HOST_ID_INCONSIST                = 0x18,
+    RT_NVME_SC_KA_TIMEOUT_EXPIRED               = 0x19,
+    RT_NVME_SC_KA_TIMEOUT_INVALID               = 0x1a,
+    RT_NVME_SC_ABORTED_PREEMPT_ABORT            = 0x1b,
+    RT_NVME_SC_SANITIZE_FAILED                  = 0x1c,
+    RT_NVME_SC_SANITIZE_IN_PROGRESS             = 0x1d,
+    RT_NVME_SC_SGL_INVALID_GRANULARITY          = 0x1e,
+    RT_NVME_SC_CMD_NOT_SUP_CMB_QUEUE            = 0x1f,
+    RT_NVME_SC_NS_WRITE_PROTECTED               = 0x20,
+    RT_NVME_SC_CMD_INTERRUPTED                  = 0x21,
+    RT_NVME_SC_TRANSIENT_TR_ERR                 = 0x22,
+    RT_NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY    = 0x24,
+    RT_NVME_SC_INVALID_IO_CMD_SET               = 0x2c,
+
+    RT_NVME_SC_LBA_RANGE                        = 0x80,
+    RT_NVME_SC_CAP_EXCEEDED                     = 0x81,
+    RT_NVME_SC_NS_NOT_READY                     = 0x82,
+    RT_NVME_SC_RESERVATION_CONFLICT             = 0x83,
+    RT_NVME_SC_FORMAT_IN_PROGRESS               = 0x84,
+
+    /*
+     * Command Specific Status:
+     */
+    RT_NVME_SC_CQ_INVALID                       = 0x100,
+    RT_NVME_SC_QID_INVALID                      = 0x101,
+    RT_NVME_SC_QUEUE_SIZE                       = 0x102,
+    RT_NVME_SC_ABORT_LIMIT                      = 0x103,
+    RT_NVME_SC_ABORT_MISSING                    = 0x104,
+    RT_NVME_SC_ASYNC_LIMIT                      = 0x105,
+    RT_NVME_SC_FIRMWARE_SLOT                    = 0x106,
+    RT_NVME_SC_FIRMWARE_IMAGE                   = 0x107,
+    RT_NVME_SC_INVALID_VECTOR                   = 0x108,
+    RT_NVME_SC_INVALID_LOG_PAGE                 = 0x109,
+    RT_NVME_SC_INVALID_FORMAT                   = 0x10a,
+    RT_NVME_SC_FW_NEEDS_CONV_RESET              = 0x10b,
+    RT_NVME_SC_INVALID_QUEUE                    = 0x10c,
+    RT_NVME_SC_FEATURE_NOT_SAVEABLE             = 0x10d,
+    RT_NVME_SC_FEATURE_NOT_CHANGEABLE           = 0x10e,
+    RT_NVME_SC_FEATURE_NOT_PER_NS               = 0x10f,
+    RT_NVME_SC_FW_NEEDS_SUBSYS_RESET            = 0x110,
+    RT_NVME_SC_FW_NEEDS_RESET                   = 0x111,
+    RT_NVME_SC_FW_NEEDS_MAX_TIME                = 0x112,
+    RT_NVME_SC_FW_ACTIVATE_PROHIBITED           = 0x113,
+    RT_NVME_SC_OVERLAPPING_RANGE                = 0x114,
+    RT_NVME_SC_NS_INSUFFICIENT_CAP              = 0x115,
+    RT_NVME_SC_NS_ID_UNAVAILABLE                = 0x116,
+    RT_NVME_SC_NS_ALREADY_ATTACHED              = 0x118,
+    RT_NVME_SC_NS_IS_PRIVATE                    = 0x119,
+    RT_NVME_SC_NS_NOT_ATTACHED                  = 0x11a,
+    RT_NVME_SC_THIN_PROV_NOT_SUPP               = 0x11b,
+    RT_NVME_SC_CTRL_LIST_INVALID                = 0x11c,
+    RT_NVME_SC_SELT_TEST_IN_PROGRESS            = 0x11d,
+    RT_NVME_SC_BP_WRITE_PROHIBITED              = 0x11e,
+    RT_NVME_SC_CTRL_ID_INVALID                  = 0x11f,
+    RT_NVME_SC_SEC_CTRL_STATE_INVALID           = 0x120,
+    RT_NVME_SC_CTRL_RES_NUM_INVALID             = 0x121,
+    RT_NVME_SC_RES_ID_INVALID                   = 0x122,
+    RT_NVME_SC_PMR_SAN_PROHIBITED               = 0x123,
+    RT_NVME_SC_ANA_GROUP_ID_INVALID             = 0x124,
+    RT_NVME_SC_ANA_ATTACH_FAILED                = 0x125,
+
+    /*
+     * I/O Command Set Specific - NVM commands:
+     */
+    RT_NVME_SC_BAD_ATTRIBUTES                   = 0x180,
+    RT_NVME_SC_INVALID_PI                       = 0x181,
+    RT_NVME_SC_READ_ONLY                        = 0x182,
+    RT_NVME_SC_ONCS_NOT_SUPPORTED               = 0x183,
+
+    /*
+     * I/O Command Set Specific - Fabrics commands:
+     */
+    RT_NVME_SC_CONNECT_FORMAT                   = 0x180,
+    RT_NVME_SC_CONNECT_CTRL_BUSY                = 0x181,
+    RT_NVME_SC_CONNECT_INVALID_PARAM            = 0x182,
+    RT_NVME_SC_CONNECT_RESTART_DISC             = 0x183,
+    RT_NVME_SC_CONNECT_INVALID_HOST             = 0x184,
+
+    RT_NVME_SC_DISCOVERY_RESTART                = 0x190,
+    RT_NVME_SC_AUTH_REQUIRED                    = 0x191,
+
+    /*
+     * I/O Command Set Specific - Zoned commands:
+     */
+    RT_NVME_SC_ZONE_BOUNDARY_ERROR              = 0x1b8,
+    RT_NVME_SC_ZONE_FULL                        = 0x1b9,
+    RT_NVME_SC_ZONE_READ_ONLY                   = 0x1ba,
+    RT_NVME_SC_ZONE_OFFLINE                     = 0x1bb,
+    RT_NVME_SC_ZONE_INVALID_WRITE               = 0x1bc,
+    RT_NVME_SC_ZONE_TOO_MANY_ACTIVE             = 0x1bd,
+    RT_NVME_SC_ZONE_TOO_MANY_OPEN               = 0x1be,
+    RT_NVME_SC_ZONE_INVALID_TRANSITION          = 0x1bf,
+
+    /*
+     * Media and Data Integrity Errors:
+     */
+    RT_NVME_SC_WRITE_FAULT                      = 0x280,
+    RT_NVME_SC_READ_ERROR                       = 0x281,
+    RT_NVME_SC_GUARD_CHECK                      = 0x282,
+    RT_NVME_SC_APPTAG_CHECK                     = 0x283,
+    RT_NVME_SC_REFTAG_CHECK                     = 0x284,
+    RT_NVME_SC_COMPARE_FAILED                   = 0x285,
+    RT_NVME_SC_ACCESS_DENIED                    = 0x286,
+    RT_NVME_SC_UNWRITTEN_BLOCK                  = 0x287,
+
+    /*
+     * Path-related Errors:
+     */
+    RT_NVME_SC_INTERNAL_PATH_ERROR              = 0x300,
+    RT_NVME_SC_ANA_PERSISTENT_LOSS              = 0x301,
+    RT_NVME_SC_ANA_INACCESSIBLE                 = 0x302,
+    RT_NVME_SC_ANA_TRANSITION                   = 0x303,
+    RT_NVME_SC_CTRL_PATH_ERROR                  = 0x360,
+    RT_NVME_SC_HOST_PATH_ERROR                  = 0x370,
+    RT_NVME_SC_HOST_ABORTED_CMD                 = 0x371,
+
+    RT_NVME_SC_CRD                              = 0x1800,
+    RT_NVME_SC_MORE                             = 0x2000,
+    RT_NVME_SC_DNR                              = 0x4000,
+};
+
+/* Admin commands */
+enum
+{
+    RT_NVME_ADMIN_OPCODE_DELETE_SQ          = 0x00,
+    RT_NVME_ADMIN_OPCODE_CREATE_SQ          = 0x01,
+    RT_NVME_ADMIN_OPCODE_GET_LOG_PAGE       = 0x02,
+    RT_NVME_ADMIN_OPCODE_DELETE_CQ          = 0x04,
+    RT_NVME_ADMIN_OPCODE_CREATE_CQ          = 0x05,
+    RT_NVME_ADMIN_OPCODE_IDENTIFY           = 0x06,
+    RT_NVME_ADMIN_OPCODE_ABORT_CMD          = 0x08,
+    RT_NVME_ADMIN_OPCODE_SET_FEATURES       = 0x09,
+    RT_NVME_ADMIN_OPCODE_GET_FEATURES       = 0x0a,
+    RT_NVME_ADMIN_OPCODE_ASYNC_EVENT        = 0x0c,
+    RT_NVME_ADMIN_OPCODE_NS_MGMT            = 0x0d,
+    RT_NVME_ADMIN_OPCODE_ACTIVATE_FW        = 0x10,
+    RT_NVME_ADMIN_OPCODE_DOWNLOAD_FW        = 0x11,
+    RT_NVME_ADMIN_OPCODE_DEV_SELF_TEST      = 0x14,
+    RT_NVME_ADMIN_OPCODE_NS_ATTACH          = 0x15,
+    RT_NVME_ADMIN_OPCODE_KEEP_ALIVE         = 0x18,
+    RT_NVME_ADMIN_OPCODE_DIRECTIVE_SEND     = 0x19,
+    RT_NVME_ADMIN_OPCODE_DIRECTIVE_RECV     = 0x1a,
+    RT_NVME_ADMIN_OPCODE_VIRTUAL_MGMT       = 0x1c,
+    RT_NVME_ADMIN_OPCODE_NVME_MI_SEND       = 0x1d,
+    RT_NVME_ADMIN_OPCODE_NVME_MI_RECV       = 0x1e,
+    RT_NVME_ADMIN_OPCODE_DBBUF              = 0x7c,
+    RT_NVME_ADMIN_OPCODE_FORMAT_NVM         = 0x80,
+    RT_NVME_ADMIN_OPCODE_SECURITY_SEND      = 0x81,
+    RT_NVME_ADMIN_OPCODE_SECURITY_RECV      = 0x82,
+    RT_NVME_ADMIN_OPCODE_SANITIZE_NVM       = 0x84,
+    RT_NVME_ADMIN_OPCODE_GET_LBA_STATUS     = 0x86,
+    RT_NVME_ADMIN_OPCODE_VENDOR_START       = 0xc0,
+};
+
+/* I/O commands */
+enum
+{
+    RT_NVME_CMD_FLUSH           = 0x00,
+    RT_NVME_CMD_WRITE           = 0x01,
+    RT_NVME_CMD_READ            = 0x02,
+    RT_NVME_CMD_WRITE_UNCOR     = 0x04,
+    RT_NVME_CMD_COMPARE         = 0x05,
+    RT_NVME_CMD_WRITE_ZEROES    = 0x08,
+    RT_NVME_CMD_DSM             = 0x09,
+    RT_NVME_CMD_VERIFY          = 0x0c,
+    RT_NVME_CMD_RESV_REGISTER   = 0x0d,
+    RT_NVME_CMD_RESV_REPORT     = 0x0e,
+    RT_NVME_CMD_RESV_ACQUIRE    = 0x11,
+    RT_NVME_CMD_RESV_RELEASE    = 0x15,
+    RT_NVME_CMD_ZONE_MGMT_SEND  = 0x79,
+    RT_NVME_CMD_ZONE_MGMT_RECV  = 0x7a,
+    RT_NVME_CMD_ZONE_APPEND     = 0x7d,
+    RT_NVME_CMD_VENDOR_START    = 0x80,
+};
+
+enum
+{
+    RT_NVME_PSDT_PRP                    = 0x0,
+    RT_NVME_PSDT_SGL_MPTR_CONTIGUOUS    = 0x1,
+    RT_NVME_PSDT_SGL_MPTR_SGL           = 0x2,
+};
+
+/* Command flags */
+enum
+{
+    RT_NVME_CMD_FLAGS_FUSE_SHIFT        = 0x00,
+    RT_NVME_CMD_FLAGS_PSDT_SHIFT        = 0x06,
+};
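
/*
 * Illustrative note (not part of the patch): a command selects PRP or SGL
 * addressing by placing one of the RT_NVME_PSDT_* values into the PSDT bits
 * of its flags byte, e.g.
 *
 *     cmd.rw.flags = RT_NVME_PSDT_SGL_MPTR_CONTIGUOUS << RT_NVME_CMD_FLAGS_PSDT_SHIFT;
 *
 * Leaving flags at zero keeps the default PRP addressing (RT_NVME_PSDT_PRP);
 * nvme_blk_rw() in nvme.c below does this with the controller's sgl_mode field.
 */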
+
+struct rt_nvme_command_common
+{
+    rt_uint8_t  opcode;
+    rt_uint8_t  flags;
+    rt_uint16_t cmdid;
+    rt_le32_t   nsid;
+    rt_le32_t   cmd_dw2[2];
+    rt_le64_t   metadata;
+    rt_le64_t   prp1;
+    rt_le64_t   prp2;
+    rt_le32_t   cmd_dw10[6];
+};
+
+rt_packed(struct rt_nvme_sgl_desc
+{
+    rt_le64_t address;
+    rt_le32_t length;
+    rt_uint8_t reserved[3];
+#define SGL_DESC_TYPE_DATA_BLOCK        0x0
+#define SGL_DESC_TYPE_BIT_BUCKET        0x1
+#define SGL_DESC_TYPE_SEGMENT           0x2
+#define SGL_DESC_TYPE_LAST_SEGMENT      0x3
+#define SGL_DESC_TYPE_KEYED_DATA_BLOCK  0x4
+#define SGL_DESC_TYPE_VENDOR_SPECIFIC   0xf
+    rt_uint8_t sgl_identify;
+});
+
+struct rt_nvme_command_rw
+{
+    rt_uint8_t  opcode;
+    rt_uint8_t  flags;
+    rt_uint16_t cmdid;
+    rt_le32_t   nsid;
+    NVME_RSVD(8, 8);
+    rt_le64_t   metadata;
+    union
+    {
+        struct
+        {
+            rt_le64_t prp1;
+            rt_le64_t prp2;
+        };
+        struct rt_nvme_sgl_desc sgl;
+    };
+    rt_le64_t   slba;
+    rt_le16_t   length;
+    rt_le16_t   control;
+    rt_le32_t   dsmgmt;
+    rt_le32_t   reftag;
+    rt_le16_t   apptag;
+    rt_le16_t   appmask;
+};
+
+enum
+{
+    RT_NVME_RW_LR                   = 1 << 15,
+    RT_NVME_RW_FUA                  = 1 << 14,
+    RT_NVME_RW_APPEND_PIREMAP       = 1 << 9,
+    RT_NVME_RW_DSM_FREQ_UNSPEC      = 0,
+    RT_NVME_RW_DSM_FREQ_TYPICAL     = 1,
+    RT_NVME_RW_DSM_FREQ_RARE        = 2,
+    RT_NVME_RW_DSM_FREQ_READS       = 3,
+    RT_NVME_RW_DSM_FREQ_WRITES      = 4,
+    RT_NVME_RW_DSM_FREQ_RW          = 5,
+    RT_NVME_RW_DSM_FREQ_ONCE        = 6,
+    RT_NVME_RW_DSM_FREQ_PREFETCH    = 7,
+    RT_NVME_RW_DSM_FREQ_TEMP        = 8,
+    RT_NVME_RW_DSM_LATENCY_NONE     = 0 << 4,
+    RT_NVME_RW_DSM_LATENCY_IDLE     = 1 << 4,
+    RT_NVME_RW_DSM_LATENCY_NORM     = 2 << 4,
+    RT_NVME_RW_DSM_LATENCY_LOW      = 3 << 4,
+    RT_NVME_RW_DSM_SEQ_REQ          = 1 << 6,
+    RT_NVME_RW_DSM_COMPRESSED       = 1 << 7,
+    RT_NVME_RW_PRINFO_PRCHK_REF     = 1 << 10,
+    RT_NVME_RW_PRINFO_PRCHK_APP     = 1 << 11,
+    RT_NVME_RW_PRINFO_PRCHK_GUARD   = 1 << 12,
+    RT_NVME_RW_PRINFO_PRACT         = 1 << 13,
+    RT_NVME_RW_DTYPE_STREAMS        = 1 << 4,
+    RT_NVME_WZ_DEAC                 = 1 << 9,
+};
+
+enum
+{
+    RT_NVME_QUEUE_PHYS_CONTIG   = (1 << 0),
+    RT_NVME_CQ_IRQ_ENABLED      = (1 << 1),
+    RT_NVME_SQ_PRIO_URGENT      = (0 << 1),
+    RT_NVME_SQ_PRIO_HIGH        = (1 << 1),
+    RT_NVME_SQ_PRIO_MEDIUM      = (2 << 1),
+    RT_NVME_SQ_PRIO_LOW         = (3 << 1),
+    RT_NVME_FEAT_ARBITRATION    = 0x01,
+    RT_NVME_FEAT_POWER_MGMT     = 0x02,
+    RT_NVME_FEAT_LBA_RANGE      = 0x03,
+    RT_NVME_FEAT_TEMP_THRESH    = 0x04,
+    RT_NVME_FEAT_ERR_RECOVERY   = 0x05,
+    RT_NVME_FEAT_VOLATILE_WC    = 0x06,
+    RT_NVME_FEAT_NUM_QUEUES     = 0x07,
+    RT_NVME_FEAT_IRQ_COALESCE   = 0x08,
+    RT_NVME_FEAT_IRQ_CONFIG     = 0x09,
+    RT_NVME_FEAT_WRITE_ATOMIC   = 0x0a,
+    RT_NVME_FEAT_ASYNC_EVENT    = 0x0b,
+    RT_NVME_FEAT_AUTO_PST       = 0x0c,
+    RT_NVME_FEAT_SW_PROGRESS    = 0x80,
+    RT_NVME_FEAT_HOST_ID        = 0x81,
+    RT_NVME_FEAT_RESV_MASK      = 0x82,
+    RT_NVME_FEAT_RESV_PERSIST   = 0x83,
+    RT_NVME_LOG_ERROR           = 0x01,
+    RT_NVME_LOG_SMART           = 0x02,
+    RT_NVME_LOG_FW_SLOT         = 0x03,
+    RT_NVME_LOG_RESERVATION     = 0x80,
+    RT_NVME_FWACT_REPL          = (0 << 3),
+    RT_NVME_FWACT_REPL_ACTV     = (1 << 3),
+    RT_NVME_FWACT_ACTV          = (2 << 3),
+};
+
+struct rt_nvme_command_identify
+{
+    rt_uint8_t  opcode;
+    rt_uint8_t  flags;
+    rt_uint16_t cmdid;
+    rt_le32_t   nsid;
+    NVME_RSVD(8, 16);
+    rt_le64_t   prp1;
+    rt_le64_t   prp2;
+    rt_le32_t   cns;
+    NVME_RSVD(64, 20);
+};
+
+struct rt_nvme_command_features
+{
+    rt_uint8_t  opcode;
+    rt_uint8_t  flags;
+    rt_uint16_t cmdid;
+    rt_le32_t   nsid;
+    NVME_RSVD(8, 16);
+    rt_le64_t   prp1;
+    rt_le64_t   prp2;
+    rt_le32_t   fid;
+    rt_le32_t   dword11;
+    NVME_RSVD(68, 16);
+};
+
+struct rt_nvme_command_create_cq
+{
+    rt_uint8_t  opcode;
+    rt_uint8_t  flags;
+    rt_uint16_t cmdid;
+    NVME_RSVD(4, 20);
+    rt_le64_t   prp1;
+    NVME_RSVD(32, 8);
+    rt_le16_t   cqid;
+    rt_le16_t   qsize;
+    rt_le16_t   cq_flags;
+    rt_le16_t   irq_vector;
+    NVME_RSVD(104, 16);
+};
+
+struct rt_nvme_command_create_sq
+{
+    rt_uint8_t  opcode;
+    rt_uint8_t  flags;
+    rt_uint16_t cmdid;
+    NVME_RSVD(4, 20);
+    rt_le64_t   prp1;
+    NVME_RSVD(32, 8);
+    rt_le16_t   sqid;
+    rt_le16_t   qsize;
+    rt_le16_t   sq_flags;
+    rt_le16_t   cqid;
+    NVME_RSVD(104, 16);
+};
+
+struct rt_nvme_command_delete_queue
+{
+    rt_uint8_t  opcode;
+    rt_uint8_t  flags;
+    rt_uint16_t cmdid;
+    NVME_RSVD(4, 36);
+    rt_le16_t   qid;
+    NVME_RSVD(42, 22);
+};
+
+struct rt_nvme_command_write_zeroes
+{
+    rt_uint8_t  opcode;
+    rt_uint8_t  flags;
+    rt_uint16_t cmdid;
+    rt_le32_t   nsid;
+    NVME_RSVD(8, 8);
+    rt_le64_t   metadata;
+    rt_le64_t   prp1;
+    rt_le64_t   prp2;
+    rt_le64_t   slba;
+    rt_le16_t   length;
+    rt_le16_t   control;
+    rt_le32_t   dsmgmt;
+    rt_le32_t   reftag;
+    rt_le16_t   apptag;
+    rt_le16_t   appmask;
+};
+
+struct rt_nvme_command
+{
+    union
+    {
+        struct rt_nvme_command_common common;
+        struct rt_nvme_command_rw rw;
+        struct rt_nvme_command_identify identify;
+        struct rt_nvme_command_features features;
+        struct rt_nvme_command_create_cq create_cq;
+        struct rt_nvme_command_create_sq create_sq;
+        struct rt_nvme_command_delete_queue delete_queue;
+        struct rt_nvme_command_write_zeroes write_zeroes;
+    };
+};
+
+struct rt_nvme_completion
+{
+    union
+    {
+        rt_le16_t  u16;
+        rt_le32_t  u32;
+        rt_le64_t  u64;
+    } result;
+    rt_le16_t   sq_head;    /* How much of this queue may be reclaimed */
+    rt_le16_t   sq_id;      /* Submission queue that generated this entry */
+    rt_uint16_t cmdid;      /* Which command completed */
+    rt_le16_t   status;     /* Command status */
+};
+
+enum
+{
+    RT_NVME_REG_CAP         = 0x0000,   /* Controller Capabilities */
+    RT_NVME_REG_VS          = 0x0008,   /* Version */
+    RT_NVME_REG_INTMS       = 0x000c,   /* Interrupt Mask Set */
+    RT_NVME_REG_INTMC       = 0x0010,   /* Interrupt Mask Clear */
+    RT_NVME_REG_CC          = 0x0014,   /* Controller Configuration */
+    RT_NVME_REG_CSTS        = 0x001c,   /* Controller Status */
+    RT_NVME_REG_NSSR        = 0x0020,   /* NVM Subsystem Reset */
+    RT_NVME_REG_AQA         = 0x0024,   /* Admin Queue Attributes */
+    RT_NVME_REG_ASQ         = 0x0028,   /* Admin SQ Base Address */
+    RT_NVME_REG_ACQ         = 0x0030,   /* Admin CQ Base Address */
+    RT_NVME_REG_CMBLOC      = 0x0038,   /* Controller Memory Buffer Location */
+    RT_NVME_REG_CMBSZ       = 0x003c,   /* Controller Memory Buffer Size */
+    RT_NVME_REG_BPINFO      = 0x0040,   /* Boot Partition Information */
+    RT_NVME_REG_BPRSEL      = 0x0044,   /* Boot Partition Read Select */
+    RT_NVME_REG_BPMBL       = 0x0048,   /* Boot Partition Memory Buffer Location */
+    RT_NVME_REG_CMBMSC      = 0x0050,   /* Controller Memory Buffer Memory Space Control */
+    RT_NVME_REG_CRTO        = 0x0068,   /* Controller Ready Timeouts */
+    RT_NVME_REG_PMRCAP      = 0x0e00,   /* Persistent Memory Capabilities */
+    RT_NVME_REG_PMRCTL      = 0x0e04,   /* Persistent Memory Region Control */
+    RT_NVME_REG_PMRSTS      = 0x0e08,   /* Persistent Memory Region Status */
+    RT_NVME_REG_PMREBS      = 0x0e0c,   /* Persistent Memory Region Elasticity Buffer Size */
+    RT_NVME_REG_PMRSWTP     = 0x0e10,   /* Persistent Memory Region Sustained Write Throughput */
+    RT_NVME_REG_DBS         = 0x1000,   /* SQ 0 Tail Doorbell */
+};
+
+#define RT_NVME_CAP_MQES(cap)       ((cap) & 0xffff)
+#define RT_NVME_CAP_TIMEOUT(cap)    (((cap) >> 24) & 0xff)
+#define RT_NVME_CAP_STRIDE(cap)     (((cap) >> 32) & 0xf)
+#define RT_NVME_CAP_MPSMIN(cap)     (((cap) >> 48) & 0xf)
+#define RT_NVME_CAP_MPSMAX(cap)     (((cap) >> 52) & 0xf)
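
/*
 * Illustrative sketch (not part of the patch, `nvme' assumed to be the
 * controller being brought up): decoding CAP with the macros above. Per the
 * NVMe spec MQES is 0-based and DSTRD/MPSMIN are power-of-two exponents;
 * nvme_readq() is the 64-bit register helper defined in nvme.c below.
 */
rt_uint64_t cap         = nvme_readq(nvme, RT_NVME_REG_CAP);
rt_uint32_t queue_depth = RT_NVME_CAP_MQES(cap) + 1;      /* max entries per queue */
rt_uint32_t db_stride   = 4 << RT_NVME_CAP_STRIDE(cap);   /* doorbell spacing in bytes */
rt_uint32_t page_shift  = 12 + RT_NVME_CAP_MPSMIN(cap);   /* minimum memory page size, log2 */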
+
+#define RT_NVME_VS(major, minor)    (((major) << 16) | ((minor) << 8))
+
+#define RT_NVME_AQ_DEPTH            32
+#define RT_NVME_NR_AEN_COMMANDS     1
+#define RT_NVME_AQ_BLK_MQ_DEPTH     (RT_NVME_AQ_DEPTH - RT_NVME_NR_AEN_COMMANDS)
+#define RT_NVME_AQ_MQ_TAG_DEPTH     (RT_NVME_AQ_BLK_MQ_DEPTH - 1)
+
+enum
+{
+    RT_NVME_CC_ENABLE           = 1 << 0,
+    RT_NVME_CC_CSS_NVM          = 0 << 4,
+    RT_NVME_CC_MPS_SHIFT        = 7,
+    RT_NVME_CC_ARB_RR           = 0 << 11,
+    RT_NVME_CC_ARB_WRRU         = 1 << 11,
+    RT_NVME_CC_ARB_VS           = 7 << 11,
+    RT_NVME_CC_SHN_NONE         = 0 << 14,
+    RT_NVME_CC_SHN_NORMAL       = 1 << 14,
+    RT_NVME_CC_SHN_ABRUPT       = 2 << 14,
+    RT_NVME_CC_SHN_MASK         = 3 << 14,
+    RT_NVME_CC_IOSQES           = 6 << 16,
+    RT_NVME_CC_IOCQES           = 4 << 20,
+    RT_NVME_CSTS_RDY            = 1 << 0,
+    RT_NVME_CSTS_CFS            = 1 << 1,
+    RT_NVME_CSTS_SHST_NORMAL    = 0 << 2,
+    RT_NVME_CSTS_SHST_OCCUR     = 1 << 2,
+    RT_NVME_CSTS_SHST_CMPLT     = 2 << 2,
+    RT_NVME_CSTS_SHST_MASK      = 3 << 2,
+};
+
+rt_packed(struct rt_nvme_id_power_state
+{
+    rt_le16_t   mp;         /* Maximum Power */
+    NVME_RSVD(1, 1);
+    rt_uint8_t  mxps_nops;  /* Max Power Scale, Non-Operational State */
+    rt_le32_t   enlat;      /* Entry Latency: microseconds */
+    rt_le32_t   exlat;      /* Exit Latency: microseconds */
+    rt_uint8_t  rrt;        /* Relative Read Throughput */
+    rt_uint8_t  rrl;        /* Relative Read Latency */
+    rt_uint8_t  rwt;        /* Relative Write Throughput */
+    rt_uint8_t  rwl;        /* Relative Write Latency */
+    rt_le16_t   idlp;       /* Idle Power */
+    rt_uint8_t  ips;        /* Idle Power Scale */
+    NVME_RSVD(19, 1);
+    rt_le16_t   actp;       /* Active Power */
+    rt_uint8_t  apw_aps;    /* Active Power Workload, Active Power Scale */
+    NVME_RSVD(23, 9);
+});
+
+rt_packed(struct rt_nvme_id_ctrl
+{
+    /* Controller Capabilities and Features */
+    rt_le16_t       vid;            /* PCI Vendor ID */
+    rt_le16_t       ssvid;          /* PCI Subsystem Vendor */
+    char            sn[20];         /* Serial Number */
+    char            mn[40];         /* Model Number */
+    char            fr[8];          /* Firmware Revision */
+    rt_uint8_t      rab;            /* Recommended Arbitration Burst */
+    rt_uint8_t      ieee[3];        /* IEEE OUI Identifier */
+    rt_uint8_t      mic;            /* Controller Multi-Path I/O and Namespace Sharing Capabilities */
+    rt_uint8_t      mdts;           /* Maximum Data Transfer Size */
+    rt_uint16_t     cntlid;         /* Controller ID */
+    rt_uint32_t     ver;            /* Version */
+    rt_uint32_t     rtd3r;          /* RTD3 Resume Latency */
+    rt_uint32_t     rtd3e;          /* RTD3 Entry Latency */
+    rt_uint32_t     oaes;           /* Optional Asynchronous Events Supported */
+#define RT_NVME_ID_CTRATT_ELBAS     15  /* Extended LBA Formats Supported */
+#define RT_NVME_ID_CTRATT_DNVMS     14  /* Delete NVM Set */
+#define RT_NVME_ID_CTRATT_DEG       13  /* Delete Endurance Group */
+#define RT_NVME_ID_CTRATT_VCM       12  /* Variable Capacity Management */
+#define RT_NVME_ID_CTRATT_FCM       11  /* Fixed Capacity Management */
+#define RT_NVME_ID_CTRATT_MDS       10  /* Multi-Domain Subsystem */
+#define RT_NVME_ID_CTRATT_UUIDL     9   /* UUID List */
+#define RT_NVME_ID_CTRATT_SQA       8   /* SQ Associations */
+#define RT_NVME_ID_CTRATT_NG        7   /* Namespace Granularity */
+#define RT_NVME_ID_CTRATT_TBKAS     6   /* Traffic Based Keep Alive Support */
+#define RT_NVME_ID_CTRATT_PLM       5   /* Predictable Latency Mode */
+#define RT_NVME_ID_CTRATT_EG        4   /* Endurance Groups */
+#define RT_NVME_ID_CTRATT_RRL       3   /* Read Recovery Levels */
+#define RT_NVME_ID_CTRATT_NVMS      2   /* NVM Sets */
+#define RT_NVME_ID_CTRATT_NOPSPM    1   /* Non-Operational Power State Permissive Mode */
+#define RT_NVME_ID_CTRATT_HIS       0   /* Host Identifier Support */
+    rt_uint32_t     ctratt;         /* Controller Attributes */
+    rt_uint16_t     rrls;           /* Read Recovery Levels Supported */
+    NVME_RSVD(102, 9);
+    rt_uint8_t      cntrltype;      /* Controller Type */
+    rt_uint8_t      fguid[16];      /* FRU Globally Unique Identifier */
+    rt_uint16_t     crdt1;          /* Command Retry Delay Time 1 */
+    rt_uint16_t     crdt2;          /* Command Retry Delay Time 2 */
+    rt_uint16_t     crdt3;          /* Command Retry Delay Time 3 */
+    NVME_RSVD(134, 119);
+#define RT_NVME_ID_NVMSR_NVMEE      1   /* NVMe Enclosure */
+#define RT_NVME_ID_NVMSR_NVMESD     0   /* NVMe Storage Device */
+    rt_uint8_t      nvmsr;          /* NVM Subsystem Report */
+
+#define RT_NVME_ID_VWCI_VWCRV       7   /* VPD Write Cycles Remaining Valid */
+#define RT_NVME_ID_VWCI_VWCR        0   /* VPD Write Cycles Remaining */
+    rt_uint8_t      vwci;           /* VPD Write Cycle Information */
+#define RT_NVME_ID_MEC_PCIEME       1   /* PCIe Port Management Endpoint */
+#define RT_NVME_ID_MEC_SMBUSME      0   /* SMBus/I2C Port Management Endpoint */
+    rt_uint8_t      mec;            /* Management Endpoint Capabilities  */
+
+    /* Admin Command Set Attributes & Optional Controller Capabilities */
+    rt_le16_t       oacs;           /* Optional Admin Command Support */
+    rt_uint8_t      acl;            /* Abort Command Limit */
+    rt_uint8_t      aerl;           /* Asynchronous Event Request Limit */
+#define RT_NVME_ID_FRMW_SMUD        5   /* Support Multiple Update Detection */
+#define RT_NVME_ID_FRMW_FAWR        4   /* Firmware Activation Without Reset */
+#define RT_NVME_ID_FRMW_NOFS        1   /* Number Of Firmware Slots */
+#define RT_NVME_ID_FRMW_FFSRO       0   /* First Firmware Slot Read Only */
+    rt_uint8_t      frmw;           /* Firmware Updates */
+    rt_uint8_t      lpa;            /* Log Page Attributes */
+    rt_uint8_t      elpe;           /* Error Log Page Entries */
+    rt_uint8_t      npss;           /* Number of Power States Support */
+    rt_uint8_t      avscc;          /* Admin Vendor Specific Command Configuration */
+    rt_uint8_t      apsta;          /* Autonomous Power State Transition Attributes */
+    rt_le16_t       wctemp;         /* Warning Composite Temperature Threshold */
+    rt_le16_t       cctemp;         /* Critical Composite Temperature Threshold */
+    rt_uint16_t     mtfa;           /* Maximum Time for Firmware Activation */
+    rt_uint32_t     hmpre;          /* Host Memory Buffer Preferred Size */
+    rt_uint32_t     hmmin;          /* Host Memory Buffer Minimum Size */
+    rt_uint8_t      tnvmcap[16];    /* Total NVM Capacity */
+    rt_uint8_t      unvmcap[16];    /* Unallocated NVM Capacity */
+#define RT_NVME_ID_RPMBS_ASZ        24  /* Access Size */
+#define RT_NVME_ID_RPMBS_TSZ        16  /* Total Size */
+#define RT_NVME_ID_RPMBS_AM         3   /* Authentication Method */
+#define RT_NVME_ID_RPMBS_NORPMBU    2   /* Number of RPMB Units */
+    rt_uint32_t     rpmbs;          /* Replay Protected Memory Block Support */
+    rt_uint16_t     edstt;          /* Extended Device Self-test Time */
+    rt_uint8_t      dsto;           /* Device Self-test Options */
+    rt_uint8_t      fwug;           /* Firmware Update Granularity */
+    rt_uint16_t     kas;            /* Keep Alive Support */
+    rt_uint16_t     hctma;          /* Host Controlled Thermal Management Attributes */
+    rt_uint16_t     mntmt;          /* Minimum Thermal Management Temperature */
+    rt_uint16_t     mxtmt;          /* Maximum Thermal Management Temperature */
+#define RT_NVME_ID_SANICAP_NODMMAS  30  /* No-Deallocate Modifies Media After Sanitize */
+#define RT_NVME_ID_SANICAP_NDI      29  /* No-Deallocate Inhibited */
+#define RT_NVME_ID_SANICAP_OWS      2   /* Overwrite Support */
+#define RT_NVME_ID_SANICAP_BES      1   /* Block Erase Support */
+#define RT_NVME_ID_SANICAP_CES      0   /* Crypto Erase Support */
+    rt_uint32_t     sanicap;        /* Sanitize Capabilities */
+    rt_uint32_t     hmminds;        /* Host Memory Buffer Minimum Descriptor Entry Size */
+    rt_uint16_t     hmmaxd;         /* Host Memory Maximum Descriptors Entries */
+    rt_uint16_t     nsetidmax;      /* NVM Set Identifier Maximum */
+    rt_uint16_t     endgidmax;      /* Endurance Group Identifier Maximum */
+    rt_uint8_t      anatt;          /* ANA Transition Time */
+    rt_uint8_t      anacap;         /* Asymmetric Namespace Access Capabilities */
+    rt_uint32_t     anagrpmax;      /* ANA Group Identifier Maximum */
+    rt_uint32_t     nanagrpid;      /* Number of ANA Group Identifiers */
+    rt_uint32_t     pels;           /* Persistent Event Log Size */
+    rt_uint16_t     dmid;           /* Domain Identifier */
+    NVME_RSVD(358, 10);
+    rt_uint8_t      megcap[16];     /* Max Endurance Group Capacity */
+    NVME_RSVD(384, 128);
+
+    /* NVM Command Set Attributes */
+    rt_uint8_t      sqes;           /* Submission Queue Entry Size */
+    rt_uint8_t      cqes;           /* Completion Queue Entry Size */
+    rt_le16_t       maxcmd;         /* Maximum Outstanding Commands */
+    rt_le32_t       nn;             /* Number of Namespaces */
+    rt_le16_t       oncs;           /* Optional NVM Command Support */
+    rt_le16_t       fuses;          /* Fused Operation Support */
+    rt_uint8_t      fna;            /* Format NVM Attributes */
+    rt_uint8_t      vwc;            /* Volatile Write Cache */
+    rt_le16_t       awun;           /* Atomic Write Unit Normal */
+    rt_le16_t       awupf;          /* Atomic Write Unit Power Fail */
+    rt_uint8_t      nvscc;          /* I/O Command Set Vendor Specific Command Configuration */
+    rt_uint8_t      nwpc;           /* Namespace Write Protection Capabilities */
+    rt_le16_t       acwu;           /* Atomic Compare & Write Unit */
+    rt_le16_t       cdfs;           /* Copy Descriptor Formats Supported */
+#define RT_NVME_ID_SGL_SUPPORT_MASK 0x3
+    rt_le32_t       sgls;           /* SGL Support */
+    rt_uint32_t     mnan;           /* Maximum Number of Allowed Namespaces */
+    char            maxdna[16];     /* Maximum Domain Namespace Attachments */
+    rt_le32_t       maxcna;         /* Maximum I/O Controller Namespace Attachments */
+    NVME_RSVD(564, 204);
+    rt_uint8_t      subnqn[256];    /* NVM Subsystem NVMe Qualified Name */
+    NVME_RSVD(1024, 768);
+    rt_le32_t       ioccsz;         /* I/O Queue Command Capsule Supported Size */
+    rt_le32_t       iorcsz;         /* I/O Queue Response Capsule Supported Size */
+    rt_le16_t       icdoff;         /* In Capsule Data Offset */
+    rt_uint8_t      ctrattr;        /* Fabrics Controller Attributes */
+    rt_uint8_t      msdbd;          /* Maximum SGL Data Block Descriptors */
+    rt_le16_t       ofcs;           /* Optional Fabric Commands Support */
+    rt_uint8_t      dctype;
+    NVME_RSVD(1807, 241);
+
+    /* Power State Descriptors */
+    struct rt_nvme_id_power_state psd[32];
+
+    /* Vendor Specific */
+    rt_uint8_t      vs[1024];
+});
+
+enum
+{
+    RT_NVME_CTRL_CMIC_MULTI_PORT                = 1 << 0,
+    RT_NVME_CTRL_CMIC_MULTI_CTRL                = 1 << 1,
+    RT_NVME_CTRL_CMIC_ANA                       = 1 << 3,
+    RT_NVME_CTRL_ONCS_COMPARE                   = 1 << 0,
+    RT_NVME_CTRL_ONCS_WRITE_UNCORRECTABLE       = 1 << 1,
+    RT_NVME_CTRL_ONCS_DSM                       = 1 << 2,
+    RT_NVME_CTRL_ONCS_WRITE_ZEROES              = 1 << 3,
+    RT_NVME_CTRL_ONCS_RESERVATIONS              = 1 << 5,
+    RT_NVME_CTRL_ONCS_TIMESTAMP                 = 1 << 6,
+    RT_NVME_CTRL_VWC_PRESENT                    = 1 << 0,
+    RT_NVME_CTRL_OACS_SEC_SUPP                  = 1 << 0,
+    RT_NVME_CTRL_OACS_NS_MNGT_SUPP              = 1 << 3,
+    RT_NVME_CTRL_OACS_DIRECTIVES                = 1 << 5,
+    RT_NVME_CTRL_OACS_DBBUF_SUPP                = 1 << 8,
+    RT_NVME_CTRL_LPA_CMD_EFFECTS_LOG            = 1 << 1,
+    RT_NVME_CTRL_CTRATT_128_ID                  = 1 << 0,
+    RT_NVME_CTRL_CTRATT_NON_OP_PSP              = 1 << 1,
+    RT_NVME_CTRL_CTRATT_NVM_SETS                = 1 << 2,
+    RT_NVME_CTRL_CTRATT_READ_RECV_LVLS          = 1 << 3,
+    RT_NVME_CTRL_CTRATT_ENDURANCE_GROUPS        = 1 << 4,
+    RT_NVME_CTRL_CTRATT_PREDICTABLE_LAT         = 1 << 5,
+    RT_NVME_CTRL_CTRATT_NAMESPACE_GRANULARITY   = 1 << 7,
+    RT_NVME_CTRL_CTRATT_UUID_LIST               = 1 << 9,
+};
+
+struct rt_nvme_lba_format
+{
+    rt_le16_t   ms;         /* Metadata size */
+    rt_uint8_t  ds;         /* Data size */
+    rt_uint8_t  rp;         /* Relative performance */
+};
+
+rt_packed(struct rt_nvme_id_ns
+{
+    rt_le64_t   nsze;       /* Namespace size */
+    rt_le64_t   ncap;       /* Namespace capacity */
+    rt_le64_t   nuse;       /* Namespace utilization */
+    rt_uint8_t  nsfeat;     /* Namespace features */
+    rt_uint8_t  nlbaf;      /* Number of lba formats */
+    rt_uint8_t  flbas;      /* Formatted lba size */
+    rt_uint8_t  mc;         /* Metadata capabilities */
+    rt_uint8_t  dpc;        /* End-to-end data protection capabilities */
+    rt_uint8_t  dps;        /* End-to-end data protection type settings */
+    rt_uint8_t  nmic;       /* Namespace Multi-path I/O and Namespace Sharing Capabilities */
+    rt_uint8_t  rescap;     /* Reservation Capabilities */
+    rt_uint8_t  fpi;        /* Format Progress Indicator */
+    rt_uint8_t  dlfeat;     /* Deallocate Logical Block Features */
+    rt_le16_t   nawun;      /* Namespace Atomic Write Unit Normal  */
+    rt_le16_t   nawupf;     /* Namespace Atomic Write Unit Power Fail */
+    rt_le16_t   nacwu;      /* Namespace Atomic Compare & Write Unit */
+    rt_le16_t   nabsn;      /* Namespace Atomic Boundary Size Normal */
+    rt_le16_t   nabo;       /* Namespace Atomic Boundary Offset */
+    rt_le16_t   nabspf;     /* Namespace Atomic Boundary Size Power Fail */
+    rt_uint16_t noiob;      /* Namespace Optimal IO Boundary */
+    rt_le64_t   nvmcap[2];  /* NVMe Capacity */
+    rt_uint16_t npwg;       /* Namespace Preferred Write Granularity  */
+    rt_uint16_t npwa;       /* Namespace Preferred Write Alignment */
+    rt_uint16_t npdg;       /* Namespace Preferred Deallocate Granularity */
+    rt_uint16_t npda;       /* Namespace Preferred Deallocate Alignment */
+    rt_uint16_t nows;       /* Namespace Optimal Write Size */
+    NVME_RSVD(118, 18);
+    rt_uint32_t anagrpid;   /* ANA Group Identifier */
+    NVME_RSVD(139, 3);
+    rt_uint8_t  nsattr;     /* Namespace Attributes */
+    rt_uint16_t nvmsetid;   /* NVMe Set Identifier */
+    rt_uint16_t endgid;     /* Endurance Group Identifier */
+    rt_uint8_t  nguid[16];  /* Namespace Globally Unique Identifier */
+    rt_uint8_t  eui64[8];   /* IEEE Extended Unique Identifier */
+
+    /* Logical Block Address Format */
+    struct rt_nvme_lba_format lbaf[16];
+    NVME_RSVD(171, 192);
+
+    /* Vendor specific */
+    rt_uint8_t  vs[3712];
+});
+
+enum
+{
+    RT_NVME_NS_FEAT_THIN        = 1 << 0,
+    RT_NVME_NS_FLBAS_LBA_MASK   = 0xf,
+    RT_NVME_NS_FLBAS_LBA_UMASK  = 0x60,
+    RT_NVME_NS_FLBAS_LBA_SHIFT  = 1,
+    RT_NVME_NS_FLBAS_META_EXT   = 0x10,
+    RT_NVME_LBAF_RP_BEST        = 0,
+    RT_NVME_LBAF_RP_BETTER      = 1,
+    RT_NVME_LBAF_RP_GOOD        = 2,
+    RT_NVME_LBAF_RP_DEGRADED    = 3,
+    RT_NVME_NS_DPC_PI_LAST      = 1 << 4,
+    RT_NVME_NS_DPC_PI_FIRST     = 1 << 3,
+    RT_NVME_NS_DPC_PI_TYPE3     = 1 << 2,
+    RT_NVME_NS_DPC_PI_TYPE2     = 1 << 1,
+    RT_NVME_NS_DPC_PI_TYPE1     = 1 << 0,
+    RT_NVME_NS_DPS_PI_FIRST     = 1 << 3,
+    RT_NVME_NS_DPS_PI_MASK      = 0x7,
+    RT_NVME_NS_DPS_PI_TYPE1     = 1,
+    RT_NVME_NS_DPS_PI_TYPE2     = 2,
+    RT_NVME_NS_DPS_PI_TYPE3     = 3,
+};
+
+struct rt_nvme_ops;
+struct rt_nvme_controller;
+
+/*
+ * An NVM Express queue. Each device has at least two (one for admin commands
+ * and one for I/O commands).
+ */
+struct rt_nvme_queue
+{
+    struct rt_nvme_controller *nvme;
+    struct rt_nvme_command *sq_cmds;
+    struct rt_nvme_completion *cq_entry;
+
+    rt_ubase_t sq_cmds_phy;
+    rt_ubase_t cq_entry_phy;
+
+    rt_uint32_t *doorbell;
+    rt_uint16_t qid;
+    rt_uint16_t depth;
+    rt_uint16_t sq_head;
+    rt_uint16_t sq_tail;
+    rt_uint16_t cq_head;
+    rt_uint16_t cq_phase;
+
+    rt_err_t err;
+    struct rt_nvme_command *cmd;
+
+    struct rt_completion done;
+    struct rt_spinlock lock;
+};
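
/*
 * Illustrative sketch (not part of the patch; `queue' is assumed to point at
 * the queue being serviced and rt_le16_to_cpu() is assumed available as the
 * counterpart of rt_cpu_to_le16()): how cq_head/cq_phase detect a new
 * completion. The controller toggles the phase tag (bit 0 of status) each
 * time it wraps the completion queue, so an entry is new when that bit
 * matches the phase the host expects.
 */
struct rt_nvme_completion *cqe = &queue->cq_entry[queue->cq_head];

if ((rt_le16_to_cpu(cqe->status) & 0x1) == queue->cq_phase)
{
    /* Consume the entry, then advance the head and flip the phase on wrap */
    if (++queue->cq_head == queue->depth)
    {
        queue->cq_head = 0;
        queue->cq_phase ^= 1;
    }
    /* ...and finally write the new head to this queue's CQ head doorbell. */
}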
+
+struct rt_nvme_controller
+{
+    rt_list_t list;
+    struct rt_device *dev;
+
+    int nvme_id;
+    char name[RT_NAME_MAX];
+
+    void *regs;
+    rt_uint64_t cap;
+    rt_uint32_t page_shift;
+    rt_uint32_t page_size;
+    rt_uint32_t queue_depth;
+    rt_uint32_t io_queue_max;
+    rt_uint32_t ctrl_config;
+    rt_uint32_t max_transfer_shift:8;
+    rt_uint32_t volatile_write_cache:8;
+    rt_uint32_t write_zeroes:1;
+    rt_uint32_t sgl_mode:2;
+    rt_uint32_t doorbell_stride;
+    rt_uint32_t *doorbell_tbl;
+
+    const struct rt_nvme_ops *ops;
+
+#define RT_USING_NVME_QUEUE (1 + (RT_USING_NVME_IO_QUEUE * RT_CPUS_NR))
+    int irqs_nr;
+    int irqs[RT_USING_NVME_QUEUE];
+    union
+    {
+        struct
+        {
+            struct rt_nvme_queue admin_queue;
+            struct rt_nvme_queue io_queues[RT_USING_NVME_IO_QUEUE * RT_CPUS_NR];
+        };
+        struct rt_nvme_queue queue[RT_USING_NVME_QUEUE];
+    };
+
+    volatile rt_atomic_t cmdid;
+    volatile rt_atomic_t ioqid[RT_CPUS_NR];
+
+    rt_list_t ns_nodes;
+};
+
+struct rt_nvme_device
+{
+    struct rt_blk_disk parent;
+    struct rt_nvme_controller *ctrl;
+
+    rt_list_t list;
+
+    rt_uint32_t nsid;
+    rt_uint32_t lba_shift;
+    struct rt_nvme_id_ns id;
+};
+#define rt_disk_to_nvme_device(disk) rt_container_of(disk, struct rt_nvme_device, parent)
+
+struct rt_nvme_ops
+{
+    const char *name;
+
+    /* Controller-specific NVM Express queue setup */
+    rt_err_t (*setup_queue)(struct rt_nvme_queue *queue);
+    /* Controller-specific NVM Express queue cleanup */
+    rt_err_t (*cleanup_queue)(struct rt_nvme_queue *queue);
+    /* Controller-specific NVM Express command submission */
+    rt_err_t (*submit_cmd)(struct rt_nvme_queue *queue, struct rt_nvme_command *cmd);
+    /* Controller-specific NVM Express command completion */
+    void (*complete_cmd)(struct rt_nvme_queue *queue, struct rt_nvme_command *cmd);
+};
+
+rt_err_t rt_nvme_controller_register(struct rt_nvme_controller *nvme);
+rt_err_t rt_nvme_controller_unregister(struct rt_nvme_controller *nvme);
+
+#endif /* __NVME_H__ */
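
A controller driver fills a struct rt_nvme_controller with its register window, ops table and interrupt vectors, then hands it to rt_nvme_controller_register(), which takes care of queue bring-up and exposes each namespace as a block disk (struct rt_nvme_device). A minimal sketch for a hypothetical memory-mapped controller follows; the names, register base and IRQ are invented for illustration, and the PCI glue in nvme-pci.c below is the real in-tree user of this API:

static const struct rt_nvme_ops demo_nvme_ops =
{
    /* All hooks are optional; pci_nvme_std_ops below registers with only a name. */
    .name = "DEMO",
};

static rt_err_t demo_nvme_probe(struct rt_device *dev, void *regs, int irq)
{
    struct rt_nvme_controller *nvme = rt_calloc(1, sizeof(*nvme));

    if (!nvme)
    {
        return -RT_ENOMEM;
    }

    nvme->dev = dev;            /* underlying device, used for naming */
    nvme->regs = regs;          /* mapped BAR/MMIO registers */
    nvme->ops = &demo_nvme_ops;
    nvme->irqs_nr = 1;
    nvme->irqs[0] = irq;

    return rt_nvme_controller_register(nvme);
}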

+ 4 - 0
components/drivers/include/rtdevice.h

@@ -55,6 +55,10 @@ extern "C" {
 
 #include "drivers/iio.h"
 
+#ifdef RT_USING_NVME
+#include "drivers/nvme.h"
+#endif
+
 #ifdef RT_USING_OFW
 #include "drivers/ofw.h"
 #include "drivers/ofw_fdt.h"

+ 23 - 0
components/drivers/nvme/Kconfig

@@ -0,0 +1,23 @@
+menuconfig RT_USING_NVME
+    bool "Using Non-Volatile Memory Express (NVME) device drivers"
+    depends on RT_USING_DM
+    depends on RT_USING_BLK
+    depends on RT_USING_DMA
+    default n
+
+config RT_USING_NVME_IO_QUEUE
+    int "Number of I/O Command queue"
+    depends on RT_USING_NVME
+    default 2 if RT_THREAD_PRIORITY_8
+    default 4 if RT_THREAD_PRIORITY_32
+    default 8 if RT_THREAD_PRIORITY_256
+
+config RT_NVME_PCI
+    bool "NVME support on PCI bus"
+    depends on RT_USING_NVME
+    depends on RT_USING_PCI
+    default y
+
+if RT_USING_NVME
+    osource "$(SOC_DM_NVME_DIR)/Kconfig"
+endif
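
For reference, enabling these options on a 32-priority, 4-core target would typically end up in rtconfig.h as (values are illustrative):

#define RT_USING_NVME
#define RT_USING_NVME_IO_QUEUE 4
#define RT_NVME_PCI

nvme.h above then sizes the per-controller vector table as RT_USING_NVME_QUEUE = 1 + RT_USING_NVME_IO_QUEUE * RT_CPUS_NR, i.e. one admin queue plus RT_USING_NVME_IO_QUEUE I/O queues per CPU (17 queues with the values above).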

+ 18 - 0
components/drivers/nvme/SConscript

@@ -0,0 +1,18 @@
+from building import *
+
+group = []
+
+if not GetDepend(['RT_USING_NVME']):
+    Return('group')
+
+cwd = GetCurrentDir()
+CPPPATH = [cwd + '/../include']
+
+src = ['nvme.c']
+
+if GetDepend(['RT_NVME_PCI']):
+    src += ['nvme-pci.c']
+
+group = DefineGroup('DeviceDrivers', src, depend = [''], CPPPATH = CPPPATH)
+
+Return('group')

+ 171 - 0
components/drivers/nvme/nvme-pci.c

@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2006-2023, RT-Thread Development Team
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Change Logs:
+ * Date           Author       Notes
+ * 2023-02-25     GuEe-GUI     the first version
+ */
+
+#include <rtthread.h>
+#include <rtdevice.h>
+
+#define NVME_REG_BAR 0
+
+struct pci_nvme_quirk
+{
+    const struct rt_nvme_ops *ops;
+};
+
+struct pci_nvme_controller
+{
+    struct rt_nvme_controller parent;
+    const struct pci_nvme_quirk *quirk;
+
+    rt_bool_t is_msi;
+    struct rt_pci_msix_entry msix_entries[RT_USING_NVME_QUEUE];
+};
+
+static const struct rt_nvme_ops pci_nvme_std_ops =
+{
+    .name = "PCI",
+};
+
+static rt_err_t pci_nvme_probe(struct rt_pci_device *pdev)
+{
+    rt_err_t err;
+    rt_ssize_t msi_nr;
+    struct rt_nvme_controller *nvme;
+    struct pci_nvme_controller *pci_nvme = rt_calloc(1, sizeof(*pci_nvme));
+    const struct pci_nvme_quirk *quirk = pdev->id->data;
+
+    if (!pci_nvme)
+    {
+        return -RT_ENOMEM;
+    }
+
+    pci_nvme->quirk = quirk;
+    nvme = &pci_nvme->parent;
+    nvme->dev = &pdev->parent;
+    nvme->regs = rt_pci_iomap(pdev, NVME_REG_BAR);
+
+    if (!nvme->regs)
+    {
+        err = -RT_EIO;
+        goto _fail;
+    }
+
+    nvme->ops = quirk && quirk->ops ? quirk->ops : &pci_nvme_std_ops;
+
+    if ((msi_nr = rt_pci_msix_vector_count(pdev)) <= 0)
+    {
+        msi_nr = rt_pci_msi_vector_count(pdev);
+    }
+    if (msi_nr > 0)
+    {
+        nvme->irqs_nr = RT_ARRAY_SIZE(pci_nvme->msix_entries);
+        nvme->irqs_nr = rt_min_t(rt_size_t, msi_nr, nvme->irqs_nr);
+    }
+
+    if (nvme->irqs_nr > 0)
+    {
+        rt_pci_msix_entry_index_linear(pci_nvme->msix_entries, nvme->irqs_nr);
+
+        if (rt_pci_msix_enable(pdev, pci_nvme->msix_entries, nvme->irqs_nr) > 0)
+        {
+            pci_nvme->is_msi = RT_TRUE;
+
+            for (int i = 0; i < nvme->irqs_nr; ++i)
+            {
+                nvme->irqs[i] = pci_nvme->msix_entries[i].irq;
+            }
+        }
+    }
+
+    if (!pci_nvme->is_msi)
+    {
+        nvme->irqs_nr = 1;
+        nvme->irqs[0] = pdev->irq;
+        rt_pci_irq_unmask(pdev);
+    }
+
+    rt_pci_set_master(pdev);
+
+    if ((err = rt_nvme_controller_register(nvme)))
+    {
+        goto _disable;
+    }
+
+    pdev->parent.user_data = pci_nvme;
+
+    return RT_EOK;
+
+_disable:
+    if (pci_nvme->is_msi)
+    {
+        rt_pci_msix_disable(pdev);
+    }
+    else
+    {
+        rt_pci_irq_mask(pdev);
+    }
+    rt_pci_clear_master(pdev);
+    rt_iounmap(nvme->regs);
+
+_fail:
+    rt_free(pci_nvme);
+
+    return err;
+}
+
+static rt_err_t pci_nvme_remove(struct rt_pci_device *pdev)
+{
+    struct rt_nvme_controller *nvme;
+    struct pci_nvme_controller *pci_nvme = pdev->parent.user_data;
+
+    nvme = &pci_nvme->parent;
+
+    rt_nvme_controller_unregister(nvme);
+
+    if (pci_nvme->is_msi)
+    {
+        rt_pci_msix_disable(pdev);
+    }
+    else
+    {
+        /* INTx is shared, don't mask all */
+        rt_hw_interrupt_umask(pdev->irq);
+        rt_pci_irq_mask(pdev);
+    }
+
+    rt_pci_clear_master(pdev);
+
+    rt_iounmap(nvme->regs);
+    rt_free(pci_nvme);
+
+    return RT_EOK;
+}
+
+static rt_err_t pci_nvme_shutdown(struct rt_pci_device *pdev)
+{
+    return pci_nvme_remove(pdev);
+}
+
+static const struct rt_pci_device_id pci_nvme_ids[] =
+{
+    { RT_PCI_DEVICE_ID(PCI_VENDOR_ID_REDHAT, 0x0010) },
+    { RT_PCI_DEVICE_CLASS(PCIS_STORAGE_EXPRESS, ~0) },
+    { /* sentinel */ }
+};
+
+static struct rt_pci_driver pci_nvme_driver =
+{
+    .name = "nvme-pci",
+
+    .ids = pci_nvme_ids,
+    .probe = pci_nvme_probe,
+    .remove = pci_nvme_remove,
+    .shutdown = pci_nvme_shutdown,
+};
+RT_PCI_DRIVER_EXPORT(pci_nvme_driver);

+ 1302 - 0
components/drivers/nvme/nvme.c

@@ -0,0 +1,1302 @@
+/*
+ * Copyright (c) 2006-2023, RT-Thread Development Team
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Change Logs:
+ * Date           Author       Notes
+ * 2023-02-25     GuEe-GUI     the first version
+ */
+
+#include <rthw.h>
+#include <rtthread.h>
+#include <rtdevice.h>
+
+#define DBG_TAG "rtdm.nvme"
+#define DBG_LVL DBG_INFO
+#include <rtdbg.h>
+
+static struct rt_dm_ida nvme_controller_ida = RT_DM_IDA_INIT(CUSTOM);
+static struct rt_dm_ida nvme_ida = RT_DM_IDA_INIT(NVME);
+
+static struct rt_spinlock nvme_lock = {};
+static rt_list_t nvme_nodes = RT_LIST_OBJECT_INIT(nvme_nodes);
+
+rt_inline rt_uint32_t nvme_readl(struct rt_nvme_controller *nvme, int offset)
+{
+    return HWREG32(nvme->regs + offset);
+}
+
+rt_inline void nvme_writel(struct rt_nvme_controller *nvme, int offset, rt_uint32_t value)
+{
+    HWREG32(nvme->regs + offset) = value;
+}
+
+rt_inline rt_uint64_t nvme_readq(struct rt_nvme_controller *nvme, int offset)
+{
+    rt_uint32_t lo32, hi32;
+
+    lo32 = HWREG32(nvme->regs + offset);
+    hi32 = HWREG32(nvme->regs + offset + 4);
+
+    return ((rt_uint64_t)hi32 << 32) + lo32;
+}
+
+rt_inline void nvme_writeq(struct rt_nvme_controller *nvme, int offset, rt_uint64_t value)
+{
+    nvme_writel(nvme, offset, (rt_uint32_t)(value & 0xffffffff));
+    nvme_writel(nvme, offset + 4, (rt_uint32_t)(value >> 32));
+}
+
+static rt_err_t nvme_poll_csts(struct rt_nvme_controller *nvme,
+        rt_uint32_t mask, rt_uint32_t value)
+{
+    rt_tick_t timeout;
+
+    timeout = rt_tick_from_millisecond(RT_NVME_CAP_TIMEOUT(nvme->cap) * 500);
+    timeout += rt_tick_get();
+
+    do {
+        if ((nvme_readl(nvme, RT_NVME_REG_CSTS) & mask) == value)
+        {
+            return RT_EOK;
+        }
+
+        rt_hw_cpu_relax();
+    } while (rt_tick_get() < timeout);
+
+    return -RT_ETIMEOUT;
+}
+
+static rt_err_t nvme_enable_ctrl(struct rt_nvme_controller *nvme)
+{
+    nvme->ctrl_config &= ~RT_NVME_CC_SHN_MASK;
+    nvme->ctrl_config |= RT_NVME_CC_ENABLE;
+    nvme_writel(nvme, RT_NVME_REG_CC, nvme->ctrl_config);
+
+    return nvme_poll_csts(nvme, RT_NVME_CSTS_RDY, RT_NVME_CSTS_RDY);
+}
+
+static rt_err_t nvme_disable_ctrl(struct rt_nvme_controller *nvme)
+{
+    nvme->ctrl_config &= ~RT_NVME_CC_SHN_MASK;
+    nvme->ctrl_config &= ~RT_NVME_CC_ENABLE;
+    nvme_writel(nvme, RT_NVME_REG_CC, nvme->ctrl_config);
+
+    return nvme_poll_csts(nvme, RT_NVME_CSTS_RDY, 0);
+}
+
+static rt_err_t nvme_shutdown_ctrl(struct rt_nvme_controller *nvme)
+{
+    nvme->ctrl_config &= ~RT_NVME_CC_SHN_MASK;
+    nvme->ctrl_config |= RT_NVME_CC_SHN_NORMAL;
+    nvme_writel(nvme, RT_NVME_REG_CC, nvme->ctrl_config);
+
+    return nvme_poll_csts(nvme, RT_NVME_CSTS_SHST_MASK, RT_NVME_CSTS_SHST_CMPLT);
+}
+
+rt_inline rt_le16_t nvme_next_cmdid(struct rt_nvme_controller *nvme)
+{
+    return rt_cpu_to_le16((rt_uint16_t)rt_atomic_add(&nvme->cmdid, 1));
+}
+
+static rt_err_t nvme_submit_cmd(struct rt_nvme_queue *queue,
+        struct rt_nvme_command *cmd)
+{
+    rt_ubase_t level;
+    rt_err_t err = RT_EOK;
+    rt_uint16_t tail, head;
+    struct rt_nvme_controller *nvme = queue->nvme;
+
+_retry:
+    level = rt_spin_lock_irqsave(&queue->lock);
+
+    tail = queue->sq_tail;
+    head = queue->cq_head;
+
+    if (tail + 1 == head)
+    {
+        /* IO queue is full, waiting for the last IO command to complete. */
+        rt_spin_unlock_irqrestore(&queue->lock, level);
+
+        rt_thread_yield();
+
+        goto _retry;
+    }
+
+    cmd->common.cmdid = nvme_next_cmdid(nvme);
+    rt_memcpy(&queue->sq_cmds[tail], cmd, sizeof(*cmd));
+
+    if (nvme->ops->submit_cmd)
+    {
+        if ((err = nvme->ops->submit_cmd(queue, cmd)))
+        {
+            /* Don't leak the queue lock on a controller-specific submit error */
+            rt_spin_unlock_irqrestore(&queue->lock, level);
+
+            return err;
+        }
+    }
+
+    if (++tail == queue->depth)
+    {
+        tail = 0;
+    }
+    HWREG32(queue->doorbell) = tail;
+    queue->sq_tail = tail;
+
+    queue->cmd = cmd;
+    queue->err = RT_EOK;
+
+    rt_spin_unlock_irqrestore(&queue->lock, level);
+
+    err = rt_completion_wait(&queue->done,
+            rt_tick_from_millisecond(queue->qid != 0 ? RT_WAITING_FOREVER : 60));
+
+    return err ? : queue->err;
+}
+
+static rt_err_t nvme_set_features_simple(struct rt_nvme_controller *nvme,
+        rt_uint32_t fid, rt_uint32_t dword11)
+{
+    struct rt_nvme_command cmd;
+
+    rt_memset(&cmd, 0, sizeof(cmd));
+    cmd.features.opcode = RT_NVME_ADMIN_OPCODE_SET_FEATURES;
+    cmd.features.fid = rt_cpu_to_le32(fid);
+    cmd.features.dword11 = rt_cpu_to_le32(dword11);
+
+    return nvme_submit_cmd(&nvme->admin_queue, &cmd);
+}
+
+static rt_err_t nvme_submit_io_cmd(struct rt_nvme_controller *nvme,
+        struct rt_nvme_command *cmd)
+{
+    rt_uint16_t qid;
+
+    qid = rt_atomic_add(&nvme->ioqid[rt_hw_cpu_id()], RT_CPUS_NR);
+    qid %= nvme->io_queue_max;
+
+    return nvme_submit_cmd(&nvme->io_queues[qid], cmd);
+}
+
+/*
+ * PRP Mode:
+ *
+ * |63                                   n+1|n                0|
+ * +----------------------------------------+----------+---+---+
+ * |            Page Base Address           |  Offset  | 0 | 0 |
+ * +----------------------------------------+----------+---+---+
+ *                                                             |
+ *                                                             v
+ *                                            Host Physical Pages
+ *                                       +----------------------------+
+ * +--------------+----------+           |           Page k           |
+ * |  PRP Entry1  |  Offset  +---------->+----------------------------+
+ * +--------------+----------+           |         Page k + 1         |
+ *                                       +----------------------------+
+ *                                                     ...
+ *                                       +----------------------------+
+ * +--------------+----------+           |         Page k + m         |
+ * |  PRP Entry2  |    0     +---------->+----------------------------+
+ * +--------------+----------+           |       Page k + m + 1       |
+ *                                       +----------------------------+
+ * PRP List (In PRP Entry2):
+ *
+ * |63                                   n+1|n                0|
+ * +----------------------------------------+------------------+
+ * |           Page Base Address k          |        0h        |
+ * +----------------------------------------+------------------+
+ * |        Page Base Address k + 1         |        0h        |
+ * +----------------------------------------+------------------+
+ * |                            ...                            |
+ * +----------------------------------------+------------------+
+ * |        Page Base Address k + m         |        0h        |
+ * +----------------------------------------+------------------+
+ * |       Page Base Address k + m + 1      |        0h        |
+ * +----------------------------------------+------------------+
+ *
+ * SGL Mode:
+ *                                           +----- Non-transport
+ * LBA                                      /
+ * +---------------+---------------+-------/-------+---------------+
+ * |      3KB      |      4KB      |      2KB      |      4KB      |
+ * +-------+-------+-------+-------+---------------+--------+------+
+ *         |               +-------------------------+      |
+ *         |                                         |      |
+ *         |                    +--------------------|------+
+ *         |                    |                    |
+ * +-------v-------+    +-------v-------+    +-------v-------+
+ * |  A MEM BLOCK  |    |  B MEM BLOCK  |    |  C MEM BLOCK  |
+ * +-------^-------+    +-------^-------+    +-------^-------+
+ *         |                    |                    |
+ *         +----------------+   |                    |
+ *                          |   |                    |
+ * Segment(0)               |   |                    |
+ * +----------+----------+  |   |                    |
+ * | Address: A          +--+   |                    |
+ * +----------+----------+      |                    |
+ * | Type: 0h | Len: 3KB |      |                    |
+ * +----------+----------+      |                    |
+ * | Address: Segment(1) +--+   |                    |
+ * +----------+----------+  |   |                    |
+ * | Type: 2h | Len: 48  |  |   |                    |
+ * +----------+----------+  |   |                    |
+ *                          |   |                    |
+ * +------------------------+   |                    |
+ * |                            |                    |
+ * v                            |                    |
+ * Segment(1)                   |                    |
+ * +----------+----------+      |                    |
+ * | Address: B          +------+                    |
+ * +----------+----------+                           |
+ * | Type: 0h | Len: 4KB |                           |
+ * +----------+----------+                           |
+ * | Address: <NULL>     |                           |
+ * +----------+----------+                           |
+ * | Type: 1h | Len: 2KB |                           |
+ * +----------+----------+                           |
+ * | Address: Segment(2) +--+                        |
+ * +----------+----------+  |                        |
+ * | Type: 0h | Len: 16  |  |                        |
+ * +----------+----------+  |                        |
+ *                          |                        |
+ * +------------------------+                        |
+ * |                                                 |
+ * v                                                 |
+ * Segment(2)                                        |
+ * +----------+----------+                           |
+ * | Address: C          +---------------------------+
+ * +----------+----------+
+ * | Type: 0h | Len: 4KB |
+ * +----------+----------+
+ */
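
/*
 * Worked example (illustrative, not part of the patch): with 4 KiB pages, a
 * 16 KiB read whose buffer starts at page offset 0x200 spans five host pages.
 * PRP1 carries the first (offset) page; the remaining
 *     remain = 16384 - (4096 - 0x200) = 12800 bytes
 *     prps   = RT_DIV_ROUND_UP(12800, 4096) = 4 entries
 * exceed a single page, so PRP2 points at a one-page PRP list holding those
 * four page addresses, exactly as nvme_blk_rw() builds below.
 */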
+
+static rt_ssize_t nvme_blk_rw(struct rt_nvme_device *ndev, rt_off_t slba,
+        rt_ubase_t buffer_dma, rt_size_t lbas, rt_uint8_t opcode)
+{
+    rt_err_t err;
+    rt_uint16_t max_lbas;
+    rt_uint32_t lba_shift;
+    rt_size_t tlbas;
+    rt_ssize_t data_length;
+    struct rt_nvme_command cmd;
+    struct rt_nvme_controller *nvme = ndev->ctrl;
+
+    rt_memset(&cmd, 0, sizeof(cmd));
+    cmd.rw.opcode = opcode;
+    cmd.rw.flags = nvme->sgl_mode << RT_NVME_CMD_FLAGS_PSDT_SHIFT;
+    cmd.rw.nsid = rt_cpu_to_le32(ndev->nsid);
+
+    tlbas = lbas;
+    lba_shift = ndev->lba_shift;
+    max_lbas = 1 << (nvme->max_transfer_shift - lba_shift);
+
+    if (nvme->sgl_mode)
+    {
+        while ((rt_ssize_t)lbas > 0)
+        {
+            if (lbas < max_lbas)
+            {
+                max_lbas = (rt_uint16_t)lbas;
+            }
+
+            data_length = max_lbas << lba_shift;
+
+            cmd.rw.sgl.address = rt_cpu_to_le64(buffer_dma);
+            cmd.rw.sgl.length = rt_cpu_to_le32(data_length);
+            cmd.rw.sgl.sgl_identify = SGL_DESC_TYPE_DATA_BLOCK;
+            cmd.rw.slba = rt_cpu_to_le64(slba);
+            cmd.rw.length = rt_cpu_to_le16(max_lbas - 1);
+
+            if ((err = nvme_submit_io_cmd(nvme, &cmd)))
+            {
+                tlbas -= lbas;
+                break;
+            }
+
+            lbas -= max_lbas;
+            slba += max_lbas;
+            buffer_dma += data_length;
+        }
+    }
+    else
+    {
+        void *prp_list = RT_NULL;
+        rt_size_t prp_list_size = 0, page_size;
+
+        page_size = nvme->page_size;
+
+        while ((rt_ssize_t)lbas > 0)
+        {
+            rt_uint64_t prp2_addr, dma_addr;
+            rt_ssize_t remain_length, page_offset;
+
+            if (lbas < max_lbas)
+            {
+                max_lbas = (rt_uint16_t)lbas;
+            }
+
+            /*
+             * PRP transfer:
+             *  1. data_length <= 4KB:
+             *      prp1 = buffer_dma
+             *      prp2 = 0
+             *
+             *  2. 4KB < data_length <= 8KB:
+             *      prp1 = buffer_dma
+             *      prp2 = buffer_dma
+             *
+             *  3. 8KB < data_length:
+             *      prp1 = buffer_dma(0, 4k)
+             *      prp2 = buffer_dma(4k, ~)
+             */
+            dma_addr = buffer_dma;
+            page_offset = buffer_dma & (page_size - 1);
+            data_length = max_lbas << lba_shift;
+            remain_length = data_length - (page_size - page_offset);
+
+            do {
+                rt_size_t prps_per_page, prps, pages;
+                rt_uint64_t *prp_list_ptr, prp_list_dma;
+
+                if (remain_length <= 0)
+                {
+                    prp2_addr = 0;
+                    break;
+                }
+
+                if (remain_length)
+                {
+                    dma_addr += (page_size - page_offset);
+                }
+
+                if (remain_length <= page_size)
+                {
+                    prp2_addr = dma_addr;
+                    break;
+                }
+
+                prps_per_page = page_size / sizeof(rt_uint64_t);
+                prps = RT_DIV_ROUND_UP(remain_length, page_size);
+                pages = RT_DIV_ROUND_UP(prps - 1, prps_per_page - 1);
+
+                if (prps > prp_list_size)
+                {
+                    if (prp_list)
+                    {
+                        rt_free_align(prp_list);
+                    }
+
+                    prp_list = rt_malloc_align(pages * page_size, page_size);
+
+                    if (!prp_list)
+                    {
+                        LOG_D("No memory to create a PRP List");
+                        /* Ask user to try again */
+                        return tlbas - lbas;
+                    }
+
+                    prp_list_size = pages * (prps_per_page - 1) + 1;
+                }
+                prp_list_ptr = prp_list;
+                prp_list_dma = (rt_uint64_t)rt_kmem_v2p(prp_list_ptr);
+
+                prp2_addr = prp_list_dma;
+
+                for (int i = 0; prps; --prps, ++i)
+                {
+                    /* Last slot of this list page: link to the next list page if entries remain */
+                    if ((i == (prps_per_page - 1)) && prps > 1)
+                    {
+                        prp_list_dma += page_size;
+                        *prp_list_ptr++ = rt_cpu_to_le64(prp_list_dma);
+
+                        /* Continue filling from the start of the next list page */
+                        i = 0;
+                    }
+
+                    *prp_list_ptr++ = rt_cpu_to_le64(dma_addr);
+                    dma_addr += page_size;
+                }
+
+                rt_hw_cpu_dcache_ops(RT_HW_CACHE_FLUSH, prp_list, pages * page_size);
+            } while (0);
+
+            cmd.rw.prp1 = rt_cpu_to_le64(buffer_dma);
+            cmd.rw.prp2 = rt_cpu_to_le64(prp2_addr);
+            cmd.rw.slba = rt_cpu_to_le64(slba);
+            cmd.rw.length = rt_cpu_to_le16(max_lbas - 1);
+
+            if ((err = nvme_submit_io_cmd(nvme, &cmd)))
+            {
+                tlbas -= lbas;
+                break;
+            }
+
+            lbas -= max_lbas;
+            slba += max_lbas;
+            buffer_dma += data_length;
+        }
+
+        if (prp_list)
+        {
+            rt_free_align(prp_list);
+        }
+    }
+
+    return tlbas;
+}
+
+static rt_ssize_t nvme_blk_read(struct rt_blk_disk *disk, rt_off_t sector,
+        void *buffer, rt_size_t sector_count)
+{
+    rt_ssize_t res;
+    rt_uint32_t page_bits;
+    rt_size_t buffer_size;
+    rt_ubase_t buffer_dma;
+    void *temp_buffer = RT_NULL;
+    struct rt_nvme_device *ndev = rt_disk_to_nvme_device(disk);
+    struct rt_nvme_controller *nvme = ndev->ctrl;
+
+    buffer_size = (1 << ndev->lba_shift) * sector_count;
+    buffer_dma = (rt_ubase_t)rt_kmem_v2p(buffer);
+
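+    /*
+     * SGL data blocks need only 4-byte alignment, while the direct PRP
+     * path here expects a page-aligned buffer; otherwise bounce through
+     * a page-aligned temporary buffer.
+     */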
+    if ((nvme->sgl_mode && (buffer_dma & RT_GENMASK(1, 0))) ||
+        (!nvme->sgl_mode && (buffer_dma & ARCH_PAGE_MASK)))
+    {
+        LOG_D("DMA PRP direct %s buffer MUST 4-bytes or page aligned", "read");
+
+        page_bits = rt_page_bits(buffer_size);
+        temp_buffer = rt_pages_alloc(page_bits);
+
+        if (!temp_buffer)
+        {
+            return -RT_ENOMEM;
+        }
+
+        buffer_dma = (rt_ubase_t)rt_kmem_v2p(temp_buffer);
+    }
+
+    res = nvme_blk_rw(ndev, sector, buffer_dma, sector_count, RT_NVME_CMD_READ);
+
+    if (res > 0)
+    {
+        if (res != sector_count)
+        {
+            /*
+             * Short transfer: recompute the number of bytes actually read.
+             * The equality check above avoids the multiplication in the
+             * common full-transfer case.
+             */
+            buffer_size = res * (1 << ndev->lba_shift);
+        }
+
+        if (temp_buffer)
+        {
+            rt_hw_cpu_dcache_ops(RT_HW_CACHE_INVALIDATE, temp_buffer, buffer_size);
+            rt_memcpy(buffer, temp_buffer, buffer_size);
+        }
+        else
+        {
+            rt_hw_cpu_dcache_ops(RT_HW_CACHE_INVALIDATE, buffer, buffer_size);
+        }
+    }
+
+    if (temp_buffer)
+    {
+        rt_pages_free(temp_buffer, page_bits);
+    }
+
+    return res;
+}
+
+static rt_ssize_t nvme_blk_write(struct rt_blk_disk *disk, rt_off_t sector,
+        const void *buffer, rt_size_t sector_count)
+{
+    rt_ssize_t res;
+    rt_uint32_t page_bits;
+    rt_size_t buffer_size;
+    rt_ubase_t buffer_dma;
+    void *temp_buffer = RT_NULL;
+    struct rt_nvme_device *ndev = rt_disk_to_nvme_device(disk);
+    struct rt_nvme_controller *nvme = ndev->ctrl;
+
+    buffer_size = (1 << ndev->lba_shift) * sector_count;
+    buffer_dma = (rt_ubase_t)rt_kmem_v2p((void *)buffer);
+
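+    /* Same alignment rule as the read path; data is copied into the bounce buffer before submission. */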
+    if ((nvme->sgl_mode && (buffer_dma & RT_GENMASK(1, 0))) ||
+        (!nvme->sgl_mode && (buffer_dma & ARCH_PAGE_MASK)))
+    {
+        LOG_D("DMA PRP direct %s buffer MUST 4-bytes or page aligned", "write");
+
+        page_bits = rt_page_bits(buffer_size);
+        temp_buffer = rt_pages_alloc(page_bits);
+
+        if (!temp_buffer)
+        {
+            return -RT_ENOMEM;
+        }
+
+        buffer_dma = (rt_ubase_t)rt_kmem_v2p(temp_buffer);
+
+        rt_memcpy(temp_buffer, buffer, buffer_size);
+        buffer = temp_buffer;
+    }
+
+    rt_hw_cpu_dcache_ops(RT_HW_CACHE_FLUSH, (void *)buffer, buffer_size);
+
+    res = nvme_blk_rw(ndev, sector, buffer_dma, sector_count, RT_NVME_CMD_WRITE);
+
+    if (temp_buffer)
+    {
+        rt_pages_free(temp_buffer, page_bits);
+    }
+
+    return res;
+}
+
+static rt_err_t nvme_blk_getgeome(struct rt_blk_disk *disk,
+        struct rt_device_blk_geometry *geometry)
+{
+    struct rt_nvme_device *ndev = rt_disk_to_nvme_device(disk);
+
+    geometry->bytes_per_sector = 1 << ndev->lba_shift;
+    geometry->block_size = 1 << ndev->lba_shift;
+    geometry->sector_count = rt_le64_to_cpu(ndev->id.nsze);
+
+    return RT_EOK;
+}
+
+static rt_err_t nvme_blk_sync(struct rt_blk_disk *disk)
+{
+    struct rt_nvme_command cmd;
+    struct rt_nvme_device *ndev = rt_disk_to_nvme_device(disk);
+
+    rt_memset(&cmd, 0, sizeof(cmd));
+    cmd.common.opcode = RT_NVME_CMD_FLUSH;
+    cmd.common.nsid = rt_cpu_to_le32(ndev->nsid);
+
+    return nvme_submit_io_cmd(ndev->ctrl, &cmd);
+}
+
+static rt_err_t nvme_blk_erase(struct rt_blk_disk *disk)
+{
+    rt_err_t err = RT_EOK;
+    rt_ssize_t slba, lbas, max_lbas;
+    struct rt_nvme_command cmd;
+    struct rt_nvme_device *ndev = rt_disk_to_nvme_device(disk);
+    struct rt_nvme_controller *nvme = ndev->ctrl;
+
+    if (!nvme->write_zeroes)
+    {
+        return -RT_ENOSYS;
+    }
+
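+    /* Erase is implemented with the optional Write Zeroes command, one maximum-sized LBA range at a time. */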
+    rt_memset(&cmd, 0, sizeof(cmd));
+    cmd.write_zeroes.opcode = RT_NVME_CMD_WRITE_ZEROES;
+    cmd.write_zeroes.nsid = rt_cpu_to_le32(ndev->nsid);
+
+    slba = 0;
+    lbas = rt_le64_to_cpu(ndev->id.nsze);
+    max_lbas = 1 << (nvme->max_transfer_shift - ndev->lba_shift);
+
+    while ((rt_ssize_t)lbas > 0)
+    {
+        if (lbas < max_lbas)
+        {
+            max_lbas = (rt_uint16_t)lbas;
+        }
+
+        cmd.write_zeroes.slba = rt_cpu_to_le64(slba);
+        cmd.write_zeroes.length = rt_cpu_to_le16(max_lbas - 1);
+
+        if ((err = nvme_submit_io_cmd(nvme, &cmd)))
+        {
+            break;
+        }
+
+        lbas -= max_lbas;
+        slba += max_lbas;
+    }
+
+    return err;
+}
+
+static rt_err_t nvme_blk_autorefresh(struct rt_blk_disk *disk, rt_bool_t is_auto)
+{
+    struct rt_nvme_device *ndev = rt_disk_to_nvme_device(disk);
+    struct rt_nvme_controller *nvme = ndev->ctrl;
+
+    if (nvme->volatile_write_cache & RT_NVME_CTRL_VWC_PRESENT)
+    {
+        return nvme_set_features_simple(nvme, RT_NVME_FEAT_VOLATILE_WC, !!is_auto);
+    }
+    else if (!is_auto)
+    {
+        return RT_EOK;
+    }
+
+    return -RT_ENOSYS;
+}
+
+static const struct rt_blk_disk_ops nvme_blk_ops =
+{
+    .read = nvme_blk_read,
+    .write = nvme_blk_write,
+    .getgeome = nvme_blk_getgeome,
+    .sync = nvme_blk_sync,
+    .erase = nvme_blk_erase,
+    .autorefresh = nvme_blk_autorefresh,
+};
+
+static void nvme_queue_isr(int irqno, void *param)
+{
+    rt_ubase_t level;
+    rt_uint16_t head, phase, status;
+    struct rt_nvme_queue *queue = param;
+    struct rt_nvme_controller *nvme = queue->nvme;
+
+    level = rt_spin_lock_irqsave(&queue->lock);
+
+    head = queue->cq_head;
+    phase = queue->cq_phase;
+    status = HWREG16(&queue->cq_entry[head].status);
+    status = rt_le16_to_cpu(status);
+
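+    /*
+     * A completion entry is new only when its phase tag matches the
+     * queue's expected phase; the expected phase flips each time the
+     * head wraps around.
+     */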
+    if ((status & 0x01) == phase)
+    {
+        if ((status >> 1))
+        {
+            queue->err = -RT_EIO;
+            goto _end_cmd;
+        }
+
+        if (nvme->ops->complete_cmd)
+        {
+            nvme->ops->complete_cmd(queue, queue->cmd);
+        }
+
+    _end_cmd:
+        if (++head == queue->depth)
+        {
+            head = 0;
+            phase = !phase;
+        }
+
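+        /* The CQ head doorbell sits one doorbell stride after the SQ tail doorbell. */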
+        HWREG32(queue->doorbell + nvme->doorbell_stride) = head;
+        queue->cq_head = head;
+        queue->cq_phase = phase;
+
+        rt_completion_done(&queue->done);
+    }
+
+    rt_spin_unlock_irqrestore(&queue->lock, level);
+}
+
+static rt_err_t nvme_identify(struct rt_nvme_controller *nvme,
+        rt_uint32_t nsid, rt_uint32_t cns, void *data)
+{
+    rt_err_t err;
+    rt_uint32_t page_size = nvme->page_size;
+    rt_ubase_t data_phy = (rt_ubase_t)rt_kmem_v2p(data);
+    int offset = data_phy & (page_size - 1);
+    struct rt_nvme_command cmd;
+
+    rt_memset(&cmd, 0, sizeof(cmd));
+    cmd.identify.opcode = RT_NVME_ADMIN_OPCODE_IDENTIFY;
+    cmd.identify.nsid = rt_cpu_to_le32(nsid);
+    cmd.identify.prp1 = rt_cpu_to_le64(data_phy);
+
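+    /* prp1 covers the start of the buffer; if the 4KB identify data crosses a page boundary, prp2 covers the second page. */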
+    if (sizeof(struct rt_nvme_id_ctrl) <= page_size - offset)
+    {
+        cmd.identify.prp2 = 0;
+    }
+    else
+    {
+        data_phy += (page_size - offset);
+        cmd.identify.prp2 = rt_cpu_to_le64(data_phy);
+    }
+    cmd.identify.cns = rt_cpu_to_le32(cns);
+
+    rt_hw_cpu_dcache_ops(RT_HW_CACHE_FLUSH, data, sizeof(struct rt_nvme_id_ctrl));
+
+    if (!(err = nvme_submit_cmd(&nvme->admin_queue, &cmd)))
+    {
+        rt_hw_cpu_dcache_ops(RT_HW_CACHE_INVALIDATE, data, sizeof(struct rt_nvme_id_ctrl));
+    }
+
+    return err;
+}
+
+static rt_err_t nvme_attach_queue(struct rt_nvme_queue *queue, rt_uint8_t opcode)
+{
+    struct rt_nvme_command cmd;
+    struct rt_nvme_controller *nvme = queue->nvme;
+    rt_uint16_t flags = RT_NVME_QUEUE_PHYS_CONTIG;
+
+    rt_memset(&cmd, 0, sizeof(cmd));
+
+    if (opcode == RT_NVME_ADMIN_OPCODE_CREATE_CQ)
+    {
+        cmd.create_cq.opcode = opcode;
+        cmd.create_cq.prp1 = rt_cpu_to_le64(queue->cq_entry_phy);
+        cmd.create_cq.cqid = rt_cpu_to_le16(queue->qid);
+        cmd.create_cq.qsize = rt_cpu_to_le16(queue->depth - 1);
+        cmd.create_cq.cq_flags = rt_cpu_to_le16(flags | RT_NVME_CQ_IRQ_ENABLED);
+        cmd.create_cq.irq_vector = rt_cpu_to_le16(nvme->irqs_nr > 1 ? queue->qid : 0);
+    }
+    else if (opcode == RT_NVME_ADMIN_OPCODE_CREATE_SQ)
+    {
+        cmd.create_sq.opcode = opcode;
+        cmd.create_sq.prp1 = rt_cpu_to_le64(queue->sq_cmds_phy);
+        cmd.create_sq.sqid = rt_cpu_to_le16(queue->qid);
+        cmd.create_sq.qsize = rt_cpu_to_le16(queue->depth - 1);
+        cmd.create_sq.sq_flags = rt_cpu_to_le16(flags | RT_NVME_SQ_PRIO_MEDIUM);
+        cmd.create_sq.cqid = rt_cpu_to_le16(queue->qid);
+    }
+    else
+    {
+        LOG_E("What the fuck opcode = %x", opcode);
+        RT_ASSERT(0);
+    }
+
+    return nvme_submit_cmd(&nvme->admin_queue, &cmd);
+}
+
+rt_inline rt_err_t nvme_attach_queue_sq(struct rt_nvme_queue *queue)
+{
+    return nvme_attach_queue(queue, RT_NVME_ADMIN_OPCODE_CREATE_SQ);
+}
+
+rt_inline rt_err_t nvme_attach_queue_cq(struct rt_nvme_queue *queue)
+{
+    return nvme_attach_queue(queue, RT_NVME_ADMIN_OPCODE_CREATE_CQ);
+}
+
+static rt_err_t nvme_detach_queue(struct rt_nvme_queue *queue,
+        rt_uint8_t opcode)
+{
+    struct rt_nvme_command cmd;
+    struct rt_nvme_controller *nvme = queue->nvme;
+
+    rt_memset(&cmd, 0, sizeof(cmd));
+    cmd.delete_queue.opcode = opcode;
+    cmd.delete_queue.qid = rt_cpu_to_le16(queue->qid);
+
+    return nvme_submit_cmd(&nvme->admin_queue, &cmd);
+}
+
+rt_inline rt_ubase_t nvme_queue_dma_flags(void)
+{
+    return RT_DMA_F_NOCACHE | RT_DMA_F_LINEAR;
+}
+
+static void nvme_free_queue(struct rt_nvme_queue *queue)
+{
+    rt_ubase_t dma_flags;
+    struct rt_nvme_controller *nvme = queue->nvme;
+
+    if (nvme->ops->cleanup_queue)
+    {
+        rt_err_t err;
+
+        if ((err = nvme->ops->cleanup_queue(queue)))
+        {
+            LOG_W("Cleanup[%s] queue error = %s", nvme->ops->name, rt_strerror(err));
+        }
+    }
+
+    dma_flags = nvme_queue_dma_flags();
+
+    if (queue->sq_cmds)
+    {
+        rt_dma_free(nvme->dev, sizeof(*queue->sq_cmds) * queue->depth,
+                queue->sq_cmds, queue->sq_cmds_phy, dma_flags);
+    }
+
+    if (queue->cq_entry)
+    {
+        rt_dma_free(nvme->dev, sizeof(*queue->cq_entry) * queue->depth,
+                queue->cq_entry, queue->cq_entry_phy, dma_flags);
+    }
+}
+
+static struct rt_nvme_queue *nvme_alloc_queue(struct rt_nvme_controller *nvme,
+        int qid, int depth)
+{
+    rt_err_t err;
+    rt_ubase_t dma_flags;
+    struct rt_nvme_queue *queue = &nvme->queue[qid];
+
+    rt_memset(queue, 0, sizeof(*queue));
+
+    queue->nvme = nvme;
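+    /* Each queue pair owns two doorbells (SQ tail, then CQ head), spaced doorbell_stride apart. */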
+    queue->doorbell = &nvme->doorbell_tbl[qid * 2 * nvme->doorbell_stride];
+    queue->qid = qid;
+    queue->depth = depth;
+    queue->cq_head = 0;
+    queue->cq_phase = 1;
+    rt_completion_init(&queue->done);
+    rt_spin_lock_init(&queue->lock);
+
+    dma_flags = nvme_queue_dma_flags();
+
+    /* struct rt_nvme_command */
+    queue->sq_cmds = rt_dma_alloc(nvme->dev,
+            sizeof(*queue->sq_cmds) * depth, &queue->sq_cmds_phy, dma_flags);
+
+    if (!queue->sq_cmds)
+    {
+        err = -RT_ENOMEM;
+        goto _fail;
+    }
+
+    /* struct rt_nvme_completion */
+    queue->cq_entry = rt_dma_alloc(nvme->dev,
+            sizeof(*queue->cq_entry) * depth, &queue->cq_entry_phy, dma_flags);
+
+    if (!queue->cq_entry)
+    {
+        err = -RT_ENOMEM;
+        goto _fail;
+    }
+
+    rt_memset(queue->sq_cmds, 0, sizeof(struct rt_nvme_command) * depth);
+    rt_memset(queue->cq_entry, 0, sizeof(struct rt_nvme_completion) * depth);
+
+    if (nvme->ops->setup_queue)
+    {
+        if ((err = nvme->ops->setup_queue(queue)))
+        {
+            LOG_E("Setup[%s] queue error = %s", nvme->ops->name, rt_strerror(err));
+
+            goto _fail;
+        }
+    }
+
+    return queue;
+
+_fail:
+    nvme_free_queue(queue);
+
+    return rt_err_ptr(err);
+}
+
+static rt_err_t nvme_configure_admin_queue(struct rt_nvme_controller *nvme)
+{
+    rt_err_t err;
+    int irq;
+    char name[RT_NAME_MAX];
+    rt_uint32_t aqa;
+    rt_uint32_t page_shift = ARCH_PAGE_SHIFT;
+    rt_uint32_t page_min = RT_NVME_CAP_MPSMIN(nvme->cap) + 12;
+    rt_uint32_t page_max = RT_NVME_CAP_MPSMAX(nvme->cap) + 12;
+    struct rt_nvme_queue *admin_queue;
+
+    if (page_shift < page_min)
+    {
+        LOG_E("Device %s page size (%u) %s than host (%u)",
+                "minimum", 1 << page_min, "larger", 1 << page_shift);
+        return -RT_EINVAL;
+    }
+
+    if (page_shift > page_max)
+    {
+        LOG_W("Device %s page size (%u) %s than host (%u)",
+                "maximum", 1 << page_max, "smaller", 1 << page_shift);
+        page_shift = page_max;
+    }
+
+    if ((err = nvme_disable_ctrl(nvme)))
+    {
+        return err;
+    }
+
+    admin_queue = nvme_alloc_queue(nvme, 0, RT_NVME_AQ_DEPTH);
+
+    if (rt_is_err(admin_queue))
+    {
+        return rt_ptr_err(admin_queue);
+    }
+
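+    /* AQA: admin SQ size in bits 11:0, admin CQ size in bits 27:16, both zero-based. */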
+    aqa = admin_queue->depth - 1;
+    aqa |= aqa << 16;
+
+    nvme->page_shift = page_shift;
+    nvme->page_size = 1U << page_shift;
+
+    nvme->ctrl_config = RT_NVME_CC_CSS_NVM;
+    nvme->ctrl_config |= (page_shift - 12) << RT_NVME_CC_MPS_SHIFT;
+    nvme->ctrl_config |= RT_NVME_CC_ARB_RR | RT_NVME_CC_SHN_NONE;
+    nvme->ctrl_config |= RT_NVME_CC_IOSQES | RT_NVME_CC_IOCQES;
+
+    nvme_writel(nvme, RT_NVME_REG_AQA, aqa);
+    nvme_writeq(nvme, RT_NVME_REG_ASQ, admin_queue->sq_cmds_phy);
+    nvme_writeq(nvme, RT_NVME_REG_ACQ, admin_queue->cq_entry_phy);
+
+    if ((err = nvme_enable_ctrl(nvme)))
+    {
+        nvme_free_queue(admin_queue);
+
+        return err;
+    }
+
+    irq = nvme->irqs[0];
+
+    rt_snprintf(name, RT_NAME_MAX, "%s-admin-queue", nvme->name);
+
+    rt_hw_interrupt_install(irq, nvme_queue_isr, &nvme->admin_queue, name);
+    rt_hw_interrupt_umask(irq);
+
+    return RT_EOK;
+}
+
+static rt_err_t nvme_setup_io_queues(struct rt_nvme_controller *nvme)
+{
+    rt_err_t err;
+    rt_uint32_t value;
+    int irq, cpuid = 0;
+    char name[RT_NAME_MAX];
+    rt_bool_t affinity_fixup = RT_FALSE;
+    RT_DECLARE_IRQ_AFFINITY(affinity) = { 0 };
+    struct rt_nvme_queue *queue;
+
+    nvme->io_queue_max = nvme->irqs_nr > 1 ? nvme->irqs_nr - 1 : 1;
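+    /* Number of Queues feature, dword11: requested SQ count in bits 15:0, CQ count in bits 31:16 (zero-based). */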
+    value = (nvme->io_queue_max - 1) | ((nvme->io_queue_max - 1) << 16);
+
+    if ((err = nvme_set_features_simple(nvme, RT_NVME_FEAT_NUM_QUEUES, value)))
+    {
+        return err;
+    }
+
+    for (int i = 0, q_idx = 1; i < nvme->io_queue_max; ++i, ++q_idx)
+    {
+        queue = nvme_alloc_queue(nvme, q_idx, nvme->queue_depth);
+
+        if (rt_is_err(queue))
+        {
+            return rt_ptr_err(queue);
+        }
+
+        if ((err = nvme_attach_queue_cq(queue)) ||
+            (err = nvme_attach_queue_sq(queue)))
+        {
+            return err;
+        }
+    }
+
+    for (int i = 0, irq_idx = 1; i < nvme->io_queue_max; ++i, ++irq_idx)
+    {
+        irq = nvme->irqs[irq_idx % nvme->irqs_nr];
+
+        rt_snprintf(name, RT_NAME_MAX, "%s-io-queue%d", nvme->name, i);
+
+        if (!affinity_fixup)
+        {
+            RT_IRQ_AFFINITY_SET(affinity, cpuid % RT_CPUS_NR);
+            if (rt_pic_irq_set_affinity(irq, affinity))
+            {
+                /* Fixup in secondary CPU startup */
+                affinity_fixup = RT_TRUE;
+            }
+            RT_IRQ_AFFINITY_CLEAR(affinity, cpuid++ % RT_CPUS_NR);
+        }
+
+        rt_hw_interrupt_install(irq, nvme_queue_isr, &nvme->io_queues[i], name);
+        rt_hw_interrupt_umask(irq);
+    }
+
+    return RT_EOK;
+}
+
+static void nvme_remove_io_queues(struct rt_nvme_controller *nvme)
+{
+    int irq;
+    struct rt_nvme_queue *queue;
+
+    for (int i = 0, irq_idx = 1; i < nvme->io_queue_max; ++i, ++irq_idx)
+    {
+        queue = &nvme->io_queues[i];
+
+        nvme_detach_queue(queue, RT_NVME_ADMIN_OPCODE_DELETE_SQ);
+        nvme_detach_queue(queue, RT_NVME_ADMIN_OPCODE_DELETE_CQ);
+        nvme_free_queue(queue);
+
+        irq = nvme->irqs[irq_idx % nvme->irqs_nr];
+
+        rt_hw_interrupt_mask(irq);
+        rt_pic_detach_irq(irq, queue);
+    }
+}
+
+static void nvme_remove_admin_queues(struct rt_nvme_controller *nvme)
+{
+    int irq = nvme->irqs[0];
+
+    rt_hw_interrupt_mask(irq);
+    rt_pic_detach_irq(irq, &nvme->admin_queue);
+
+    nvme_free_queue(&nvme->admin_queue);
+}
+
+static void nvme_remove_devices(struct rt_nvme_controller *nvme)
+{
+    struct rt_nvme_device *ndev, *next_ndev;
+
+    rt_list_for_each_entry_safe(ndev, next_ndev, &nvme->ns_nodes, list)
+    {
+        rt_list_remove(&ndev->list);
+
+        rt_hw_blk_disk_unregister(&ndev->parent);
+        rt_free(ndev);
+    }
+}
+
+static rt_err_t nvme_scan_device(struct rt_nvme_controller *nvme,
+        rt_size_t number_of_ns)
+{
+    rt_err_t err = RT_EOK;
+    rt_uint32_t lbaf;
+    struct rt_nvme_id_ns *id = RT_NULL;
+
+    if (!(id = rt_malloc_align(sizeof(*id), nvme->page_size)))
+    {
+        return -RT_ENOMEM;
+    }
+
+    /* NVMe namespace IDs start at 1 */
+    for (rt_uint32_t nsid = 1; nsid <= number_of_ns; ++nsid)
+    {
+        struct rt_nvme_device *ndev = rt_calloc(1, sizeof(*ndev));
+
+        if (!ndev)
+        {
+            err = -RT_ENOMEM;
+            goto _free_res;
+        }
+
+        rt_memset(id, 0, sizeof(*id));
+        if ((err = nvme_identify(nvme, nsid, 0, id)))
+        {
+            rt_free(ndev);
+            goto _free_res;
+        }
+
+        if (!id->nsze)
+        {
+            /* Inactive namespace */
+            rt_free(ndev);
+            continue;
+        }
+
+        ndev->ctrl = nvme;
+
+        rt_memcpy(&ndev->id, id, sizeof(ndev->id));
+        lbaf = id->flbas & RT_NVME_NS_FLBAS_LBA_MASK;
+        lbaf |= ((id->flbas & RT_NVME_NS_FLBAS_LBA_UMASK) >> RT_NVME_NS_FLBAS_LBA_SHIFT);
+
+        ndev->nsid = nsid;
+        ndev->lba_shift = id->lbaf[lbaf].ds;
+
+        ndev->parent.ida = &nvme_ida;
+        ndev->parent.parallel_io = RT_TRUE;
+        ndev->parent.ops = &nvme_blk_ops;
+        ndev->parent.max_partitions = RT_BLK_PARTITION_MAX;
+        rt_dm_dev_set_name(&ndev->parent.parent, "%sn%u", nvme->name, nsid);
+
+        if ((err = rt_hw_blk_disk_register(&ndev->parent)))
+        {
+            rt_free(ndev);
+            goto _free_res;
+        }
+
+        rt_list_init(&ndev->list);
+        rt_list_insert_before(&nvme->ns_nodes, &ndev->list);
+    }
+
+_free_res:
+    rt_free_align(id);
+
+    return err;
+}
+
+rt_inline rt_size_t strip_len(const char *str, rt_size_t max_len)
+{
+    rt_size_t size = 0;
+
+    for (int i = 0; *str && i < max_len; ++i, ++str)
+    {
+        if (*str != ' ')
+        {
+            size = i + 1;
+        }
+    }
+
+    return size;
+}
+
+rt_err_t rt_nvme_controller_register(struct rt_nvme_controller *nvme)
+{
+    rt_err_t err;
+    struct rt_nvme_id_ctrl *ctrl = RT_NULL;
+
+    if (!nvme || !nvme->ops)
+    {
+        return -RT_EINVAL;
+    }
+
+    if (nvme_readl(nvme, RT_NVME_REG_CSTS) == (rt_uint32_t)-1)
+    {
+        LOG_E("Out of memory");
+
+        return -RT_EINVAL;
+    }
+
+    if ((nvme->nvme_id = rt_dm_ida_alloc(&nvme_controller_ida)) < 0)
+    {
+        return -RT_EFULL;
+    }
+
+    rt_snprintf(nvme->name, RT_NAME_MAX, "nvme%u", nvme->nvme_id);
+
+    nvme->cap = nvme_readq(nvme, RT_NVME_REG_CAP);
+    nvme->queue_depth = RT_NVME_CAP_MQES(nvme->cap) + 1;
+    nvme->doorbell_stride = 1 << RT_NVME_CAP_STRIDE(nvme->cap);
+    nvme->doorbell_tbl = nvme->regs + RT_NVME_REG_DBS;
+
+    if ((err = nvme_configure_admin_queue(nvme)))
+    {
+        LOG_E("Configure admin queue error = %s", rt_strerror(err));
+        goto _free_admin_queue;
+    }
+
+    if ((err = nvme_setup_io_queues(nvme)))
+    {
+        LOG_E("Unable to setup I/O queues error = %s", rt_strerror(err));
+        goto _free_admin_queue;
+    }
+
+    if (!(ctrl = rt_malloc_align(sizeof(*ctrl), nvme->page_size)))
+    {
+        err = -RT_ENOMEM;
+        goto _fail;
+    }
+
+    if ((err = nvme_identify(nvme, 0, 1, ctrl)))
+    {
+        goto _fail;
+    }
+
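+    /* MDTS is a power of two in units of the minimum memory page size (CAP.MPSMIN). */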
+    if (ctrl->mdts)
+    {
+        nvme->max_transfer_shift = ctrl->mdts + (RT_NVME_CAP_MPSMIN(nvme->cap) + 12);
+    }
+    else
+    {
+        /* 1MB is recommended. */
+        nvme->max_transfer_shift = 20;
+    }
+    nvme->volatile_write_cache = ctrl->vwc;
+    nvme->write_zeroes = !!(rt_le64_to_cpu(ctrl->oncs) & RT_NVME_CTRL_ONCS_WRITE_ZEROES);
+
+    if ((rt_le32_to_cpu(ctrl->sgls) & RT_NVME_ID_SGL_SUPPORT_MASK))
+    {
+        nvme->sgl_mode = RT_NVME_PSDT_SGL_MPTR_SGL;
+    }
+
+    LOG_I("NVM Express v%d.%d (%s, %-*.s, %-*.s)",
+            nvme_readl(nvme, RT_NVME_REG_VS) >> 16,
+            nvme_readl(nvme, RT_NVME_REG_VS) & 0xff,
+            nvme->ops->name,
+            strip_len(ctrl->mn, sizeof(ctrl->mn)), ctrl->mn,
+            strip_len(ctrl->fr, sizeof(ctrl->fr)), ctrl->fr);
+
+    rt_list_init(&nvme->ns_nodes);
+    if ((err = nvme_scan_device(nvme, rt_le32_to_cpu(ctrl->nn))))
+    {
+        goto _fail;
+    }
+
+    rt_free_align(ctrl);
+
+    rt_spin_lock(&nvme_lock);
+    rt_list_insert_after(&nvme_nodes, &nvme->list);
+    rt_spin_unlock(&nvme_lock);
+
+    return RT_EOK;
+
+_fail:
+    if (ctrl)
+    {
+        rt_free_align(ctrl);
+    }
+    nvme_remove_devices(nvme);
+    nvme_remove_io_queues(nvme);
+_free_admin_queue:
+    nvme_remove_admin_queues(nvme);
+
+    rt_dm_ida_free(&nvme_controller_ida, nvme->nvme_id);
+
+    return err;
+}
+
+rt_err_t rt_nvme_controller_unregister(struct rt_nvme_controller *nvme)
+{
+    rt_err_t err;
+
+    if (!nvme)
+    {
+        return -RT_EINVAL;
+    }
+
+    rt_spin_lock(&nvme_lock);
+    rt_list_remove(&nvme->list);
+    rt_spin_unlock(&nvme_lock);
+
+    nvme_remove_devices(nvme);
+    nvme_remove_io_queues(nvme);
+    nvme_remove_admin_queues(nvme);
+
+    rt_dm_ida_free(&nvme_controller_ida, nvme->nvme_id);
+
+    if (!(err = nvme_shutdown_ctrl(nvme)))
+    {
+        err = nvme_disable_ctrl(nvme);
+    }
+    else
+    {
+        LOG_E("%s: shutdown error = %s", nvme->name, rt_strerror(err));
+    }
+
+    return err;
+}
+
+/*
+ * NVMe I/O queues should be per-CPU. Fix up the IRQ affinity after the
+ * secondary CPUs have started, so the affinity setting is as likely as
+ * possible to succeed.
+ */
+static int nvme_queue_affinify_fixup(void)
+{
+    int cpuid = rt_hw_cpu_id();
+    struct rt_nvme_controller *nvme;
+    RT_DECLARE_IRQ_AFFINITY(affinity) = { 0 };
+    RT_DECLARE_IRQ_AFFINITY(current_affinity) = { 0 };
+
+    RT_IRQ_AFFINITY_SET(affinity, cpuid);
+
+    rt_hw_spin_lock(&nvme_lock.lock);
+    rt_list_for_each_entry(nvme, &nvme_nodes, list)
+    {
+        for (int i = cpuid % RT_CPUS_NR; i < nvme->io_queue_max; i += RT_CPUS_NR)
+        {
+            int irq = nvme->irqs[i];
+
+            if (!rt_pic_irq_get_affinity(irq, current_affinity) &&
+                !rt_bitmap_test_bit(current_affinity, cpuid))
+            {
+                rt_ubase_t level = rt_hw_interrupt_disable();
+
+                rt_pic_irq_set_affinity(irq, affinity);
+
+                rt_hw_interrupt_enable(level);
+            }
+        }
+    }
+    rt_hw_spin_unlock(&nvme_lock.lock);
+
+    return 0;
+}
+INIT_SECONDARY_CPU_EXPORT(nvme_queue_affinify_fixup);

+ 3 - 0
include/rtdef.h

@@ -1378,6 +1378,9 @@ struct rt_device
     void *ofw_node;                                     /**< ofw node get from device tree */
 #endif /* RT_USING_OFW */
     void *power_domain_unit;
+#ifdef RT_USING_DMA
+    const void *dma_ops;
+#endif
 #endif /* RT_USING_DM */
 
     enum rt_device_class_type type;                     /**< device type */