~kameliya/qemu

8a40754bca14df63c6d2ffe473b68a270dc50679 — Peter Maydell a month ago 1b507e5 + dc04d25
Merge remote-tracking branch 'remotes/nvme/tags/nvme-next-pull-request' into staging

emulated nvme updates and fixes

* fixes for Coverity CID 1450756, 1450757 and 1450758 (me)
* fix for a bug in zone management receive (me)
* metadata and end-to-end data protection support (me & Gollu Appalanaidu)
* verify support (Gollu Appalanaidu)
* multiple lba formats and format nvm support (Minwoo Im)

and a couple of misc refactorings from me.

v2:
  - remove an unintended submodule update. Argh.
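
For illustration only, metadata and protection information can be enabled on a
namespace with the new `ms`, `mset`, `pi` and `pil` nvme-ns parameters added by
this series (the drive id and values below are hypothetical):

    -drive id=nvm,file=nvme.img,format=raw,if=none
    -device nvme,serial=deadbeef
    -device nvme-ns,drive=nvm,ms=8,pi=1,pil=0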

# gpg: Signature made Thu 18 Mar 2021 11:53:48 GMT
# gpg:                using RSA key 522833AA75E2DCE6A24766C04DE1AF316D4F0DE9
# gpg: Good signature from "Klaus Jensen <its@irrelevant.dk>" [unknown]
# gpg:                 aka "Klaus Jensen <k.jensen@samsung.com>" [unknown]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: DDCA 4D9C 9EF9 31CC 3468  4272 63D5 6FC5 E55D A838
#      Subkey fingerprint: 5228 33AA 75E2 DCE6 A247  66C0 4DE1 AF31 6D4F 0DE9

* remotes/nvme/tags/nvme-next-pull-request:
  hw/block/nvme: add support for the format nvm command
  hw/block/nvme: pull lba format initialization
  hw/block/nvme: prefer runtime helpers instead of device parameters
  hw/block/nvme: support multiple lba formats
  hw/block/nvme: add non-mdts command size limit for verify
  hw/block/nvme: add verify command
  hw/block/nvme: end-to-end data protection
  hw/block/nvme: add metadata support
  hw/block/nvme: fix zone management receive reporting too many zones
  hw/block/nvme: assert namespaces array indices
  hw/block/nvme: fix potential overflow

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
M hw/block/meson.build => hw/block/meson.build +1 -1
@@ 13,7 13,7 @@ softmmu_ss.add(when: 'CONFIG_SSI_M25P80', if_true: files('m25p80.c'))
softmmu_ss.add(when: 'CONFIG_SWIM', if_true: files('swim.c'))
softmmu_ss.add(when: 'CONFIG_XEN', if_true: files('xen-block.c'))
softmmu_ss.add(when: 'CONFIG_TC58128', if_true: files('tc58128.c'))
-softmmu_ss.add(when: 'CONFIG_NVME_PCI', if_true: files('nvme.c', 'nvme-ns.c', 'nvme-subsys.c'))
+softmmu_ss.add(when: 'CONFIG_NVME_PCI', if_true: files('nvme.c', 'nvme-ns.c', 'nvme-subsys.c', 'nvme-dif.c'))

specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c'))
specific_ss.add(when: 'CONFIG_VHOST_USER_BLK', if_true: files('vhost-user-blk.c'))

A hw/block/nvme-dif.c => hw/block/nvme-dif.c +508 -0
@@ 0,0 1,508 @@
#include "qemu/osdep.h"
#include "hw/block/block.h"
#include "sysemu/dma.h"
#include "sysemu/block-backend.h"
#include "qapi/error.h"
#include "trace.h"
#include "nvme.h"
#include "nvme-dif.h"

uint16_t nvme_check_prinfo(NvmeNamespace *ns, uint16_t ctrl, uint64_t slba,
                           uint32_t reftag)
{
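    /*
     * For Type 1 protection, the reference tag must match the least
     * significant 32 bits of the starting LBA whenever reference tag checking
     * is enabled.
     */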
    if ((NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) == NVME_ID_NS_DPS_TYPE_1) &&
        (ctrl & NVME_RW_PRINFO_PRCHK_REF) && (slba & 0xffffffff) != reftag) {
        return NVME_INVALID_PROT_INFO | NVME_DNR;
    }

    return NVME_SUCCESS;
}

/* from Linux kernel (crypto/crct10dif_common.c) */
static uint16_t crc_t10dif(uint16_t crc, const unsigned char *buffer,
                           size_t len)
{
    unsigned int i;

    for (i = 0; i < len; i++) {
        crc = (crc << 8) ^ t10_dif_crc_table[((crc >> 8) ^ buffer[i]) & 0xff];
    }

    return crc;
}

void nvme_dif_pract_generate_dif(NvmeNamespace *ns, uint8_t *buf, size_t len,
                                 uint8_t *mbuf, size_t mlen, uint16_t apptag,
                                 uint32_t reftag)
{
    uint8_t *end = buf + len;
    size_t lsize = nvme_lsize(ns);
    size_t msize = nvme_msize(ns);
    int16_t pil = 0;

    if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
        pil = nvme_msize(ns) - sizeof(NvmeDifTuple);
    }

    trace_pci_nvme_dif_pract_generate_dif(len, lsize, lsize + pil, apptag,
                                          reftag);

    for (; buf < end; buf += lsize, mbuf += msize) {
        NvmeDifTuple *dif = (NvmeDifTuple *)(mbuf + pil);
        uint16_t crc = crc_t10dif(0x0, buf, lsize);

        if (pil) {
            crc = crc_t10dif(crc, mbuf, pil);
        }

        dif->guard = cpu_to_be16(crc);
        dif->apptag = cpu_to_be16(apptag);
        dif->reftag = cpu_to_be32(reftag);

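        /*
         * The reference tag is only incremented for Type 1 and Type 2;
         * Type 3 does not define a relationship between the reference tag
         * and the LBA.
         */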
        if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) != NVME_ID_NS_DPS_TYPE_3) {
            reftag++;
        }
    }
}
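
/*
 * Note: NvmeDifTuple (defined outside this diff) is assumed to be the 8-byte
 * DIF tuple used above, in on-disk big-endian order:
 *
 *     uint16_t guard;   -- CRC16 guard over the data (and leading metadata)
 *     uint16_t apptag;  -- application tag
 *     uint32_t reftag;  -- reference tag
 *
 * so `pil = msize - sizeof(NvmeDifTuple)` places the tuple in the last eight
 * bytes of the per-LBA metadata unless DPS FIRST_EIGHT is set.
 */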

static uint16_t nvme_dif_prchk(NvmeNamespace *ns, NvmeDifTuple *dif,
                               uint8_t *buf, uint8_t *mbuf, size_t pil,
                               uint16_t ctrl, uint16_t apptag,
                               uint16_t appmask, uint32_t reftag)
{
    switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
    case NVME_ID_NS_DPS_TYPE_3:
        if (be32_to_cpu(dif->reftag) != 0xffffffff) {
            break;
        }

        /* fallthrough */
    case NVME_ID_NS_DPS_TYPE_1:
    case NVME_ID_NS_DPS_TYPE_2:
        if (be16_to_cpu(dif->apptag) != 0xffff) {
            break;
        }

        trace_pci_nvme_dif_prchk_disabled(be16_to_cpu(dif->apptag),
                                          be32_to_cpu(dif->reftag));

        return NVME_SUCCESS;
    }

    if (ctrl & NVME_RW_PRINFO_PRCHK_GUARD) {
        uint16_t crc = crc_t10dif(0x0, buf, nvme_lsize(ns));

        if (pil) {
            crc = crc_t10dif(crc, mbuf, pil);
        }

        trace_pci_nvme_dif_prchk_guard(be16_to_cpu(dif->guard), crc);

        if (be16_to_cpu(dif->guard) != crc) {
            return NVME_E2E_GUARD_ERROR;
        }
    }

    if (ctrl & NVME_RW_PRINFO_PRCHK_APP) {
        trace_pci_nvme_dif_prchk_apptag(be16_to_cpu(dif->apptag), apptag,
                                        appmask);

        if ((be16_to_cpu(dif->apptag) & appmask) != (apptag & appmask)) {
            return NVME_E2E_APP_ERROR;
        }
    }

    if (ctrl & NVME_RW_PRINFO_PRCHK_REF) {
        trace_pci_nvme_dif_prchk_reftag(be32_to_cpu(dif->reftag), reftag);

        if (be32_to_cpu(dif->reftag) != reftag) {
            return NVME_E2E_REF_ERROR;
        }
    }

    return NVME_SUCCESS;
}

uint16_t nvme_dif_check(NvmeNamespace *ns, uint8_t *buf, size_t len,
                        uint8_t *mbuf, size_t mlen, uint16_t ctrl,
                        uint64_t slba, uint16_t apptag,
                        uint16_t appmask, uint32_t reftag)
{
    uint8_t *end = buf + len;
    size_t lsize = nvme_lsize(ns);
    size_t msize = nvme_msize(ns);
    int16_t pil = 0;
    uint16_t status;

    status = nvme_check_prinfo(ns, ctrl, slba, reftag);
    if (status) {
        return status;
    }

    if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
        pil = nvme_msize(ns) - sizeof(NvmeDifTuple);
    }

    trace_pci_nvme_dif_check(NVME_RW_PRINFO(ctrl), lsize + pil);

    for (; buf < end; buf += lsize, mbuf += msize) {
        NvmeDifTuple *dif = (NvmeDifTuple *)(mbuf + pil);

        status = nvme_dif_prchk(ns, dif, buf, mbuf, pil, ctrl, apptag,
                                appmask, reftag);
        if (status) {
            return status;
        }

        if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) != NVME_ID_NS_DPS_TYPE_3) {
            reftag++;
        }
    }

    return NVME_SUCCESS;
}

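/*
 * The block layer may report ranges of the image as zeroed even though no
 * protection information was ever written for them. Fill the DIF tuples of
 * such blocks with 0xff: an all-ones application tag (and, for Type 3, an
 * all-ones reference tag) causes the protection checks above to be skipped.
 */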
uint16_t nvme_dif_mangle_mdata(NvmeNamespace *ns, uint8_t *mbuf, size_t mlen,
                               uint64_t slba)
{
    BlockBackend *blk = ns->blkconf.blk;
    BlockDriverState *bs = blk_bs(blk);

    size_t msize = nvme_msize(ns);
    size_t lsize = nvme_lsize(ns);
    int64_t moffset = 0, offset = nvme_l2b(ns, slba);
    uint8_t *mbufp, *end;
    bool zeroed;
    int16_t pil = 0;
    int64_t bytes = (mlen / msize) * lsize;
    int64_t pnum = 0;

    Error *err = NULL;


    if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
        pil = nvme_msize(ns) - sizeof(NvmeDifTuple);
    }

    do {
        int ret;

        bytes -= pnum;

        ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
        if (ret < 0) {
            error_setg_errno(&err, -ret, "unable to get block status");
            error_report_err(err);

            return NVME_INTERNAL_DEV_ERROR;
        }

        zeroed = !!(ret & BDRV_BLOCK_ZERO);

        trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed);

        if (zeroed) {
            mbufp = mbuf + moffset;
            mlen = (pnum / lsize) * msize;
            end = mbufp + mlen;

            for (; mbufp < end; mbufp += msize) {
                memset(mbufp + pil, 0xff, sizeof(NvmeDifTuple));
            }
        }

        moffset += (pnum / lsize) * msize;
        offset += pnum;
    } while (pnum != bytes);

    return NVME_SUCCESS;
}

static void nvme_dif_rw_cb(void *opaque, int ret)
{
    NvmeBounceContext *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;

    trace_pci_nvme_dif_rw_cb(nvme_cid(req), blk_name(blk));

    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);

    qemu_iovec_destroy(&ctx->mdata.iov);
    g_free(ctx->mdata.bounce);

    g_free(ctx);

    nvme_rw_complete_cb(req, ret);
}

static void nvme_dif_rw_check_cb(void *opaque, int ret)
{
    NvmeBounceContext *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    NvmeCtrl *n = nvme_ctrl(req);
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint16_t ctrl = le16_to_cpu(rw->control);
    uint16_t apptag = le16_to_cpu(rw->apptag);
    uint16_t appmask = le16_to_cpu(rw->appmask);
    uint32_t reftag = le32_to_cpu(rw->reftag);
    uint16_t status;

    trace_pci_nvme_dif_rw_check_cb(nvme_cid(req), NVME_RW_PRINFO(ctrl), apptag,
                                   appmask, reftag);

    if (ret) {
        goto out;
    }

    status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce, ctx->mdata.iov.size,
                                   slba);
    if (status) {
        req->status = status;
        goto out;
    }

    status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                            ctx->mdata.bounce, ctx->mdata.iov.size, ctrl,
                            slba, apptag, appmask, reftag);
    if (status) {
        req->status = status;
        goto out;
    }

    status = nvme_bounce_data(n, ctx->data.bounce, ctx->data.iov.size,
                              NVME_TX_DIRECTION_FROM_DEVICE, req);
    if (status) {
        req->status = status;
        goto out;
    }

    if (ctrl & NVME_RW_PRINFO_PRACT && nvme_msize(ns) == 8) {
        goto out;
    }

    status = nvme_bounce_mdata(n, ctx->mdata.bounce, ctx->mdata.iov.size,
                               NVME_TX_DIRECTION_FROM_DEVICE, req);
    if (status) {
        req->status = status;
    }

out:
    nvme_dif_rw_cb(ctx, ret);
}

static void nvme_dif_rw_mdata_in_cb(void *opaque, int ret)
{
    NvmeBounceContext *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    size_t mlen = nvme_m2b(ns, nlb);
    uint64_t offset = ns->mdata_offset + nvme_m2b(ns, slba);
    BlockBackend *blk = ns->blkconf.blk;

    trace_pci_nvme_dif_rw_mdata_in_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        goto out;
    }

    ctx->mdata.bounce = g_malloc(mlen);

    qemu_iovec_reset(&ctx->mdata.iov);
    qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);

    req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
                                nvme_dif_rw_check_cb, ctx);
    return;

out:
    nvme_dif_rw_cb(ctx, ret);
}

static void nvme_dif_rw_mdata_out_cb(void *opaque, int ret)
{
    NvmeBounceContext *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint64_t offset = ns->mdata_offset + nvme_m2b(ns, slba);
    BlockBackend *blk = ns->blkconf.blk;

    trace_pci_nvme_dif_rw_mdata_out_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        goto out;
    }

    req->aiocb = blk_aio_pwritev(blk, offset, &ctx->mdata.iov, 0,
                                 nvme_dif_rw_cb, ctx);
    return;

out:
    nvme_dif_rw_cb(ctx, ret);
}

uint16_t nvme_dif_rw(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    bool wrz = rw->opcode == NVME_CMD_WRITE_ZEROES;
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    uint64_t slba = le64_to_cpu(rw->slba);
    size_t len = nvme_l2b(ns, nlb);
    size_t mlen = nvme_m2b(ns, nlb);
    size_t mapped_len = len;
    int64_t offset = nvme_l2b(ns, slba);
    uint16_t ctrl = le16_to_cpu(rw->control);
    uint16_t apptag = le16_to_cpu(rw->apptag);
    uint16_t appmask = le16_to_cpu(rw->appmask);
    uint32_t reftag = le32_to_cpu(rw->reftag);
    bool pract = !!(ctrl & NVME_RW_PRINFO_PRACT);
    NvmeBounceContext *ctx;
    uint16_t status;

    trace_pci_nvme_dif_rw(pract, NVME_RW_PRINFO(ctrl));

    ctx = g_new0(NvmeBounceContext, 1);
    ctx->req = req;

    if (wrz) {
        BdrvRequestFlags flags = BDRV_REQ_MAY_UNMAP;

        if (ctrl & NVME_RW_PRINFO_PRCHK_MASK) {
            status = NVME_INVALID_PROT_INFO | NVME_DNR;
            goto err;
        }

        if (pract) {
            uint8_t *mbuf, *end;
            size_t msize = nvme_msize(ns);
            int16_t pil = msize - sizeof(NvmeDifTuple);

            status = nvme_check_prinfo(ns, ctrl, slba, reftag);
            if (status) {
                goto err;
            }

            flags = 0;

            ctx->mdata.bounce = g_malloc0(mlen);

            qemu_iovec_init(&ctx->mdata.iov, 1);
            qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);

            mbuf = ctx->mdata.bounce;
            end = mbuf + mlen;

            if (ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT) {
                pil = 0;
            }

            for (; mbuf < end; mbuf += msize) {
                NvmeDifTuple *dif = (NvmeDifTuple *)(mbuf + pil);

                dif->apptag = cpu_to_be16(apptag);
                dif->reftag = cpu_to_be32(reftag);

                switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
                case NVME_ID_NS_DPS_TYPE_1:
                case NVME_ID_NS_DPS_TYPE_2:
                    reftag++;
                }
            }
        }

        req->aiocb = blk_aio_pwrite_zeroes(blk, offset, len, flags,
                                           nvme_dif_rw_mdata_out_cb, ctx);
        return NVME_NO_COMPLETE;
    }

    if (nvme_ns_ext(ns) && !(pract && nvme_msize(ns) == 8)) {
        mapped_len += mlen;
    }

    status = nvme_map_dptr(n, &req->sg, mapped_len, &req->cmd);
    if (status) {
        return status;
    }

    ctx->data.bounce = g_malloc(len);

    qemu_iovec_init(&ctx->data.iov, 1);
    qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);

    if (req->cmd.opcode == NVME_CMD_READ) {
        block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
                         BLOCK_ACCT_READ);

        req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
                                    nvme_dif_rw_mdata_in_cb, ctx);
        return NVME_NO_COMPLETE;
    }

    status = nvme_bounce_data(n, ctx->data.bounce, ctx->data.iov.size,
                              NVME_TX_DIRECTION_TO_DEVICE, req);
    if (status) {
        goto err;
    }

    ctx->mdata.bounce = g_malloc(mlen);

    qemu_iovec_init(&ctx->mdata.iov, 1);
    qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);

    if (!(pract && nvme_msize(ns) == 8)) {
        status = nvme_bounce_mdata(n, ctx->mdata.bounce, ctx->mdata.iov.size,
                                   NVME_TX_DIRECTION_TO_DEVICE, req);
        if (status) {
            goto err;
        }
    }

    status = nvme_check_prinfo(ns, ctrl, slba, reftag);
    if (status) {
        goto err;
    }

    if (pract) {
        /* splice generated protection information into the buffer */
        nvme_dif_pract_generate_dif(ns, ctx->data.bounce, ctx->data.iov.size,
                                    ctx->mdata.bounce, ctx->mdata.iov.size,
                                    apptag, reftag);
    } else {
        status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                                ctx->mdata.bounce, ctx->mdata.iov.size, ctrl,
                                slba, apptag, appmask, reftag);
        if (status) {
            goto err;
        }
    }

    block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
                     BLOCK_ACCT_WRITE);

    req->aiocb = blk_aio_pwritev(ns->blkconf.blk, offset, &ctx->data.iov, 0,
                                 nvme_dif_rw_mdata_out_cb, ctx);

    return NVME_NO_COMPLETE;

err:
    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);

    qemu_iovec_destroy(&ctx->mdata.iov);
    g_free(ctx->mdata.bounce);

    g_free(ctx);

    return status;
}

A hw/block/nvme-dif.h => hw/block/nvme-dif.h +53 -0
@@ 0,0 1,53 @@
#ifndef HW_NVME_DIF_H
#define HW_NVME_DIF_H

/* from Linux kernel (crypto/crct10dif_common.c) */
static const uint16_t t10_dif_crc_table[256] = {
    0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B,
    0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6,
    0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6,
    0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B,
    0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1,
    0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C,
    0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C,
    0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781,
    0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8,
    0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255,
    0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925,
    0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698,
    0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472,
    0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF,
    0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF,
    0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02,
    0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA,
    0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067,
    0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17,
    0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA,
    0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640,
    0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD,
    0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D,
    0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30,
    0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759,
    0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4,
    0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394,
    0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29,
    0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3,
    0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E,
    0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E,
    0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3
};
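
/*
 * The table above is the byte-at-a-time lookup table for the T10-DIF CRC16
 * (polynomial 0x8BB7, most significant bit first). A minimal sketch of how
 * such a table can be generated from the polynomial alone (`table` is a
 * hypothetical non-const array, for illustration only):
 *
 *     for (int i = 0; i < 256; i++) {
 *         uint16_t crc = i << 8;
 *         for (int j = 0; j < 8; j++) {
 *             crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7 : (crc << 1);
 *         }
 *         table[i] = crc;
 *     }
 *
 * For example, i = 1 is shifted up to 0x8000 and then reduced once, yielding
 * the 0x8BB7 seen in the second entry above.
 */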

uint16_t nvme_check_prinfo(NvmeNamespace *ns, uint16_t ctrl, uint64_t slba,
                           uint32_t reftag);
uint16_t nvme_dif_mangle_mdata(NvmeNamespace *ns, uint8_t *mbuf, size_t mlen,
                               uint64_t slba);
void nvme_dif_pract_generate_dif(NvmeNamespace *ns, uint8_t *buf, size_t len,
                                 uint8_t *mbuf, size_t mlen, uint16_t apptag,
                                 uint32_t reftag);
uint16_t nvme_dif_check(NvmeNamespace *ns, uint8_t *buf, size_t len,
                        uint8_t *mbuf, size_t mlen, uint16_t ctrl,
                        uint64_t slba, uint16_t apptag,
                        uint16_t appmask, uint32_t reftag);
uint16_t nvme_dif_rw(NvmeCtrl *n, NvmeRequest *req);

#endif /* HW_NVME_DIF_H */

M hw/block/nvme-ns.c => hw/block/nvme-ns.c +101 -23
@@ 32,36 32,46 @@

#define MIN_DISCARD_GRANULARITY (4 * KiB)

-static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
+void nvme_ns_init_format(NvmeNamespace *ns)
{
    NvmeIdNs *id_ns = &ns->id_ns;
-    BlockDriverInfo bdi;
-    int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
-    int npdg;
+    BlockDriverInfo bdi;
+    int npdg, nlbas, ret;

-    ns->id_ns.dlfeat = 0x9;
-    id_ns->lbaf[lba_index].ds = 31 - clz32(ns->blkconf.logical_block_size);
+    nlbas = nvme_ns_nlbas(ns);

-    id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns));
-
-    ns->csi = NVME_CSI_NVM;
+    id_ns->nsze = cpu_to_le64(nlbas);

    /* no thin provisioning */
    id_ns->ncap = id_ns->nsze;
    id_ns->nuse = id_ns->ncap;

-    /* support DULBE and I/O optimization fields */
-    id_ns->nsfeat |= (0x4 | 0x10);
+    ns->mdata_offset = nvme_l2b(ns, nlbas);

-    npdg = ns->blkconf.discard_granularity / ns->blkconf.logical_block_size;
+    npdg = ns->blkconf.discard_granularity / nvme_lsize(ns);

-    if (bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi) >= 0 &&
-        bdi.cluster_size > ns->blkconf.discard_granularity) {
-        npdg = bdi.cluster_size / ns->blkconf.logical_block_size;
+    ret = bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi);
+    if (ret >= 0 && bdi.cluster_size > ns->blkconf.discard_granularity) {
+        npdg = bdi.cluster_size / nvme_lsize(ns);
    }

    id_ns->npda = id_ns->npdg = npdg - 1;
}

static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
{
    NvmeIdNs *id_ns = &ns->id_ns;
    uint8_t ds;
    uint16_t ms;
    int i;

    ns->csi = NVME_CSI_NVM;
    ns->status = 0x0;

    ns->id_ns.dlfeat = 0x1;

    /* support DULBE and I/O optimization fields */
    id_ns->nsfeat |= (0x4 | 0x10);

    if (nvme_ns_shared(ns)) {
        id_ns->nmic |= NVME_NMIC_NS_SHARED;


@@ 72,6 82,61 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
    id_ns->mcl = cpu_to_le32(ns->params.mcl);
    id_ns->msrc = ns->params.msrc;

    ds = 31 - clz32(ns->blkconf.logical_block_size);
    ms = ns->params.ms;

    if (ns->params.ms) {
        id_ns->mc = 0x3;

        if (ns->params.mset) {
            id_ns->flbas |= 0x10;
        }

        id_ns->dpc = 0x1f;
        id_ns->dps = ((ns->params.pil & 0x1) << 3) | ns->params.pi;

        NvmeLBAF lbaf[16] = {
            [0] = { .ds =  9           },
            [1] = { .ds =  9, .ms =  8 },
            [2] = { .ds =  9, .ms = 16 },
            [3] = { .ds =  9, .ms = 64 },
            [4] = { .ds = 12           },
            [5] = { .ds = 12, .ms =  8 },
            [6] = { .ds = 12, .ms = 16 },
            [7] = { .ds = 12, .ms = 64 },
        };

        memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf));
        id_ns->nlbaf = 7;
    } else {
        NvmeLBAF lbaf[16] = {
            [0] = { .ds =  9 },
            [1] = { .ds = 12 },
        };

        memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf));
        id_ns->nlbaf = 1;
    }

    for (i = 0; i <= id_ns->nlbaf; i++) {
        NvmeLBAF *lbaf = &id_ns->lbaf[i];
        if (lbaf->ds == ds) {
            if (lbaf->ms == ms) {
                id_ns->flbas |= i;
                goto lbaf_found;
            }
        }
    }

    /* add non-standard lba format */
    id_ns->nlbaf++;
    id_ns->lbaf[id_ns->nlbaf].ds = ds;
    id_ns->lbaf[id_ns->nlbaf].ms = ms;
    id_ns->flbas |= id_ns->nlbaf;

lbaf_found:
    nvme_ns_init_format(ns);

    return 0;
}



@@ 105,7 170,7 @@ static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp)
static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp)
{
    uint64_t zone_size, zone_cap;
-    uint32_t lbasz = ns->blkconf.logical_block_size;
+    uint32_t lbasz = nvme_lsize(ns);

    /* Make sure that the values of ZNS properties are sane */
    if (ns->params.zone_size_bs) {


@@ 140,7 205,7 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp)
     */
    ns->zone_size = zone_size / lbasz;
    ns->zone_capacity = zone_cap / lbasz;
-    ns->num_zones = ns->size / lbasz / ns->zone_size;
+    ns->num_zones = nvme_ns_nlbas(ns) / ns->zone_size;

    /* Do a few more sanity checks of ZNS properties */
    if (!ns->num_zones) {


@@ 229,9 294,10 @@ static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
    }
}

-static void nvme_ns_init_zoned(NvmeNamespace *ns, int lba_index)
+static void nvme_ns_init_zoned(NvmeNamespace *ns)
{
    NvmeIdNsZoned *id_ns_z;
    int i;

    nvme_ns_zoned_init_state(ns);



@@ 243,9 309,11 @@ static void nvme_ns_init_zoned(NvmeNamespace *ns, int lba_index)
    id_ns_z->zoc = 0;
    id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;

-    id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size);
-    id_ns_z->lbafe[lba_index].zdes =
-        ns->params.zd_extension_size >> 6; /* Units of 64B */
+    for (i = 0; i <= ns->id_ns.nlbaf; i++) {
+        id_ns_z->lbafe[i].zsze = cpu_to_le64(ns->zone_size);
+        id_ns_z->lbafe[i].zdes =
+            ns->params.zd_extension_size >> 6; /* Units of 64B */
+    }

    ns->csi = NVME_CSI_ZONED;
    ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size);


@@ 326,6 394,12 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
        return -1;
    }

    if (ns->params.pi && !ns->params.ms) {
        error_setg(errp, "at least 8 bytes of metadata required to enable "
                   "protection information");
        return -1;
    }

    return 0;
}



@@ 346,7 420,7 @@ int nvme_ns_setup(NvmeNamespace *ns, Error **errp)
        if (nvme_ns_zoned_check_calc_geometry(ns, errp) != 0) {
            return -1;
        }
-        nvme_ns_init_zoned(ns, 0);
+        nvme_ns_init_zoned(ns);
    }

    return 0;


@@ 402,6 476,10 @@ static Property nvme_ns_props[] = {
    DEFINE_PROP_BOOL("detached", NvmeNamespace, params.detached, false),
    DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
    DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
    DEFINE_PROP_UINT16("ms", NvmeNamespace, params.ms, 0),
    DEFINE_PROP_UINT8("mset", NvmeNamespace, params.mset, 0),
    DEFINE_PROP_UINT8("pi", NvmeNamespace, params.pi, 0),
    DEFINE_PROP_UINT8("pil", NvmeNamespace, params.pil, 0),
    DEFINE_PROP_UINT16("mssrl", NvmeNamespace, params.mssrl, 128),
    DEFINE_PROP_UINT32("mcl", NvmeNamespace, params.mcl, 128),
    DEFINE_PROP_UINT8("msrc", NvmeNamespace, params.msrc, 127),

M hw/block/nvme-ns.h => hw/block/nvme-ns.h +44 -6
@@ 15,6 15,8 @@
#ifndef NVME_NS_H
#define NVME_NS_H

#include "qemu/uuid.h"

#define TYPE_NVME_NS "nvme-ns"
#define NVME_NS(obj) \
    OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)


@@ 30,6 32,11 @@ typedef struct NvmeNamespaceParams {
    uint32_t nsid;
    QemuUUID uuid;

    uint16_t ms;
    uint8_t  mset;
    uint8_t  pi;
    uint8_t  pil;

    uint16_t mssrl;
    uint32_t mcl;
    uint8_t  msrc;


@@ 48,9 55,11 @@ typedef struct NvmeNamespace {
    BlockConf    blkconf;
    int32_t      bootindex;
    int64_t      size;
    int64_t      mdata_offset;
    NvmeIdNs     id_ns;
    const uint32_t *iocs;
    uint8_t      csi;
    uint16_t     status;

    NvmeSubsystem   *subsys;
    QTAILQ_ENTRY(NvmeNamespace) entry;


@@ 76,6 85,11 @@ typedef struct NvmeNamespace {
    } features;
} NvmeNamespace;

static inline uint16_t nvme_ns_status(NvmeNamespace *ns)
{
    return ns->status;
}

static inline uint32_t nvme_nsid(NvmeNamespace *ns)
{
    if (ns) {


@@ 101,18 115,41 @@ static inline uint8_t nvme_ns_lbads(NvmeNamespace *ns)
    return nvme_ns_lbaf(ns)->ds;
}

-/* calculate the number of LBAs that the namespace can accommodate */
-static inline uint64_t nvme_ns_nlbas(NvmeNamespace *ns)
-{
-    return ns->size >> nvme_ns_lbads(ns);
-}

/* convert an LBA to the equivalent in bytes */
static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba)
{
    return lba << nvme_ns_lbads(ns);
}

static inline size_t nvme_lsize(NvmeNamespace *ns)
{
    return 1 << nvme_ns_lbads(ns);
}

static inline uint16_t nvme_msize(NvmeNamespace *ns)
{
    return nvme_ns_lbaf(ns)->ms;
}

static inline size_t nvme_m2b(NvmeNamespace *ns, uint64_t lba)
{
    return nvme_msize(ns) * lba;
}

static inline bool nvme_ns_ext(NvmeNamespace *ns)
{
    return !!NVME_ID_NS_FLBAS_EXTENDED(ns->id_ns.flbas);
}

/* calculate the number of LBAs that the namespace can accommodate */
static inline uint64_t nvme_ns_nlbas(NvmeNamespace *ns)
{
    if (nvme_msize(ns)) {
        return ns->size / (nvme_lsize(ns) + nvme_msize(ns));
    }
    return ns->size >> nvme_ns_lbads(ns);
}
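
/*
 * Note that when the format defines per-LBA metadata, each LBA consumes
 * lsize + msize bytes of the backing image (the metadata is stored after all
 * of the data, starting at mdata_offset), hence the division by the combined
 * size above.
 */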

typedef struct NvmeCtrl NvmeCtrl;

static inline NvmeZoneState nvme_get_zone_state(NvmeZone *zone)


@@ 187,6 224,7 @@ static inline void nvme_aor_dec_active(NvmeNamespace *ns)
    assert(ns->nr_active_zones >= 0);
}

void nvme_ns_init_format(NvmeNamespace *ns);
int nvme_ns_setup(NvmeNamespace *ns, Error **errp);
void nvme_ns_drain(NvmeNamespace *ns);
void nvme_ns_shutdown(NvmeNamespace *ns);

M hw/block/nvme-subsys.c => hw/block/nvme-subsys.c +5 -2
@@ 47,15 47,18 @@ int nvme_subsys_register_ns(NvmeNamespace *ns, Error **errp)
{
    NvmeSubsystem *subsys = ns->subsys;
    NvmeCtrl *n;
    uint32_t nsid = nvme_nsid(ns);
    int i;

+    assert(nsid && nsid <= NVME_SUBSYS_MAX_NAMESPACES);
+
-    if (subsys->namespaces[nvme_nsid(ns)]) {
+    if (subsys->namespaces[nsid]) {
        error_setg(errp, "namespace %d already registered to subsystem %s",
                   nvme_nsid(ns), subsys->parent_obj.id);
        return -1;
    }

-    subsys->namespaces[nvme_nsid(ns)] = ns;
+    subsys->namespaces[nsid] = ns;

    for (i = 0; i < ARRAY_SIZE(subsys->ctrls); i++) {
        n = subsys->ctrls[i];

M hw/block/nvme-subsys.h => hw/block/nvme-subsys.h +2 -0
@@ 54,6 54,8 @@ static inline NvmeNamespace *nvme_subsys_ns(NvmeSubsystem *subsys,
        return NULL;
    }

    assert(nsid && nsid <= NVME_SUBSYS_MAX_NAMESPACES);

    return subsys->namespaces[nsid];
}


M hw/block/nvme.c => hw/block/nvme.c +1146 -125
@@ 23,7 23,8 @@
 *              [pmrdev=<mem_backend_file_id>,] \
 *              max_ioqpairs=<N[optional]>, \
 *              aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
- *              mdts=<N[optional]>,zoned.zasl=<N[optional]>, \
+ *              mdts=<N[optional]>,vsl=<N[optional]>, \
+ *              zoned.zasl=<N[optional]>, \
 *              subsys=<subsys_id>
 *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
 *              zoned=<true|false[optional]>, \


@@ 78,12 79,26 @@
 *   as a power of two (2^n) and is in units of the minimum memory page size
 *   (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
 *
 * - `vsl`
 *   Indicates the maximum data size limit for the Verify command. Like `mdts`,
 *   this value is specified as a power of two (2^n) and is in units of the
 *   minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
 *   KiB).
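 *   For example, a hypothetical `vsl=8` with a 4 KiB CAP.MPSMIN would raise
 *   the limit to 2^8 * 4 KiB = 1 MiB.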
 *
 * - `zoned.zasl`
 *   Indicates the maximum data transfer size for the Zone Append command. Like
 *   `mdts`, the value is specified as a power of two (2^n) and is in units of
 *   the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
 *   defaulting to the value of `mdts`).
 *
 * - `zoned.append_size_limit`
 *   The maximum I/O size in bytes that is allowed in a Zone Append command.
 *   The default is 128KiB. Since internally this value is maintained as
 *   ZASL = log2(<maximum append size> / <page size>), some values assigned
 *   to this property may be rounded down and result in a lower maximum ZA
 *   data size being in effect. Setting this property to 0 makes ZASL equal
 *   to MDTS. This property only affects zoned namespaces.
 *
 * nvme namespace device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * - `subsys`


@@ 144,6 159,7 @@
#include "trace.h"
#include "nvme.h"
#include "nvme-ns.h"
#include "nvme-dif.h"

#define NVME_MAX_IOQPAIRS 0xffff
#define NVME_DB_SIZE  4


@@ 197,6 213,7 @@ static const uint32_t nvme_cse_acs[256] = {
    [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_NS_ATTACHMENT]    = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
    [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
};

static const uint32_t nvme_cse_iocs_none[256];


@@ 207,6 224,7 @@ static const uint32_t nvme_cse_iocs_nvm[256] = {
    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
};


@@ 217,6 235,7 @@ static const uint32_t nvme_cse_iocs_zoned[256] = {
    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_ZONE_APPEND]          = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,


@@ 226,15 245,6 @@ static const uint32_t nvme_cse_iocs_zoned[256] = {

static void nvme_process_sq(void *opaque);

-static uint16_t nvme_cid(NvmeRequest *req)
-{
-    if (!req) {
-        return 0xffff;
-    }
-
-    return le16_to_cpu(req->cqe.cid);
-}

static uint16_t nvme_sqid(NvmeRequest *req)
{
    return le16_to_cpu(req->sq->sqid);


@@ 360,6 370,26 @@ static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
    return pci_dma_read(&n->parent_obj, addr, buf, size);
}

static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size)
{
    hwaddr hi = addr + size - 1;
    if (hi < addr) {
        return 1;
    }

    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
        memcpy(nvme_addr_to_cmb(n, addr), buf, size);
        return 0;
    }

    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
        memcpy(nvme_addr_to_pmr(n, addr), buf, size);
        return 0;
    }

    return pci_dma_write(&n->parent_obj, addr, buf, size);
}

static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
{
    return nsid && (nsid == NVME_NSID_BROADCAST || nsid <= n->num_namespaces);


@@ 476,6 506,59 @@ static inline void nvme_sg_unmap(NvmeSg *sg)
    memset(sg, 0x0, sizeof(*sg));
}

/*
 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
 * holds both data and metadata. This function splits the data and metadata
 * into two separate QSG/IOVs.
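 *
 * For example, with a 512-byte data + 8-byte metadata extended format, the
 * mapped buffer alternates 512 data bytes and 8 metadata bytes per LBA;
 * `data` receives the former and `mdata` the latter.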
 */
static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
                          NvmeSg *mdata)
{
    NvmeSg *dst = data;
    size_t size = nvme_lsize(ns);
    size_t msize = nvme_msize(ns);
    uint32_t trans_len, count = size;
    uint64_t offset = 0;
    bool dma = sg->flags & NVME_SG_DMA;
    size_t sge_len;
    size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
    int sg_idx = 0;

    assert(sg->flags & NVME_SG_ALLOC);

    while (sg_len) {
        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;

        trans_len = MIN(sg_len, count);
        trans_len = MIN(trans_len, sge_len - offset);

        if (dst) {
            if (dma) {
                qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
                                trans_len);
            } else {
                qemu_iovec_add(&dst->iov,
                               sg->iov.iov[sg_idx].iov_base + offset,
                               trans_len);
            }
        }

        sg_len -= trans_len;
        count -= trans_len;
        offset += trans_len;

        if (count == 0) {
            dst = (dst == data) ? mdata : data;
            count = (dst == data) ? size : msize;
        }

        if (sge_len == offset) {
            offset = 0;
            sg_idx++;
        }
    }
}

static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
                                  size_t len)
{


@@ 860,8 943,8 @@ unmap:
    return status;
}

-static uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
-                              NvmeCmd *cmd)
+uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
+                       NvmeCmd *cmd)
{
    uint64_t prp1, prp2;



@@ 879,10 962,158 @@ static uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
    }
}

-typedef enum NvmeTxDirection {
-    NVME_TX_DIRECTION_TO_DEVICE   = 0,
-    NVME_TX_DIRECTION_FROM_DEVICE = 1,
-} NvmeTxDirection;
static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                              NvmeCmd *cmd)
{
    int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
    hwaddr mptr = le64_to_cpu(cmd->mptr);
    uint16_t status;

    if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
        NvmeSglDescriptor sgl;

        if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
            return NVME_DATA_TRAS_ERROR;
        }

        status = nvme_map_sgl(n, sg, sgl, len, cmd);
        if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
            status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
        }

        return status;
    }

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
    status = nvme_map_addr(n, sg, mptr, len);
    if (status) {
        nvme_sg_unmap(sg);
    }

    return status;
}

static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint16_t ctrl = le16_to_cpu(rw->control);
    size_t len = nvme_l2b(ns, nlb);
    uint16_t status;

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
        (ctrl & NVME_RW_PRINFO_PRACT && nvme_msize(ns) == 8)) {
        goto out;
    }

    if (nvme_ns_ext(ns)) {
        NvmeSg sg;

        len += nvme_m2b(ns, nlb);

        status = nvme_map_dptr(n, &sg, len, &req->cmd);
        if (status) {
            return status;
        }

        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
        nvme_sg_split(&sg, ns, &req->sg, NULL);
        nvme_sg_unmap(&sg);

        return NVME_SUCCESS;
    }

out:
    return nvme_map_dptr(n, &req->sg, len, &req->cmd);
}

static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    size_t len = nvme_m2b(ns, nlb);
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        NvmeSg sg;

        len += nvme_l2b(ns, nlb);

        status = nvme_map_dptr(n, &sg, len, &req->cmd);
        if (status) {
            return status;
        }

        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
        nvme_sg_split(&sg, ns, NULL, &req->sg);
        nvme_sg_unmap(&sg);

        return NVME_SUCCESS;
    }

    return nvme_map_mptr(n, &req->sg, len, &req->cmd);
}

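/*
 * Transfer `len` bytes, `bytes` at a time, skipping `skip_bytes` in the
 * scatter/gather list after each chunk and starting the walk at `offset`.
 * This lets callers pick either the data portions or the metadata portions
 * out of a buffer holding interleaved (extended LBA) data and metadata.
 */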
static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
                                    uint32_t len, uint32_t bytes,
                                    int32_t skip_bytes, int64_t offset,
                                    NvmeTxDirection dir)
{
    hwaddr addr;
    uint32_t trans_len, count = bytes;
    bool dma = sg->flags & NVME_SG_DMA;
    int64_t sge_len;
    int sg_idx = 0;
    int ret;

    assert(sg->flags & NVME_SG_ALLOC);

    while (len) {
        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;

        if (sge_len - offset < 0) {
            offset -= sge_len;
            sg_idx++;
            continue;
        }

        if (sge_len == offset) {
            offset = 0;
            sg_idx++;
            continue;
        }

        trans_len = MIN(len, count);
        trans_len = MIN(trans_len, sge_len - offset);

        if (dma) {
            addr = sg->qsg.sg[sg_idx].base + offset;
        } else {
            addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
        }

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            ret = nvme_addr_read(n, addr, ptr, trans_len);
        } else {
            ret = nvme_addr_write(n, addr, ptr, trans_len);
        }

        if (ret) {
            return NVME_DATA_TRAS_ERROR;
        }

        ptr += trans_len;
        len -= trans_len;
        count -= trans_len;
        offset += trans_len;

        if (count == 0) {
            count = bytes;
            offset += skip_bytes;
        }
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len,
                        NvmeTxDirection dir)


@@ 946,6 1177,49 @@ static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
}

uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                          NvmeTxDirection dir, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint16_t ctrl = le16_to_cpu(rw->control);

    if (nvme_ns_ext(ns) &&
        !(ctrl & NVME_RW_PRINFO_PRACT && nvme_msize(ns) == 8)) {
        size_t lsize = nvme_lsize(ns);
        size_t msize = nvme_msize(ns);

        return nvme_tx_interleaved(n, &req->sg, ptr, len, lsize, msize, 0,
                                   dir);
    }

    return nvme_tx(n, &req->sg, ptr, len, dir);
}

uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                           NvmeTxDirection dir, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        size_t lsize = nvme_lsize(ns);
        size_t msize = nvme_msize(ns);

        return nvme_tx_interleaved(n, &req->sg, ptr, len, msize, lsize, lsize,
                                   dir);
    }

    nvme_sg_unmap(&req->sg);

    status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, dir);
}

static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
                                 BlockCompletionFunc *cb, NvmeRequest *req)
{


@@ 1498,7 1772,7 @@ static inline bool nvme_is_write(NvmeRequest *req)
           rw->opcode == NVME_CMD_WRITE_ZEROES;
}

-static void nvme_rw_cb(void *opaque, int ret)
+static void nvme_misc_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;


@@ 1507,19 1781,125 @@ static void nvme_rw_cb(void *opaque, int ret)
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);

-    trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
+    trace_pci_nvme_misc_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
    } else {
        block_acct_done(stats, acct);
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

void nvme_rw_complete_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);

    trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
    } else {
        block_acct_done(stats, acct);
    }

    if (ns->params.zoned && nvme_is_write(req)) {
        nvme_finalize_zoned_write(ns, req);
    }

-    if (!ret) {
-        block_acct_done(stats, acct);
-    } else {
-        block_acct_failed(stats, acct);
-        nvme_aio_err(req, ret);
-    }
-
    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_rw_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;

    BlockBackend *blk = ns->blkconf.blk;

    trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        goto out;
    }

    if (nvme_msize(ns)) {
        NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
        uint64_t slba = le64_to_cpu(rw->slba);
        uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
        uint64_t offset = ns->mdata_offset + nvme_m2b(ns, slba);

        if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
            size_t mlen = nvme_m2b(ns, nlb);

            req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
                                               BDRV_REQ_MAY_UNMAP,
                                               nvme_rw_complete_cb, req);
            return;
        }

        if (nvme_ns_ext(ns) || req->cmd.mptr) {
            uint16_t status;

            nvme_sg_unmap(&req->sg);
            status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
            if (status) {
                ret = -EFAULT;
                goto out;
            }

            if (req->cmd.opcode == NVME_CMD_READ) {
                return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
            }

            return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
        }
    }

out:
    nvme_rw_complete_cb(req, ret);
}

struct nvme_aio_format_ctx {
    NvmeRequest   *req;
    NvmeNamespace *ns;

    /* number of outstanding write zeroes for this namespace */
    int *count;
};

static void nvme_aio_format_cb(void *opaque, int ret)
{
    struct nvme_aio_format_ctx *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = ctx->ns;
    uintptr_t *num_formats = (uintptr_t *)&req->opaque;
    int *count = ctx->count;

    g_free(ctx);

    if (ret) {
        nvme_aio_err(req, ret);
    }

    if (--(*count)) {
        return;
    }

    g_free(count);
    ns->status = 0x0;

    if (--(*num_formats)) {
        return;
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}



@@ 1558,6 1938,90 @@ static void nvme_aio_flush_cb(void *opaque, int ret)
    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_verify_cb(void *opaque, int ret)
{
    NvmeBounceContext *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint16_t ctrl = le16_to_cpu(rw->control);
    uint16_t apptag = le16_to_cpu(rw->apptag);
    uint16_t appmask = le16_to_cpu(rw->appmask);
    uint32_t reftag = le32_to_cpu(rw->reftag);
    uint16_t status;

    trace_pci_nvme_verify_cb(nvme_cid(req), NVME_RW_PRINFO(ctrl), apptag,
                             appmask, reftag);

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
        goto out;
    }

    block_acct_done(stats, acct);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
                                       ctx->mdata.iov.size, slba);
        if (status) {
            req->status = status;
            goto out;
        }

        req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                                     ctx->mdata.bounce, ctx->mdata.iov.size,
                                     ctrl, slba, apptag, appmask, reftag);
    }

out:
    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);

    qemu_iovec_destroy(&ctx->mdata.iov);
    g_free(ctx->mdata.bounce);

    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}


static void nvme_verify_mdata_in_cb(void *opaque, int ret)
{
    NvmeBounceContext *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    size_t mlen = nvme_m2b(ns, nlb);
    uint64_t offset = ns->mdata_offset + nvme_m2b(ns, slba);
    BlockBackend *blk = ns->blkconf.blk;

    trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        goto out;
    }

    ctx->mdata.bounce = g_malloc(mlen);

    qemu_iovec_reset(&ctx->mdata.iov);
    qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);

    req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
                                nvme_verify_cb, ctx);
    return;

out:
    nvme_verify_cb(ctx, ret);
}

static void nvme_aio_discard_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;


@@ 1583,7 2047,7 @@ struct nvme_zone_reset_ctx {
    NvmeZone    *zone;
};

-static void nvme_aio_zone_reset_cb(void *opaque, int ret)
+static void nvme_aio_zone_reset_complete_cb(void *opaque, int ret)
{
    struct nvme_zone_reset_ctx *ctx = opaque;
    NvmeRequest *req = ctx->req;


@@ 1591,31 2055,31 @@ static void nvme_aio_zone_reset_cb(void *opaque, int ret)
    NvmeZone *zone = ctx->zone;
    uintptr_t *resets = (uintptr_t *)&req->opaque;

-    g_free(ctx);
-
-    trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba);
-
-    if (!ret) {
-        switch (nvme_get_zone_state(zone)) {
-        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
-        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
-            nvme_aor_dec_open(ns);
-            /* fall through */
-        case NVME_ZONE_STATE_CLOSED:
-            nvme_aor_dec_active(ns);
-            /* fall through */
-        case NVME_ZONE_STATE_FULL:
-            zone->w_ptr = zone->d.zslba;
-            zone->d.wp = zone->w_ptr;
-            nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
-            /* fall through */
-        default:
-            break;
-        }
-    } else {
-        nvme_aio_err(req, ret);
-    }
+    if (ret) {
+        nvme_aio_err(req, ret);
+        goto out;
+    }
+
+    switch (nvme_get_zone_state(zone)) {
+    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+        nvme_aor_dec_open(ns);
+        /* fall through */
+    case NVME_ZONE_STATE_CLOSED:
+        nvme_aor_dec_active(ns);
+        /* fall through */
+    case NVME_ZONE_STATE_FULL:
+        zone->w_ptr = zone->d.zslba;
+        zone->d.wp = zone->w_ptr;
+        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
+        /* fall through */
+    default:
+        break;
+    }
+
+out:
+    g_free(ctx);

    (*resets)--;

    if (*resets) {


@@ 1625,25 2089,61 @@ static void nvme_aio_zone_reset_cb(void *opaque, int ret)
    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_aio_zone_reset_cb(void *opaque, int ret)
{
    struct nvme_zone_reset_ctx *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    NvmeZone *zone = ctx->zone;

    trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba);

    if (ret) {
        goto out;
    }

    if (nvme_msize(ns)) {
        int64_t offset = ns->mdata_offset + nvme_m2b(ns, zone->d.zslba);

        blk_aio_pwrite_zeroes(ns->blkconf.blk, offset,
                              nvme_m2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP,
                              nvme_aio_zone_reset_complete_cb, ctx);
        return;
    }

out:
    nvme_aio_zone_reset_complete_cb(opaque, ret);
}

struct nvme_copy_ctx {
    int copies;
    uint8_t *bounce;
    uint8_t *mbounce;
    uint32_t nlb;
    NvmeCopySourceRange *ranges;
};

struct nvme_copy_in_ctx {
    NvmeRequest *req;
    QEMUIOVector iov;
    NvmeCopySourceRange *range;
};

-static void nvme_copy_cb(void *opaque, int ret)
+static void nvme_copy_complete_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;
    struct nvme_copy_ctx *ctx = req->opaque;

-    trace_pci_nvme_copy_cb(nvme_cid(req));
    if (ret) {
        block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
        nvme_aio_err(req, ret);
        goto out;
    }

    block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);

out:
    if (ns->params.zoned) {
        NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
        uint64_t sdlba = le64_to_cpu(copy->sdlba);


@@ 1652,19 2152,42 @@ static void nvme_copy_cb(void *opaque, int ret)
        __nvme_advance_zone_wp(ns, zone, ctx->nlb);
    }

-    if (!ret) {
-        block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
-    } else {
-        block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
-        nvme_aio_err(req, ret);
-    }

    g_free(ctx->bounce);
    g_free(ctx->mbounce);
    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_copy_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;
    struct nvme_copy_ctx *ctx = req->opaque;

    trace_pci_nvme_copy_cb(nvme_cid(req));

    if (ret) {
        goto out;
    }

    if (nvme_msize(ns)) {
        NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
        uint64_t sdlba = le64_to_cpu(copy->sdlba);
        int64_t offset = ns->mdata_offset + nvme_m2b(ns, sdlba);

        qemu_iovec_reset(&req->sg.iov);
        qemu_iovec_add(&req->sg.iov, ctx->mbounce, nvme_m2b(ns, ctx->nlb));

        req->aiocb = blk_aio_pwritev(ns->blkconf.blk, offset, &req->sg.iov, 0,
                                     nvme_copy_complete_cb, req);
        return;
    }

out:
    nvme_copy_complete_cb(opaque, ret);
}

static void nvme_copy_in_complete(NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;


@@ 1677,6 2200,70 @@ static void nvme_copy_in_complete(NvmeRequest *req)

    block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        uint16_t prinfor = (copy->control[0] >> 4) & 0xf;
        uint16_t prinfow = (copy->control[2] >> 2) & 0xf;
        uint16_t nr = copy->nr + 1;
        NvmeCopySourceRange *range;
        uint64_t slba;
        uint32_t nlb;
        uint16_t apptag, appmask;
        uint32_t reftag;
        uint8_t *buf = ctx->bounce, *mbuf = ctx->mbounce;
        size_t len, mlen;
        int i;

        /*
         * The dif helpers expect prinfo to be similar to the control field of
         * the NvmeRwCmd, so shift by 10 to fake it.
         */
        prinfor = prinfor << 10;
        prinfow = prinfow << 10;

        for (i = 0; i < nr; i++) {
            range = &ctx->ranges[i];
            slba = le64_to_cpu(range->slba);
            nlb = le16_to_cpu(range->nlb) + 1;
            len = nvme_l2b(ns, nlb);
            mlen = nvme_m2b(ns, nlb);
            apptag = le16_to_cpu(range->apptag);
            appmask = le16_to_cpu(range->appmask);
            reftag = le32_to_cpu(range->reftag);

            status = nvme_dif_check(ns, buf, len, mbuf, mlen, prinfor, slba,
                                    apptag, appmask, reftag);
            if (status) {
                goto invalid;
            }

            buf += len;
            mbuf += mlen;
        }

        apptag = le16_to_cpu(copy->apptag);
        appmask = le16_to_cpu(copy->appmask);
        reftag = le32_to_cpu(copy->reftag);

        if (prinfow & NVME_RW_PRINFO_PRACT) {
            size_t len = nvme_l2b(ns, ctx->nlb);
            size_t mlen = nvme_m2b(ns, ctx->nlb);

            status = nvme_check_prinfo(ns, prinfow, sdlba, reftag);
            if (status) {
                goto invalid;
            }

            nvme_dif_pract_generate_dif(ns, ctx->bounce, len, ctx->mbounce,
                                        mlen, apptag, reftag);
        } else {
            status = nvme_dif_check(ns, ctx->bounce, len, ctx->mbounce, mlen,
                                    prinfow, sdlba, apptag, appmask, reftag);
            if (status) {
                goto invalid;
            }
        }
    }

    status = nvme_check_bounds(ns, sdlba, ctx->nlb);
    if (status) {
        trace_pci_nvme_err_invalid_lba_range(sdlba, ctx->nlb, ns->id_ns.nsze);


@@ 1731,68 2318,176 @@ static void nvme_aio_copy_in_cb(void *opaque, int ret)

    trace_pci_nvme_aio_copy_in_cb(nvme_cid(req));

    if (ret) {
        nvme_aio_err(req, ret);
    }

    ctx->copies--;

    if (ctx->copies) {
        return;
    }

    if (req->status) {
        block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);

        g_free(ctx->bounce);
        g_free(ctx->mbounce);
        g_free(ctx);

        nvme_enqueue_req_completion(nvme_cq(req), req);

        return;
    }

    nvme_copy_in_complete(req);
}

struct nvme_compare_ctx {
    struct {
        QEMUIOVector iov;
        uint8_t *bounce;
    } data;

    struct {
        QEMUIOVector iov;
        uint8_t *bounce;
    } mdata;
};

static void nvme_compare_mdata_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;
    NvmeCtrl *n = nvme_ctrl(req);
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint16_t ctrl = le16_to_cpu(rw->control);
    uint16_t apptag = le16_to_cpu(rw->apptag);
    uint16_t appmask = le16_to_cpu(rw->appmask);
    uint32_t reftag = le32_to_cpu(rw->reftag);
    struct nvme_compare_ctx *ctx = req->opaque;
    g_autofree uint8_t *buf = NULL;
    uint16_t status = NVME_SUCCESS;

    trace_pci_nvme_compare_mdata_cb(nvme_cid(req));

    buf = g_malloc(ctx->mdata.iov.size);

    status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
                               NVME_TX_DIRECTION_TO_DEVICE, req);
    if (status) {
        req->status = status;
        goto out;
    }

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        uint64_t slba = le64_to_cpu(rw->slba);
        uint8_t *bufp;
        uint8_t *mbufp = ctx->mdata.bounce;
        uint8_t *end = mbufp + ctx->mdata.iov.size;
        size_t msize = nvme_msize(ns);
        int16_t pil = 0;

        status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                                ctx->mdata.bounce, ctx->mdata.iov.size, ctrl,
                                slba, apptag, appmask, reftag);
        if (status) {
            req->status = status;
            goto out;
        }

        /*
         * When formatted with protection information, do not compare the DIF
         * tuple.
         */
        if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
            pil = nvme_msize(ns) - sizeof(NvmeDifTuple);
        }

        for (bufp = buf; mbufp < end; bufp += msize, mbufp += msize) {
            if (memcmp(bufp + pil, mbufp + pil, msize - pil)) {
                req->status = NVME_CMP_FAILURE;
                goto out;
            }
        }

        goto out;
    }

    if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
        req->status = NVME_CMP_FAILURE;
        goto out;
    }

out:
    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);

    qemu_iovec_destroy(&ctx->mdata.iov);
    g_free(ctx->mdata.bounce);

    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}
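
The pil value above is the byte offset of the 8-byte DIF tuple within each metadata unit: per the spec the tuple occupies the first eight bytes when DPS bit 3 (FIRST_EIGHT) is set, and the last eight bytes otherwise. A stand-alone mirror of that computation (helper name hypothetical):

#include <stddef.h>
#include <stdbool.h>

#define DIF_TUPLE_SIZE 8    /* sizeof(NvmeDifTuple) */

static size_t dif_tuple_offset(size_t msize, bool first_eight)
{
    return first_eight ? 0 : msize - DIF_TUPLE_SIZE;
}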

static void nvme_compare_cb(void *opaque, int ret)
static void nvme_compare_data_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeCtrl *n = nvme_ctrl(req);
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);

    struct nvme_compare_ctx *ctx = req->opaque;
    g_autofree uint8_t *buf = NULL;
    uint16_t status;

    trace_pci_nvme_compare_cb(nvme_cid(req));
    trace_pci_nvme_compare_data_cb(nvme_cid(req));

    if (!ret) {
        block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
    } else {
        block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
        nvme_aio_err(req, ret);
        goto out;
    }
    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
        goto out;
    }

    buf = g_malloc(ctx->iov.size);
    buf = g_malloc(ctx->data.iov.size);

    status = nvme_h2c(nvme_ctrl(req), buf, ctx->iov.size, req);
    status = nvme_bounce_data(n, buf, ctx->data.iov.size,
                              NVME_TX_DIRECTION_TO_DEVICE, req);
    if (status) {
        req->status = status;
        goto out;
    }

    if (memcmp(buf, ctx->bounce, ctx->iov.size)) {
    if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
        req->status = NVME_CMP_FAILURE;
        goto out;
    }

    if (nvme_msize(ns)) {
        NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
        uint64_t slba = le64_to_cpu(rw->slba);
        uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
        size_t mlen = nvme_m2b(ns, nlb);
        uint64_t offset = ns->mdata_offset + nvme_m2b(ns, slba);

        ctx->mdata.bounce = g_malloc(mlen);

        qemu_iovec_init(&ctx->mdata.iov, 1);
        qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);

        req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
                                    nvme_compare_mdata_cb, req);
        return;
    }

    block_acct_done(stats, acct);

out:
    qemu_iovec_destroy(&ctx->iov);
    g_free(ctx->bounce);
    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);
    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);


@@ 1874,23 2569,95 @@ static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
    return status;
}

static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    size_t len = nvme_l2b(ns, nlb);
    int64_t offset = nvme_l2b(ns, slba);
    uint16_t ctrl = le16_to_cpu(rw->control);
    uint32_t reftag = le32_to_cpu(rw->reftag);
    NvmeBounceContext *ctx = NULL;
    uint16_t status;

    trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        status = nvme_check_prinfo(ns, ctrl, slba, reftag);
        if (status) {
            return status;
        }

        if (ctrl & NVME_RW_PRINFO_PRACT) {
            return NVME_INVALID_PROT_INFO | NVME_DNR;
        }
    }

    if (len > n->page_size << n->params.vsl) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    status = nvme_check_bounds(ns, slba, nlb);
    if (status) {
        trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
        return status;
    }

    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
        status = nvme_check_dulbe(ns, slba, nlb);
        if (status) {
            return status;
        }
    }

    ctx = g_new0(NvmeBounceContext, 1);
    ctx->req = req;

    ctx->data.bounce = g_malloc(len);

    qemu_iovec_init(&ctx->data.iov, 1);
    qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);

    block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
                     BLOCK_ACCT_READ);

    req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
                                nvme_verify_mdata_in_cb, ctx);
    return NVME_NO_COMPLETE;
}
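
The verify length cap uses the same power-of-two encoding as MDTS: the limit is page_size << vsl. A quick worked check with assumed values, matching the vsl=7 default added further down:

#include <assert.h>

int main(void)
{
    unsigned page_size = 4096;    /* assumed 4 KiB controller page size */
    unsigned vsl = 7;             /* default from nvme_props below */

    assert((page_size << vsl) == 512 * 1024);    /* 512 KiB verify limit */
    return 0;
}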

static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
    g_autofree NvmeCopySourceRange *range = NULL;

    uint16_t nr = copy->nr + 1;
    uint8_t format = copy->control[0] & 0xf;

    /*
     * Shift the PRINFOR/PRINFOW values by 10 to allow reusing the
     * NVME_RW_PRINFO constants.
     */
    uint16_t prinfor = ((copy->control[0] >> 4) & 0xf) << 10;
    uint16_t prinfow = ((copy->control[2] >> 2) & 0xf) << 10;

    uint32_t nlb = 0;
    uint8_t *bounce = NULL, *bouncep = NULL;
    uint8_t *mbounce = NULL, *mbouncep = NULL;
    struct nvme_copy_ctx *ctx;
    uint16_t status;
    int i;

    trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
        ((prinfor & NVME_RW_PRINFO_PRACT) != (prinfow & NVME_RW_PRINFO_PRACT))) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (!(n->id_ctrl.ocfs & (1 << format))) {
        trace_pci_nvme_err_copy_invalid_format(format);
        return NVME_INVALID_FIELD | NVME_DNR;


@@ 1900,39 2667,41 @@ static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
        return NVME_CMD_SIZE_LIMIT | NVME_DNR;
    }

    range = g_new(NvmeCopySourceRange, nr);
    ctx = g_new(struct nvme_copy_ctx, 1);
    ctx->ranges = g_new(NvmeCopySourceRange, nr);

    status = nvme_h2c(n, (uint8_t *)range, nr * sizeof(NvmeCopySourceRange),
                      req);
    status = nvme_h2c(n, (uint8_t *)ctx->ranges,
                      nr * sizeof(NvmeCopySourceRange), req);
    if (status) {
        return status;
        goto out;
    }

    for (i = 0; i < nr; i++) {
        uint64_t slba = le64_to_cpu(range[i].slba);
        uint32_t _nlb = le16_to_cpu(range[i].nlb) + 1;
        uint64_t slba = le64_to_cpu(ctx->ranges[i].slba);
        uint32_t _nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1;

        if (_nlb > le16_to_cpu(ns->id_ns.mssrl)) {
            return NVME_CMD_SIZE_LIMIT | NVME_DNR;
            status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
            goto out;
        }

        status = nvme_check_bounds(ns, slba, _nlb);
        if (status) {
            trace_pci_nvme_err_invalid_lba_range(slba, _nlb, ns->id_ns.nsze);
            return status;
            goto out;
        }

        if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
            status = nvme_check_dulbe(ns, slba, _nlb);
            if (status) {
                return status;
                goto out;
            }
        }

        if (ns->params.zoned) {
            status = nvme_check_zone_read(ns, slba, _nlb);
            if (status) {
                return status;
                goto out;
            }
        }



@@ 1940,25 2709,28 @@ static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
    }

    if (nlb > le32_to_cpu(ns->id_ns.mcl)) {
        return NVME_CMD_SIZE_LIMIT | NVME_DNR;
        status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
        goto out;
    }

    bounce = bouncep = g_malloc(nvme_l2b(ns, nlb));
    if (nvme_msize(ns)) {
        mbounce = mbouncep = g_malloc(nvme_m2b(ns, nlb));
    }

    block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
                     BLOCK_ACCT_READ);

    ctx->bounce = bounce;
    ctx->mbounce = mbounce;
    ctx->nlb = nlb;
    ctx->copies = 1;

    req->opaque = ctx;

    for (i = 0; i < nr; i++) {
        uint64_t slba = le64_to_cpu(range[i].slba);
        uint32_t nlb = le16_to_cpu(range[i].nlb) + 1;
        uint64_t slba = le64_to_cpu(ctx->ranges[i].slba);
        uint32_t nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1;

        size_t len = nvme_l2b(ns, nlb);
        int64_t offset = nvme_l2b(ns, slba);


@@ 1977,6 2749,24 @@ static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
                       nvme_aio_copy_in_cb, in_ctx);

        bouncep += len;

        if (nvme_msize(ns)) {
            len = nvme_m2b(ns, nlb);
            offset = ns->mdata_offset + nvme_m2b(ns, slba);

            in_ctx = g_new(struct nvme_copy_in_ctx, 1);
            in_ctx->req = req;

            qemu_iovec_init(&in_ctx->iov, 1);
            qemu_iovec_add(&in_ctx->iov, mbouncep, len);

            ctx->copies++;

            blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0,
                           nvme_aio_copy_in_cb, in_ctx);

            mbouncep += len;
        }
    }

    /* account for the 1-initialization */


@@ 1987,6 2777,12 @@ static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
    }

    return NVME_NO_COMPLETE;

out:
    g_free(ctx->ranges);
    g_free(ctx);

    return status;
}
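
Both NR and NLB are 0's-based on the wire, hence the +1s above; each range is capped by MSSRL and the accumulated length by MCL. A stand-alone sketch of those two checks (names hypothetical):

#include <stdbool.h>
#include <stdint.h>

struct src_range {
    uint64_t slba;
    uint16_t nlb;    /* 0's-based, as in NvmeCopySourceRange */
};

static bool copy_limits_ok(const struct src_range *ranges, uint16_t nr,
                           uint16_t mssrl, uint32_t mcl)
{
    uint32_t total = 0;
    uint16_t i;

    for (i = 0; i < nr; i++) {
        uint32_t nlb = (uint32_t)ranges[i].nlb + 1;

        if (nlb > mssrl) {
            return false;    /* NVME_CMD_SIZE_LIMIT per range */
        }

        total += nlb;
    }

    return total <= mcl;     /* NVME_CMD_SIZE_LIMIT for the command */
}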

static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)


@@ 1996,14 2792,23 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
    BlockBackend *blk = ns->blkconf.blk;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    size_t len = nvme_l2b(ns, nlb);
    uint16_t ctrl = le16_to_cpu(rw->control);
    size_t data_len = nvme_l2b(ns, nlb);
    size_t len = data_len;
    int64_t offset = nvme_l2b(ns, slba);
    uint8_t *bounce = NULL;
    struct nvme_compare_ctx *ctx = NULL;
    uint16_t status;

    trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (ctrl & NVME_RW_PRINFO_PRACT)) {
        return NVME_INVALID_PROT_INFO | NVME_DNR;
    }

    if (nvme_ns_ext(ns)) {
        len += nvme_m2b(ns, nlb);
    }

    status = nvme_check_mdts(n, len);
    if (status) {
        return status;


@@ 2022,18 2827,22 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
        }
    }

    bounce = g_malloc(len);
    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    ctx = g_new(struct nvme_compare_ctx, 1);
    ctx->bounce = bounce;
    ctx->data.bounce = g_malloc(data_len);

    req->opaque = ctx;

    qemu_iovec_init(&ctx->iov, 1);
    qemu_iovec_add(&ctx->iov, bounce, len);
    qemu_iovec_init(&ctx->data.iov, 1);
    qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);

    block_acct_start(blk_get_stats(blk), &req->acct, len, BLOCK_ACCT_READ);
    blk_aio_preadv(blk, offset, &ctx->iov, 0, nvme_compare_cb, req);
    block_acct_start(blk_get_stats(blk), &req->acct, data_len,
                     BLOCK_ACCT_READ);
    blk_aio_preadv(blk, offset, &ctx->data.iov, 0, nvme_compare_data_cb, req);

    return NVME_NO_COMPLETE;
}


@@ 2056,7 2865,7 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)

        block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
                         BLOCK_ACCT_FLUSH);
        req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req);
        req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_misc_cb, req);
        return NVME_NO_COMPLETE;
    }



@@ 2098,14 2907,28 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
    NvmeNamespace *ns = req->ns;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
    uint16_t ctrl = le16_to_cpu(rw->control);
    uint64_t data_size = nvme_l2b(ns, nlb);
    uint64_t mapped_size = data_size;
    uint64_t data_offset;
    BlockBackend *blk = ns->blkconf.blk;
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        mapped_size += nvme_m2b(ns, nlb);

        if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
            bool pract = ctrl & NVME_RW_PRINFO_PRACT;

            if (pract && nvme_msize(ns) == 8) {
                mapped_size = data_size;
            }
        }
    }

    trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, data_size, slba);
    trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);

    status = nvme_check_mdts(n, data_size);
    status = nvme_check_mdts(n, mapped_size);
    if (status) {
        goto invalid;
    }


@@ 2124,11 2947,6 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
        }
    }

    status = nvme_map_dptr(n, &req->sg, data_size, &req->cmd);
    if (status) {
        goto invalid;
    }

    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
        status = nvme_check_dulbe(ns, slba, nlb);
        if (status) {


@@ 2136,6 2954,15 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
        }
    }

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        return nvme_dif_rw(n, req);
    }

    status = nvme_map_data(n, nlb, req);
    if (status) {
        goto invalid;
    }

    data_offset = nvme_l2b(ns, slba);

    block_acct_start(blk_get_stats(blk), &req->acct, data_size,


@@ 2155,18 2982,32 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
    NvmeNamespace *ns = req->ns;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
    uint16_t ctrl = le16_to_cpu(rw->control);
    uint64_t data_size = nvme_l2b(ns, nlb);
    uint64_t mapped_size = data_size;
    uint64_t data_offset;
    NvmeZone *zone;
    NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
    BlockBackend *blk = ns->blkconf.blk;
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        mapped_size += nvme_m2b(ns, nlb);

        if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
            bool pract = ctrl & NVME_RW_PRINFO_PRACT;

            if (pract && nvme_msize(ns) == 8) {
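                /*
                 * With PRACT set and a single 8-byte metadata format, the
                 * protection information is generated (write) or stripped
                 * (read) by the controller itself and is not transferred
                 * from the host, so it does not count towards the mapped
                 * size.
                 */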
                mapped_size -= nvme_m2b(ns, nlb);
            }
        }
    }

    trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
                         nvme_nsid(ns), nlb, data_size, slba);
                         nvme_nsid(ns), nlb, mapped_size, slba);

    if (!wrz) {
        status = nvme_check_mdts(n, data_size);
        status = nvme_check_mdts(n, mapped_size);
        if (status) {
            goto invalid;
        }


@@ 2182,19 3023,47 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
        zone = nvme_get_zone_by_slba(ns, slba);

        if (append) {
            bool piremap = !!(ctrl & NVME_RW_PIREMAP);

            if (unlikely(slba != zone->d.zslba)) {
                trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
                status = NVME_INVALID_FIELD;
                goto invalid;
            }

            if (n->params.zasl && data_size > n->page_size << n->params.zasl) {
            if (n->params.zasl &&
                data_size > (uint64_t)n->page_size << n->params.zasl) {
                trace_pci_nvme_err_zasl(data_size);
                return NVME_INVALID_FIELD | NVME_DNR;
            }

            slba = zone->w_ptr;
            rw->slba = cpu_to_le64(slba);
            res->slba = cpu_to_le64(slba);

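            /*
             * With PIREMAP set, the initial reference tag is rebased from
             * the zone start LBA to the LBA the append actually lands on,
             * advancing it by (w_ptr - zslba) so that Type 1/2 reference
             * tag checks still line up.
             */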
            switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
            case NVME_ID_NS_DPS_TYPE_1:
                if (!piremap) {
                    return NVME_INVALID_PROT_INFO | NVME_DNR;
                }

                /* fallthrough */

            case NVME_ID_NS_DPS_TYPE_2:
                if (piremap) {
                    uint32_t reftag = le32_to_cpu(rw->reftag);
                    rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
                }

                break;

            case NVME_ID_NS_DPS_TYPE_3:
                if (piremap) {
                    return NVME_INVALID_PROT_INFO | NVME_DNR;
                }

                break;
            }
        }

        status = nvme_check_zone_write(ns, zone, slba, nlb);


@@ 2212,8 3081,12 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,

    data_offset = nvme_l2b(ns, slba);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        return nvme_dif_rw(n, req);
    }

    if (!wrz) {
        status = nvme_map_dptr(n, &req->sg, data_size, &req->cmd);
        status = nvme_map_data(n, nlb, req);
        if (status) {
            goto invalid;
        }


@@ 2226,6 3099,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
                                           BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
                                           req);
    }

    return NVME_NO_COMPLETE;

invalid:


@@ 2619,12 3493,13 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
    uint32_t zone_idx, zra, zrasf, partial;
    uint64_t max_zones, nr_zones = 0;
    uint16_t status;
    uint64_t slba, capacity = nvme_ns_nlbas(ns);
    uint64_t slba;
    NvmeZoneDescr *z;
    NvmeZone *zone;
    NvmeZoneReportHeader *header;
    void *buf, *buf_p;
    size_t zone_entry_sz;
    int i;

    req->status = NVME_SUCCESS;



@@ 2666,7 3541,7 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
    buf = g_malloc0(data_size);

    zone = &ns->zone_array[zone_idx];
    for (; slba < capacity; slba += ns->zone_size) {
    for (i = zone_idx; i < ns->num_zones; i++) {
        if (partial && nr_zones >= max_zones) {
            break;
        }


@@ 2718,6 3593,7 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
{
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
    uint16_t status;

    trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
                          req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));


@@ 2759,6 3635,11 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
        return NVME_INVALID_OPCODE | NVME_DNR;
    }

    status = nvme_ns_status(req->ns);
    if (unlikely(status)) {
        return status;
    }

    switch (req->cmd.opcode) {
    case NVME_CMD_WRITE_ZEROES:
        return nvme_write_zeroes(n, req);


@@ 2772,6 3653,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
        return nvme_compare(n, req);
    case NVME_CMD_DSM:
        return nvme_dsm(n, req);
    case NVME_CMD_VERIFY:
        return nvme_verify(n, req);
    case NVME_CMD_COPY:
        return nvme_copy(n, req);
    case NVME_CMD_ZONE_MGMT_SEND:


@@ 3288,12 4171,14 @@ static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
    NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;

    trace_pci_nvme_identify_ctrl_csi(c->csi);

    switch (c->csi) {
    case NVME_CSI_NVM:
        ((NvmeIdCtrlNvm *)&id)->dmrsl = cpu_to_le32(n->dmrsl);
        id_nvm->vsl = n->params.vsl;
        id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
        break;

    case NVME_CSI_ZONED:


@@ 4056,6 4941,134 @@ static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
    return NVME_SUCCESS;
}

static uint16_t nvme_format_ns(NvmeCtrl *n, NvmeNamespace *ns, uint8_t lbaf,
                               uint8_t mset, uint8_t pi, uint8_t pil,
                               NvmeRequest *req)
{
    int64_t len, offset;
    struct nvme_aio_format_ctx *ctx;
    BlockBackend *blk = ns->blkconf.blk;
    uint16_t ms;
    uintptr_t *num_formats = (uintptr_t *)&req->opaque;
    int *count;

    if (ns->params.zoned) {
        return NVME_INVALID_FORMAT | NVME_DNR;
    }

    trace_pci_nvme_format_ns(nvme_cid(req), nvme_nsid(ns), lbaf, mset, pi, pil);

    if (lbaf > ns->id_ns.nlbaf) {
        return NVME_INVALID_FORMAT | NVME_DNR;
    }

    ms = ns->id_ns.lbaf[lbaf].ms;

    if (pi && (ms < sizeof(NvmeDifTuple))) {
        return NVME_INVALID_FORMAT | NVME_DNR;
    }

    if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    nvme_ns_drain(ns);
    nvme_ns_shutdown(ns);
    nvme_ns_cleanup(ns);

    ns->id_ns.dps = (pil << 3) | pi;
    ns->id_ns.flbas = lbaf | (mset << 4);

    nvme_ns_init_format(ns);

    ns->status = NVME_FORMAT_IN_PROGRESS;

    len = ns->size;
    offset = 0;

    count = g_new(int, 1);
    *count = 1;

    (*num_formats)++;

    while (len) {
        ctx = g_new(struct nvme_aio_format_ctx, 1);
        ctx->req = req;
        ctx->ns = ns;
        ctx->count = count;

        size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);

        (*count)++;

        blk_aio_pwrite_zeroes(blk, offset, bytes, BDRV_REQ_MAY_UNMAP,
                              nvme_aio_format_cb, ctx);

        offset += bytes;
        len -= bytes;

    }

    (*count)--;

    return NVME_NO_COMPLETE;
}

static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns;
    uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
    uint8_t lbaf = dw10 & 0xf;
    uint8_t mset = (dw10 >> 4) & 0x1;
    uint8_t pi = (dw10 >> 5) & 0x7;
    uint8_t pil = (dw10 >> 8) & 0x1;
    uintptr_t *num_formats = (uintptr_t *)&req->opaque;
    uint16_t status;
    int i;

    trace_pci_nvme_format(nvme_cid(req), nsid, lbaf, mset, pi, pil);

    /* 1-initialize; see the comment in nvme_dsm */
    *num_formats = 1;

    if (nsid != NVME_NSID_BROADCAST) {
        if (!nvme_nsid_valid(n, nsid)) {
            return NVME_INVALID_NSID | NVME_DNR;
        }

        ns = nvme_ns(n, nsid);
        if (!ns) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        status = nvme_format_ns(n, ns, lbaf, mset, pi, pil, req);
        if (status && status != NVME_NO_COMPLETE) {
            req->status = status;
        }
    } else {
        for (i = 1; i <= n->num_namespaces; i++) {
            ns = nvme_ns(n, i);
            if (!ns) {
                continue;
            }

            status = nvme_format_ns(n, ns, lbaf, mset, pi, pil, req);
            if (status && status != NVME_NO_COMPLETE) {
                req->status = status;
                break;
            }
        }
    }

    /* account for the 1-initialization */
    if (--(*num_formats)) {
        return NVME_NO_COMPLETE;
    }

    return req->status;
}
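
nvme_format_ns above issues one pwrite_zeroes per bounded chunk, each chunk holding a reference on the shared counter. A sketch of just the chunk arithmetic (MAX_ZERO_BYTES is a stand-in for BDRV_REQUEST_MAX_BYTES):

#include <stdint.h>

#define MAX_ZERO_BYTES (1 << 30)    /* stand-in for BDRV_REQUEST_MAX_BYTES */

static unsigned format_chunks(int64_t len)
{
    unsigned chunks = 0;

    while (len) {
        int64_t bytes = len < MAX_ZERO_BYTES ? len : MAX_ZERO_BYTES;

        len -= bytes;
        chunks++;    /* one blk_aio_pwrite_zeroes per chunk */
    }

    return chunks;
}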

static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
{
    trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,


@@ 4094,6 5107,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
        return nvme_aer(n, req);
    case NVME_ADM_CMD_NS_ATTACHMENT:
        return nvme_ns_attachment(n, req);
    case NVME_ADM_CMD_FORMAT_NVM:
        return nvme_format(n, req);
    default:
        assert(false);
    }


@@ 4836,6 5851,11 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
                   "than or equal to mdts (Maximum Data Transfer Size)");
        return;
    }

    if (!n->params.vsl) {
        error_setg(errp, "vsl must be non-zero");
        return;
    }
}

static void nvme_init_state(NvmeCtrl *n)


@@ 5065,7 6085,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)

    id->mdts = n->params.mdts;
    id->ver = cpu_to_le32(NVME_SPEC_VER);
    id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT);
    id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
    id->cntrltype = 0x1;

    /*


@@ 5236,6 6256,7 @@ static Property nvme_props[] = {
    DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
    DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
    DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
    DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
    DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
    DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
    DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),

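An illustrative way to exercise the new properties from the command line (the ms/pi knobs live on the nvme-ns device and are added elsewhere in this series; exact names assumed):

    -drive id=nvm,file=nvm.img,format=raw,if=none \
    -device nvme,serial=deadbeef,mdts=7,vsl=7 \
    -device nvme-ns,drive=nvm,ms=8,pi=1
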
M hw/block/nvme.h => hw/block/nvme.h +42 -2
@@ 2,6 2,7 @@
#define HW_NVME_H

#include "block/nvme.h"
#include "hw/pci/pci.h"
#include "nvme-subsys.h"
#include "nvme-ns.h"



@@ 25,6 26,7 @@ typedef struct NvmeParams {
    uint8_t  aerl;
    uint32_t aer_max_queued;
    uint8_t  mdts;
    uint8_t  vsl;
    bool     use_intel_id;
    uint8_t  zasl;
    bool     legacy_cmb;


@@ 62,6 64,15 @@ typedef struct NvmeRequest {
    QTAILQ_ENTRY(NvmeRequest)entry;
} NvmeRequest;

typedef struct NvmeBounceContext {
    NvmeRequest *req;

    struct {
        QEMUIOVector iov;
        uint8_t *bounce;
    } data, mdata;
} NvmeBounceContext;

static inline const char *nvme_adm_opc_str(uint8_t opc)
{
    switch (opc) {


@@ 75,6 86,7 @@ static inline const char *nvme_adm_opc_str(uint8_t opc)
    case NVME_ADM_CMD_SET_FEATURES:     return "NVME_ADM_CMD_SET_FEATURES";
    case NVME_ADM_CMD_GET_FEATURES:     return "NVME_ADM_CMD_GET_FEATURES";
    case NVME_ADM_CMD_ASYNC_EV_REQ:     return "NVME_ADM_CMD_ASYNC_EV_REQ";
    case NVME_ADM_CMD_FORMAT_NVM:       return "NVME_ADM_CMD_FORMAT_NVM";
    default:                            return "NVME_ADM_CMD_UNKNOWN";
    }
}


@@ 88,6 100,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
    case NVME_CMD_COMPARE:          return "NVME_NVM_CMD_COMPARE";
    case NVME_CMD_WRITE_ZEROES:     return "NVME_NVM_CMD_WRITE_ZEROES";
    case NVME_CMD_DSM:              return "NVME_NVM_CMD_DSM";
    case NVME_CMD_VERIFY:           return "NVME_NVM_CMD_VERIFY";
    case NVME_CMD_COPY:             return "NVME_NVM_CMD_COPY";
    case NVME_CMD_ZONE_MGMT_SEND:   return "NVME_ZONED_CMD_MGMT_SEND";
    case NVME_CMD_ZONE_MGMT_RECV:   return "NVME_ZONED_CMD_MGMT_RECV";


@@ 236,12 249,18 @@ static inline bool nvme_ns_is_attached(NvmeCtrl *n, NvmeNamespace *ns)

static inline void nvme_ns_attach(NvmeCtrl *n, NvmeNamespace *ns)
{
    n->namespaces[nvme_nsid(ns) - 1] = ns;
    uint32_t nsid = nvme_nsid(ns);
    assert(nsid && nsid <= NVME_MAX_NAMESPACES);

    n->namespaces[nsid - 1] = ns;
}

static inline void nvme_ns_detach(NvmeCtrl *n, NvmeNamespace *ns)
{
    n->namespaces[nvme_nsid(ns) - 1] = NULL;
    uint32_t nsid = nvme_nsid(ns);
    assert(nsid && nsid <= NVME_MAX_NAMESPACES);

    n->namespaces[nsid - 1] = NULL;
}

static inline NvmeCQueue *nvme_cq(NvmeRequest *req)


@@ 258,6 277,27 @@ static inline NvmeCtrl *nvme_ctrl(NvmeRequest *req)
    return sq->ctrl;
}

static inline uint16_t nvme_cid(NvmeRequest *req)
{
    if (!req) {
        return 0xffff;
    }

    return le16_to_cpu(req->cqe.cid);
}

typedef enum NvmeTxDirection {
    NVME_TX_DIRECTION_TO_DEVICE   = 0,
    NVME_TX_DIRECTION_FROM_DEVICE = 1,
} NvmeTxDirection;

int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                          NvmeTxDirection dir, NvmeRequest *req);
uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                           NvmeTxDirection dir, NvmeRequest *req);
void nvme_rw_complete_cb(void *opaque, int ret);
uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                       NvmeCmd *cmd);

#endif /* HW_NVME_H */

M hw/block/trace-events => hw/block/trace-events +21 -1
@@ 41,19 41,39 @@ pci_nvme_map_sgl(uint8_t typ, uint64_t len) "type 0x%"PRIx8" len %"PRIu64""
pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
pci_nvme_flush(uint16_t cid, uint32_t nsid) "cid %"PRIu16" nsid %"PRIu32""
pci_nvme_format(uint16_t cid, uint32_t nsid, uint8_t lbaf, uint8_t mset, uint8_t pi, uint8_t pil) "cid %"PRIu16" nsid %"PRIu32" lbaf %"PRIu8" mset %"PRIu8" pi %"PRIu8" pil %"PRIu8""
pci_nvme_format_ns(uint16_t cid, uint32_t nsid, uint8_t lbaf, uint8_t mset, uint8_t pi, uint8_t pil) "cid %"PRIu16" nsid %"PRIu32" lbaf %"PRIu8" mset %"PRIu8" pi %"PRIu8" pil %"PRIu8""
pci_nvme_format_cb(uint16_t cid, uint32_t nsid) "cid %"PRIu16" nsid %"PRIu32""
pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
pci_nvme_misc_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
pci_nvme_dif_rw(uint8_t pract, uint8_t prinfo) "pract 0x%"PRIx8" prinfo 0x%"PRIx8""
pci_nvme_dif_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
pci_nvme_dif_rw_mdata_in_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
pci_nvme_dif_rw_mdata_out_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
pci_nvme_dif_rw_check_cb(uint16_t cid, uint8_t prinfo, uint16_t apptag, uint16_t appmask, uint32_t reftag) "cid %"PRIu16" prinfo 0x%"PRIx8" apptag 0x%"PRIx16" appmask 0x%"PRIx16" reftag 0x%"PRIx32""
pci_nvme_dif_pract_generate_dif(size_t len, size_t lba_size, size_t chksum_len, uint16_t apptag, uint32_t reftag) "len %zu lba_size %zu chksum_len %zu apptag 0x%"PRIx16" reftag 0x%"PRIx32""
pci_nvme_dif_check(uint8_t prinfo, uint16_t chksum_len) "prinfo 0x%"PRIx8" chksum_len %"PRIu16""
pci_nvme_dif_prchk_disabled(uint16_t apptag, uint32_t reftag) "apptag 0x%"PRIx16" reftag 0x%"PRIx32""
pci_nvme_dif_prchk_guard(uint16_t guard, uint16_t crc) "guard 0x%"PRIx16" crc 0x%"PRIx16""
pci_nvme_dif_prchk_apptag(uint16_t apptag, uint16_t elbat, uint16_t elbatm) "apptag 0x%"PRIx16" elbat 0x%"PRIx16" elbatm 0x%"PRIx16""
pci_nvme_dif_prchk_reftag(uint32_t reftag, uint32_t elbrt) "reftag 0x%"PRIx32" elbrt 0x%"PRIx32""
pci_nvme_copy(uint16_t cid, uint32_t nsid, uint16_t nr, uint8_t format) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu16" format 0x%"PRIx8""
pci_nvme_copy_source_range(uint64_t slba, uint32_t nlb) "slba 0x%"PRIx64" nlb %"PRIu32""
pci_nvme_copy_in_complete(uint16_t cid) "cid %"PRIu16""
pci_nvme_copy_cb(uint16_t cid) "cid %"PRIu16""
pci_nvme_verify(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32""
pci_nvme_verify_mdata_in_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
pci_nvme_verify_cb(uint16_t cid, uint8_t prinfo, uint16_t apptag, uint16_t appmask, uint32_t reftag) "cid %"PRIu16" prinfo 0x%"PRIx8" apptag 0x%"PRIx16" appmask 0x%"PRIx16" reftag 0x%"PRIx32""
pci_nvme_rw_complete_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d"
pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32""
pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
pci_nvme_dsm_single_range_limit_exceeded(uint32_t nlb, uint32_t dmrsl) "nlb %"PRIu32" dmrsl %"PRIu32""
pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32""
pci_nvme_compare_cb(uint16_t cid) "cid %"PRIu16""
pci_nvme_compare_data_cb(uint16_t cid) "cid %"PRIu16""
pci_nvme_compare_mdata_cb(uint16_t cid) "cid %"PRIu16""
pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16""
pci_nvme_aio_copy_in_cb(uint16_t cid) "cid %"PRIu16""
pci_nvme_aio_zone_reset_cb(uint16_t cid, uint64_t zslba) "cid %"PRIu16" zslba 0x%"PRIx64""

M include/block/nvme.h => include/block/nvme.h +23 -6
@@ 580,6 580,7 @@ enum NvmeIoCommands {
    NVME_CMD_COMPARE            = 0x05,
    NVME_CMD_WRITE_ZEROES       = 0x08,
    NVME_CMD_DSM                = 0x09,
    NVME_CMD_VERIFY             = 0x0c,
    NVME_CMD_COPY               = 0x19,
    NVME_CMD_ZONE_MGMT_SEND     = 0x79,
    NVME_CMD_ZONE_MGMT_RECV     = 0x7a,


@@ 696,12 697,17 @@ enum {
    NVME_RW_DSM_LATENCY_LOW     = 3 << 4,
    NVME_RW_DSM_SEQ_REQ         = 1 << 6,
    NVME_RW_DSM_COMPRESSED      = 1 << 7,
    NVME_RW_PIREMAP             = 1 << 9,
    NVME_RW_PRINFO_PRACT        = 1 << 13,
    NVME_RW_PRINFO_PRCHK_GUARD  = 1 << 12,
    NVME_RW_PRINFO_PRCHK_APP    = 1 << 11,
    NVME_RW_PRINFO_PRCHK_REF    = 1 << 10,
    NVME_RW_PRINFO_PRCHK_MASK   = 7 << 10,

};

#define NVME_RW_PRINFO(control) ((control >> 10) & 0xf)
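
The shift by 10 applied to the Copy command's PRINFOR/PRINFOW nibbles in nvme_copy lines up with this layout, PRACT in bit 13 and the PRCHK bits in 12..10. A quick illustrative check, using local mirror constants:

#include <assert.h>

enum {
    PRINFO_PRACT     = 1 << 13,    /* mirrors NVME_RW_PRINFO_PRACT */
    PRINFO_PRCHK_REF = 1 << 10,    /* mirrors NVME_RW_PRINFO_PRCHK_REF */
};

int main(void)
{
    unsigned nibble = 0x9;    /* PRACT plus reference tag check */

    assert((nibble << 10) == (PRINFO_PRACT | PRINFO_PRCHK_REF));
    return 0;
}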

typedef struct QEMU_PACKED NvmeDsmCmd {
    uint8_t     opcode;
    uint8_t     flags;


@@ 822,6 828,7 @@ enum NvmeStatusCodes {
    NVME_CAP_EXCEEDED           = 0x0081,
    NVME_NS_NOT_READY           = 0x0082,
    NVME_NS_RESV_CONFLICT       = 0x0083,
    NVME_FORMAT_IN_PROGRESS     = 0x0084,
    NVME_INVALID_CQID           = 0x0100,
    NVME_INVALID_QID            = 0x0101,
    NVME_MAX_QSIZE_EXCEEDED     = 0x0102,


@@ 1079,6 1086,7 @@ enum NvmeIdCtrlOncs {
    NVME_ONCS_FEATURES      = 1 << 4,
    NVME_ONCS_RESRVATIONS   = 1 << 5,
    NVME_ONCS_TIMESTAMP     = 1 << 6,
    NVME_ONCS_VERIFY        = 1 << 7,
    NVME_ONCS_COPY          = 1 << 8,
};



@@ 1324,14 1332,22 @@ typedef struct QEMU_PACKED NvmeIdNsZoned {
#define NVME_ID_NS_DPC_TYPE_MASK            0x7

enum NvmeIdNsDps {
    DPS_TYPE_NONE   = 0,
    DPS_TYPE_1      = 1,
    DPS_TYPE_2      = 2,
    DPS_TYPE_3      = 3,
    DPS_TYPE_MASK   = 0x7,
    DPS_FIRST_EIGHT = 8,
    NVME_ID_NS_DPS_TYPE_NONE   = 0,
    NVME_ID_NS_DPS_TYPE_1      = 1,
    NVME_ID_NS_DPS_TYPE_2      = 2,
    NVME_ID_NS_DPS_TYPE_3      = 3,
    NVME_ID_NS_DPS_TYPE_MASK   = 0x7,
    NVME_ID_NS_DPS_FIRST_EIGHT = 8,
};

#define NVME_ID_NS_DPS_TYPE(dps) (dps & NVME_ID_NS_DPS_TYPE_MASK)

typedef struct NvmeDifTuple {
    uint16_t guard;
    uint16_t apptag;
    uint32_t reftag;
} NvmeDifTuple;
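
On the medium these fields are big-endian, so generation byte-swaps on little-endian hosts. A hedged stand-alone version (htons/htonl stand in for the cpu_to_be16/32 conversions used in nvme-dif.c):

#include <stdint.h>
#include <arpa/inet.h>

struct dif_tuple {
    uint16_t guard;     /* CRC16 (T10-DIF) over the data block */
    uint16_t apptag;    /* application tag */
    uint32_t reftag;    /* reference tag; LBA-based for Type 1 */
};

static void dif_tuple_fill(struct dif_tuple *t, uint16_t crc,
                           uint16_t apptag, uint32_t reftag)
{
    t->guard = htons(crc);
    t->apptag = htons(apptag);
    t->reftag = htonl(reftag);
}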

enum NvmeZoneAttr {
    NVME_ZA_FINISHED_BY_CTLR         = 1 << 0,
    NVME_ZA_FINISH_RECOMMENDED       = 1 << 1,


@@ 1428,5 1444,6 @@ static inline void _nvme_check_size(void)
    QEMU_BUILD_BUG_ON(sizeof(NvmeSglDescriptor) != 16);
    QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsDescr) != 4);
    QEMU_BUILD_BUG_ON(sizeof(NvmeZoneDescr) != 64);
    QEMU_BUILD_BUG_ON(sizeof(NvmeDifTuple) != 8);
}
#endif