From 6f7b9f7b6fdf8486e2f17fe3ddaeafe598c3b3a6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 30 Mar 2023 14:23:40 +0200 Subject: [PATCH 01/76] vnc: avoid underflow when accessing user-provided address If hostlen is zero, there is a possibility that addrstr[hostlen - 1] underflows and, if a closing bracked is there, hostlen - 2 is passed to g_strndup() on the next line. If websocket==false then addrstr[0] would be a colon, but if websocket==true this could in principle happen. Fix it by checking hostlen. Reported by Coverity. Signed-off-by: Paolo Bonzini (cherry picked from commit 3f9c41c5df9617510d8533cf6588172efb3df34b) Signed-off-by: Michael Tokarev --- ui/vnc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ui/vnc.c b/ui/vnc.c index bbd8b6baae..9d8a24dd8a 100644 --- a/ui/vnc.c +++ b/ui/vnc.c @@ -3751,7 +3751,7 @@ static int vnc_display_get_address(const char *addrstr, addr->type = SOCKET_ADDRESS_TYPE_INET; inet = &addr->u.inet; - if (addrstr[0] == '[' && addrstr[hostlen - 1] == ']') { + if (hostlen && addrstr[0] == '[' && addrstr[hostlen - 1] == ']') { inet->host = g_strndup(addrstr + 1, hostlen - 2); } else { inet->host = g_strndup(addrstr, hostlen); From 3ed99d232c1173aff3806908e29f29b30af7d58e Mon Sep 17 00:00:00 2001 From: Yang Zhong Date: Thu, 6 Apr 2023 02:40:41 -0400 Subject: [PATCH 02/76] target/i386: Change wrong XFRM value in SGX CPUID leaf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous patch wrongly replaced FEAT_XSAVE_XCR0_{LO|HI} with FEAT_XSAVE_XSS_{LO|HI} in CPUID(EAX=12,ECX=1):{ECX,EDX}. As a result, SGX enclaves only supported SSE and x87 feature (xfrm=0x3). Fixes: 301e90675c3f ("target/i386: Enable support for XSAVES based features") Signed-off-by: Yang Zhong Reviewed-by: Yang Weijiang Reviewed-by: Kai Huang Message-Id: <20230406064041.420039-1-yang.zhong@linux.intel.com> Signed-off-by: Paolo Bonzini (cherry picked from commit 72497cff896fecf74306ed33626c30e43633cdd6) Signed-off-by: Michael Tokarev --- target/i386/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 6576287e5b..f083ff4335 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -5718,8 +5718,8 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, } else { *eax &= env->features[FEAT_SGX_12_1_EAX]; *ebx &= 0; /* ebx reserve */ - *ecx &= env->features[FEAT_XSAVE_XSS_LO]; - *edx &= env->features[FEAT_XSAVE_XSS_HI]; + *ecx &= env->features[FEAT_XSAVE_XCR0_LO]; + *edx &= env->features[FEAT_XSAVE_XCR0_HI]; /* FP and SSE are always allowed regardless of XSAVE/XCR0. */ *ecx |= XSTATE_FP_MASK | XSTATE_SSE_MASK; From ac7f07ebc8facf0c96905db8b6b6a453b284edfb Mon Sep 17 00:00:00 2001 From: Axel Heider Date: Thu, 20 Apr 2023 10:21:14 +0100 Subject: [PATCH 03/76] hw/timer/imx_epit: don't shadow variable Fix issue reported by Coverity. Signed-off-by: Axel Heider Message-id: 168070611775.20412.2883242077302841473-1@git.sr.ht Reviewed-by: Peter Maydell Signed-off-by: Peter Maydell (cherry picked from commit 542fd43d79327dabe62e49ff584ca60b6184923a) Signed-off-by: Michael Tokarev --- hw/timer/imx_epit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/timer/imx_epit.c b/hw/timer/imx_epit.c index 3a869782bc..0821c62cd1 100644 --- a/hw/timer/imx_epit.c +++ b/hw/timer/imx_epit.c @@ -179,7 +179,7 @@ static void imx_epit_update_compare_timer(IMXEPITState *s) * the compare value. Otherwise it may fire at most once in the * current round. */ - bool is_oneshot = (limit >= s->cmp); + is_oneshot = (limit >= s->cmp); if (counter >= s->cmp) { /* The compare timer fires in the current round. */ counter -= s->cmp; From 134a1a33207723bf0adc35e2272b6a147484d4b8 Mon Sep 17 00:00:00 2001 From: Axel Heider Date: Thu, 20 Apr 2023 10:21:14 +0100 Subject: [PATCH 04/76] hw/timer/imx_epit: fix limit check Fix the limit check. If the limit is less than the compare value, the timer can never reach this value, thus it will never fire. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1491 Signed-off-by: Axel Heider Message-id: 168070611775.20412.2883242077302841473-2@git.sr.ht Reviewed-by: Peter Maydell Signed-off-by: Peter Maydell (cherry picked from commit 25d758175dfbfd53e02b4a52ac68cbd6eb05f648) Signed-off-by: Michael Tokarev --- hw/timer/imx_epit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/timer/imx_epit.c b/hw/timer/imx_epit.c index 0821c62cd1..640e4399c2 100644 --- a/hw/timer/imx_epit.c +++ b/hw/timer/imx_epit.c @@ -179,7 +179,7 @@ static void imx_epit_update_compare_timer(IMXEPITState *s) * the compare value. Otherwise it may fire at most once in the * current round. */ - is_oneshot = (limit >= s->cmp); + is_oneshot = (limit < s->cmp); if (counter >= s->cmp) { /* The compare timer fires in the current round. */ counter -= s->cmp; From bb47b5bc2e7a1e7e12c542b8e6510fa37c23b659 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Tue, 18 Apr 2023 11:04:49 +0200 Subject: [PATCH 05/76] acpi: pcihp: allow repeating hot-unplug requests with Q35 using ACPI PCI hotplug by default, user's request to unplug device is ignored when it's issued before guest OS has been booted. And any additional attempt to request device hot-unplug afterwards results in following error: "Device XYZ is already in the process of unplug" arguably it can be considered as a regression introduced by [2], before which it was possible to issue unplug request multiple times. Accept new uplug requests after timeout (1ms). This brings ACPI PCI hotplug on par with native PCIe unplug behavior [1] and allows user to repeat unplug requests at propper times. Set expire timeout to arbitrary 1msec so user won't be able to flood guest with SCI interrupts by calling device_del in tight loop. PS: ACPI spec doesn't mandate what OSPM can do with GPEx.status bits set before it's booted => it's impl. depended. Status bits may be retained (I tested with one Windows version) or cleared (Linux since 2.6 kernel times) during guest's ACPI subsystem initialization. Clearing status bits (though not wrong per se) hides the unplug event from guest, and it's upto user to repeat device_del later when guest is able to handle unplug requests. 1) 18416c62e3 ("pcie: expire pending delete") 2) Fixes: cce8944cc9ef ("qdev-monitor: Forbid repeated device_del") Signed-off-by: Igor Mammedov Acked-by: Gerd Hoffmann CC: mst@redhat.com CC: anisinha@redhat.com CC: jusual@redhat.com CC: kraxel@redhat.com Message-Id: <20230418090449.2155757-1-imammedo@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Ani Sinha (cherry picked from commit 0f689cf5ada4d5df5ab95c7f7aa9fc221afa855d) Signed-off-by: Michael Tokarev --- hw/acpi/pcihp.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c index dcfb779a7a..cdd6f775a1 100644 --- a/hw/acpi/pcihp.c +++ b/hw/acpi/pcihp.c @@ -357,6 +357,16 @@ void acpi_pcihp_device_unplug_request_cb(HotplugHandler *hotplug_dev, * acpi_pcihp_eject_slot() when the operation is completed. */ pdev->qdev.pending_deleted_event = true; + /* if unplug was requested before OSPM is initialized, + * linux kernel will clear GPE0.sts[] bits during boot, which effectively + * hides unplug event. And than followup qmp_device_del() calls remain + * blocked by above flag permanently. + * Unblock qmp_device_del() by setting expire limit, so user can + * repeat unplug request later when OSPM has been booted. + */ + pdev->qdev.pending_deleted_expires_ms = + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); /* 1 msec */ + s->acpi_pcihp_pci_status[bsel].down |= (1U << slot); acpi_send_event(DEVICE(hotplug_dev), ACPI_PCI_HOTPLUG_STATUS); } From f528cfc3fa91dacc0f5e0bf494164f7d4548e35f Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Mon, 24 Apr 2023 18:39:02 +0800 Subject: [PATCH 06/76] block/monitor: Fix crash when executing HMP commit hmp_commit() calls blk_is_available() from a non-coroutine context (and in the main loop). blk_is_available() is a co_wrapper_mixed_bdrv_rdlock function, and in the non-coroutine context it calls AIO_WAIT_WHILE(), which crashes if the aio_context lock is not taken before. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1615 Signed-off-by: Wang Liang Message-Id: <20230424103902.45265-1-wangliangzz@126.com> Reviewed-by: Emanuele Giuseppe Esposito Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf (cherry picked from commit 8c1e8fb2e7fc2cbeb57703e143965a4cd3ad301a) Signed-off-by: Michael Tokarev --- block/monitor/block-hmp-cmds.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index 2846083546..ca2599de44 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -214,15 +214,17 @@ void hmp_commit(Monitor *mon, const QDict *qdict) error_report("Device '%s' not found", device); return; } - if (!blk_is_available(blk)) { - error_report("Device '%s' has no medium", device); - return; - } bs = bdrv_skip_implicit_filters(blk_bs(blk)); aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); + if (!blk_is_available(blk)) { + error_report("Device '%s' has no medium", device); + aio_context_release(aio_context); + return; + } + ret = bdrv_commit(bs); aio_context_release(aio_context); From 8c3cf36260b428087cce801166c0bd1a7f967dc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Mon, 24 Apr 2023 10:22:37 +0100 Subject: [PATCH 07/76] qemu-options: finesse the recommendations around -blockdev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We are a bit premature in recommending -blockdev/-device as the best way to configure block devices. It seems there are times the more human friendly -drive still makes sense especially when -snapshot is involved. Improve the language to hopefully make things clearer. Suggested-by: Michael Tokarev Signed-off-by: Alex Bennée Reviewed-by: Thomas Huth Cc: Markus Armbruster Cc: Kevin Wolf Message-Id: <20230424092249.58552-7-alex.bennee@linaro.org> (cherry picked from commit c1654c3e37c31fb638597efedcd07d071837b78b) Signed-off-by: Michael Tokarev --- qemu-options.hx | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/qemu-options.hx b/qemu-options.hx index 59bdf67a2c..4b8855a4f7 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -1143,10 +1143,22 @@ have gone through several iterations as the feature set and complexity of the block layer have grown. Many online guides to QEMU often reference older and deprecated options, which can lead to confusion. -The recommended modern way to describe disks is to use a combination of +The most explicit way to describe disks is to use a combination of ``-device`` to specify the hardware device and ``-blockdev`` to describe the backend. The device defines what the guest sees and the -backend describes how QEMU handles the data. +backend describes how QEMU handles the data. It is the only guaranteed +stable interface for describing block devices and as such is +recommended for management tools and scripting. + +The ``-drive`` option combines the device and backend into a single +command line option which is a more human friendly. There is however no +interface stability guarantee although some older board models still +need updating to work with the modern blockdev forms. + +Older options like ``-hda`` are essentially macros which expand into +``-drive`` options for various drive interfaces. The original forms +bake in a lot of assumptions from the days when QEMU was emulating a +legacy PC, they are not recommended for modern configurations. ERST @@ -1639,6 +1651,14 @@ SRST the raw disk image you use is not written back. You can however force the write back by pressing C-a s (see the :ref:`disk images` chapter in the System Emulation Users Guide). + + .. warning:: + snapshot is incompatible with ``-blockdev`` (instead use qemu-img + to manually create snapshot images to attach to your blockdev). + If you have mixed ``-blockdev`` and ``-drive`` declarations you + can use the 'snapshot' property on your drive declarations + instead of this global option. + ERST DEF("fsdev", HAS_ARG, QEMU_OPTION_fsdev, From 9448a0fa112e1e7b1cd11e3bf7b2b1711a89a90d Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 20 Apr 2023 13:22:56 +0100 Subject: [PATCH 08/76] docs/about/deprecated.rst: Add "since 7.1" tag to dtb-kaslr-seed deprecation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 5242876f37ca we deprecated the dtb-kaslr-seed property of the virt board, but forgot the "since n.n" tag in the documentation of this in deprecated.rst. This deprecation note first appeared in the 7.1 release, so retrospectively add the correct "since 7.1" annotation to it. Signed-off-by: Peter Maydell Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Alex Bennée Message-id: 20230420122256.1023709-1-peter.maydell@linaro.org (cherry picked from commit ac64ebbecf80f6bc764d120f85fe9fa28fbd9e85) Signed-off-by: Michael Tokarev --- docs/about/deprecated.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst index 1ca9dc33d6..914938fd76 100644 --- a/docs/about/deprecated.rst +++ b/docs/about/deprecated.rst @@ -219,8 +219,8 @@ Use the more generic event ``DEVICE_UNPLUG_GUEST_ERROR`` instead. System emulator machines ------------------------ -Arm ``virt`` machine ``dtb-kaslr-seed`` property -'''''''''''''''''''''''''''''''''''''''''''''''' +Arm ``virt`` machine ``dtb-kaslr-seed`` property (since 7.1) +'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' The ``dtb-kaslr-seed`` property on the ``virt`` board has been deprecated; use the new name ``dtb-randomness`` instead. The new name From f0c5a780292bd405bbce818b63757313cafcf262 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Thu, 20 Apr 2023 10:21:15 +0100 Subject: [PATCH 09/76] target/arm: Initialize debug capabilities only once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_arm_init_debug() used to be called several times on a SMP system as kvm_arch_init_vcpu() calls it. Move the call to kvm_arch_init() to make sure it will be called only once; otherwise it will overwrite pointers to memory allocated with the previous call and leak it. Fixes: e4482ab7e3 ("target-arm: kvm - add support for HW assisted debug") Suggested-by: Philippe Mathieu-Daudé Signed-off-by: Akihiko Odaki Message-id: 20230405153644.25300-1-akihiko.odaki@daynix.com Reviewed-by: Peter Maydell Signed-off-by: Peter Maydell (cherry picked from commit ad5c6ddea327758daa9f0e6edd916be39dce7dca) Signed-off-by: Michael Tokarev --- target/arm/kvm.c | 2 ++ target/arm/kvm64.c | 18 ++++-------------- target/arm/kvm_arm.h | 8 ++++++++ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/target/arm/kvm.c b/target/arm/kvm.c index f022c644d2..84da49332c 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -280,6 +280,8 @@ int kvm_arch_init(MachineState *ms, KVMState *s) } } + kvm_arm_init_debug(s); + return ret; } diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c index 1197253d12..810db33ccb 100644 --- a/target/arm/kvm64.c +++ b/target/arm/kvm64.c @@ -74,24 +74,16 @@ GArray *hw_breakpoints, *hw_watchpoints; #define get_hw_bp(i) (&g_array_index(hw_breakpoints, HWBreakpoint, i)) #define get_hw_wp(i) (&g_array_index(hw_watchpoints, HWWatchpoint, i)) -/** - * kvm_arm_init_debug() - check for guest debug capabilities - * @cs: CPUState - * - * kvm_check_extension returns the number of debug registers we have - * or 0 if we have none. - * - */ -static void kvm_arm_init_debug(CPUState *cs) +void kvm_arm_init_debug(KVMState *s) { - have_guest_debug = kvm_check_extension(cs->kvm_state, + have_guest_debug = kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG); - max_hw_wps = kvm_check_extension(cs->kvm_state, KVM_CAP_GUEST_DEBUG_HW_WPS); + max_hw_wps = kvm_check_extension(s, KVM_CAP_GUEST_DEBUG_HW_WPS); hw_watchpoints = g_array_sized_new(true, true, sizeof(HWWatchpoint), max_hw_wps); - max_hw_bps = kvm_check_extension(cs->kvm_state, KVM_CAP_GUEST_DEBUG_HW_BPS); + max_hw_bps = kvm_check_extension(s, KVM_CAP_GUEST_DEBUG_HW_BPS); hw_breakpoints = g_array_sized_new(true, true, sizeof(HWBreakpoint), max_hw_bps); return; @@ -920,8 +912,6 @@ int kvm_arch_init_vcpu(CPUState *cs) } cpu->mp_affinity = mpidr & ARM64_AFFINITY_MASK; - kvm_arm_init_debug(cs); - /* Check whether user space can specify guest syndrome value */ kvm_arm_init_serror_injection(cs); diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index 99017b635c..330fbe5c72 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -18,6 +18,14 @@ #define KVM_ARM_VGIC_V2 (1 << 0) #define KVM_ARM_VGIC_V3 (1 << 1) +/** + * kvm_arm_init_debug() - initialize guest debug capabilities + * @s: KVMState + * + * Should be called only once before using guest debug capabilities. + */ +void kvm_arm_init_debug(KVMState *s); + /** * kvm_arm_vcpu_init: * @cs: CPUState From 61ef0506390dad5210cac236d834450997b79de3 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Mon, 24 Apr 2023 16:19:19 +0100 Subject: [PATCH 10/76] hw/net/msf2-emac: Don't modify descriptor in-place in emac_store_desc() The msf2-emac ethernet controller has functions emac_load_desc() and emac_store_desc() which read and write the in-memory descriptor blocks and handle conversion between guest and host endianness. As currently written, emac_store_desc() does the endianness conversion in-place; this means that it effectively consumes the input EmacDesc struct, because on a big-endian host the fields will be overwritten with the little-endian versions of their values. Unfortunately, in all the callsites the code continues to access fields in the EmacDesc struct after it has called emac_store_desc() -- specifically, it looks at the d.next field. The effect of this is that on a big-endian host networking doesn't work because the address of the next descriptor is corrupted. We could fix this by making the callsite avoid using the struct; but it's more robust to have emac_store_desc() leave its input alone. (emac_load_desc() also does an in-place conversion, but here this is fine, because the function is supposed to be initializing the struct.) Cc: qemu-stable@nongnu.org Signed-off-by: Peter Maydell Reviewed-by: Thomas Huth Message-id: 20230424151919.1333299-1-peter.maydell@linaro.org (cherry picked from commit d565f58b38424e9a390a7ea33ff7477bab693fda) Signed-off-by: Michael Tokarev --- hw/net/msf2-emac.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/hw/net/msf2-emac.c b/hw/net/msf2-emac.c index 7ccd3e5142..db3a04deb1 100644 --- a/hw/net/msf2-emac.c +++ b/hw/net/msf2-emac.c @@ -118,14 +118,18 @@ static void emac_load_desc(MSF2EmacState *s, EmacDesc *d, hwaddr desc) d->next = le32_to_cpu(d->next); } -static void emac_store_desc(MSF2EmacState *s, EmacDesc *d, hwaddr desc) +static void emac_store_desc(MSF2EmacState *s, const EmacDesc *d, hwaddr desc) { - /* Convert from host endianness into LE. */ - d->pktaddr = cpu_to_le32(d->pktaddr); - d->pktsize = cpu_to_le32(d->pktsize); - d->next = cpu_to_le32(d->next); + EmacDesc outd; + /* + * Convert from host endianness into LE. We use a local struct because + * calling code may still want to look at the fields afterwards. + */ + outd.pktaddr = cpu_to_le32(d->pktaddr); + outd.pktsize = cpu_to_le32(d->pktsize); + outd.next = cpu_to_le32(d->next); - address_space_write(&s->dma_as, desc, MEMTXATTRS_UNSPECIFIED, d, sizeof *d); + address_space_write(&s->dma_as, desc, MEMTXATTRS_UNSPECIFIED, &outd, sizeof outd); } static void msf2_dma_tx(MSF2EmacState *s) From 168f193c5be54fc9a6d725dbb9974c0d2815792a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 24 Apr 2023 16:27:15 +0100 Subject: [PATCH 11/76] hw/arm/boot: Make write_bootloader() public as arm_write_bootloader() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The arm boot.c code includes a utility function write_bootloader() which assists in writing a boot-code fragment into guest memory, including handling endianness and fixing it up with entry point addresses and similar things. This is useful not just for the boot.c code but also in board model code, so rename it to arm_write_bootloader() and make it globally visible. Since we are making it public, make its API a little neater: move the AddressSpace* argument to be next to the hwaddr argument, and allow the fixupcontext array to be const, since we never modify it in this function. Cc: qemu-stable@nongnu.org Signed-off-by: Cédric Le Goater Tested-by: Cédric Le Goater Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Peter Maydell Message-id: 20230424152717.1333930-2-peter.maydell@linaro.org [PMM: Split out from another patch by Cédric, added doc comment] Signed-off-by: Peter Maydell (cherry picked from commit 0fe43f0abf19bbe24df3dbf0613bb47ed55f1482) Signed-off-by: Michael Tokarev --- hw/arm/boot.c | 35 +++++++------------------------ include/hw/arm/boot.h | 49 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 27 deletions(-) diff --git a/hw/arm/boot.c b/hw/arm/boot.c index 54f6a3e0b3..720f22531a 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -60,26 +60,6 @@ AddressSpace *arm_boot_address_space(ARMCPU *cpu, return cpu_get_address_space(cs, asidx); } -typedef enum { - FIXUP_NONE = 0, /* do nothing */ - FIXUP_TERMINATOR, /* end of insns */ - FIXUP_BOARDID, /* overwrite with board ID number */ - FIXUP_BOARD_SETUP, /* overwrite with board specific setup code address */ - FIXUP_ARGPTR_LO, /* overwrite with pointer to kernel args */ - FIXUP_ARGPTR_HI, /* overwrite with pointer to kernel args (high half) */ - FIXUP_ENTRYPOINT_LO, /* overwrite with kernel entry point */ - FIXUP_ENTRYPOINT_HI, /* overwrite with kernel entry point (high half) */ - FIXUP_GIC_CPU_IF, /* overwrite with GIC CPU interface address */ - FIXUP_BOOTREG, /* overwrite with boot register address */ - FIXUP_DSB, /* overwrite with correct DSB insn for cpu */ - FIXUP_MAX, -} FixupType; - -typedef struct ARMInsnFixup { - uint32_t insn; - FixupType fixup; -} ARMInsnFixup; - static const ARMInsnFixup bootloader_aarch64[] = { { 0x580000c0 }, /* ldr x0, arg ; Load the lower 32-bits of DTB */ { 0xaa1f03e1 }, /* mov x1, xzr */ @@ -150,9 +130,10 @@ static const ARMInsnFixup smpboot[] = { { 0, FIXUP_TERMINATOR } }; -static void write_bootloader(const char *name, hwaddr addr, - const ARMInsnFixup *insns, uint32_t *fixupcontext, - AddressSpace *as) +void arm_write_bootloader(const char *name, + AddressSpace *as, hwaddr addr, + const ARMInsnFixup *insns, + const uint32_t *fixupcontext) { /* Fix up the specified bootloader fragment and write it into * guest memory using rom_add_blob_fixed(). fixupcontext is @@ -214,8 +195,8 @@ static void default_write_secondary(ARMCPU *cpu, fixupcontext[FIXUP_DSB] = CP15_DSB_INSN; } - write_bootloader("smpboot", info->smp_loader_start, - smpboot, fixupcontext, as); + arm_write_bootloader("smpboot", as, info->smp_loader_start, + smpboot, fixupcontext); } void arm_write_secure_board_setup_dummy_smc(ARMCPU *cpu, @@ -1186,8 +1167,8 @@ static void arm_setup_direct_kernel_boot(ARMCPU *cpu, fixupcontext[FIXUP_ENTRYPOINT_LO] = entry; fixupcontext[FIXUP_ENTRYPOINT_HI] = entry >> 32; - write_bootloader("bootloader", info->loader_start, - primary_loader, fixupcontext, as); + arm_write_bootloader("bootloader", as, info->loader_start, + primary_loader, fixupcontext); if (info->write_board_setup) { info->write_board_setup(cpu, info); diff --git a/include/hw/arm/boot.h b/include/hw/arm/boot.h index f18cc3064f..80c492d742 100644 --- a/include/hw/arm/boot.h +++ b/include/hw/arm/boot.h @@ -183,4 +183,53 @@ void arm_write_secure_board_setup_dummy_smc(ARMCPU *cpu, const struct arm_boot_info *info, hwaddr mvbar_addr); +typedef enum { + FIXUP_NONE = 0, /* do nothing */ + FIXUP_TERMINATOR, /* end of insns */ + FIXUP_BOARDID, /* overwrite with board ID number */ + FIXUP_BOARD_SETUP, /* overwrite with board specific setup code address */ + FIXUP_ARGPTR_LO, /* overwrite with pointer to kernel args */ + FIXUP_ARGPTR_HI, /* overwrite with pointer to kernel args (high half) */ + FIXUP_ENTRYPOINT_LO, /* overwrite with kernel entry point */ + FIXUP_ENTRYPOINT_HI, /* overwrite with kernel entry point (high half) */ + FIXUP_GIC_CPU_IF, /* overwrite with GIC CPU interface address */ + FIXUP_BOOTREG, /* overwrite with boot register address */ + FIXUP_DSB, /* overwrite with correct DSB insn for cpu */ + FIXUP_MAX, +} FixupType; + +typedef struct ARMInsnFixup { + uint32_t insn; + FixupType fixup; +} ARMInsnFixup; + +/** + * arm_write_bootloader - write a bootloader to guest memory + * @name: name of the bootloader blob + * @as: AddressSpace to write the bootloader + * @addr: guest address to write it + * @insns: the blob to be loaded + * @fixupcontext: context to be used for any fixups in @insns + * + * Write a bootloader to guest memory at address @addr in the address + * space @as. @name is the name to use for the resulting ROM blob, so + * it should be unique in the system and reasonably identifiable for debugging. + * + * @insns must be an array of ARMInsnFixup structs, each of which has + * one 32-bit value to be written to the guest memory, and a fixup to be + * applied to the value. FIXUP_NONE (do nothing) is value 0, so effectively + * the fixup is optional when writing a struct initializer. + * The final entry in the array must be { 0, FIXUP_TERMINATOR }. + * + * All other supported fixup types have the semantics "ignore insn + * and instead use the value from the array element @fixupcontext[fixup]". + * The caller should therefore provide @fixupcontext as an array of + * size FIXUP_MAX whose elements have been initialized for at least + * the entries that @insns refers to. + */ +void arm_write_bootloader(const char *name, + AddressSpace *as, hwaddr addr, + const ARMInsnFixup *insns, + const uint32_t *fixupcontext); + #endif /* HW_ARM_BOOT_H */ From 5477a21350355d557c7dc729782f90c6c71ba9a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 24 Apr 2023 16:27:16 +0100 Subject: [PATCH 12/76] hw/arm/aspeed: Use arm_write_bootloader() to write the bootloader MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When writing the secondary-CPU stub boot loader code to the guest, use arm_write_bootloader() instead of directly calling rom_add_blob_fixed(). This fixes a bug on big-endian hosts, because arm_write_bootloader() will correctly byte-swap the host-byte-order array values into the guest-byte-order to write into the guest memory. Cc: qemu-stable@nongnu.org Signed-off-by: Cédric Le Goater Tested-by: Cédric Le Goater Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Peter Maydell Message-id: 20230424152717.1333930-3-peter.maydell@linaro.org [PMM: Moved the "make arm_write_bootloader() function public" part to its own patch; updated commit message to note that this fixes an actual bug; adjust to the API changes noted in previous commit] Signed-off-by: Peter Maydell (cherry picked from commit 902bba549fc386b4b9805320ed1a2e5b68478bdd) Signed-off-by: Michael Tokarev --- hw/arm/aspeed.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c index c1f2b9cfca..0b29028fe1 100644 --- a/hw/arm/aspeed.c +++ b/hw/arm/aspeed.c @@ -200,33 +200,35 @@ struct AspeedMachineState { static void aspeed_write_smpboot(ARMCPU *cpu, const struct arm_boot_info *info) { - static const uint32_t poll_mailbox_ready[] = { + AddressSpace *as = arm_boot_address_space(cpu, info); + static const ARMInsnFixup poll_mailbox_ready[] = { /* * r2 = per-cpu go sign value * r1 = AST_SMP_MBOX_FIELD_ENTRY * r0 = AST_SMP_MBOX_FIELD_GOSIGN */ - 0xee100fb0, /* mrc p15, 0, r0, c0, c0, 5 */ - 0xe21000ff, /* ands r0, r0, #255 */ - 0xe59f201c, /* ldr r2, [pc, #28] */ - 0xe1822000, /* orr r2, r2, r0 */ + { 0xee100fb0 }, /* mrc p15, 0, r0, c0, c0, 5 */ + { 0xe21000ff }, /* ands r0, r0, #255 */ + { 0xe59f201c }, /* ldr r2, [pc, #28] */ + { 0xe1822000 }, /* orr r2, r2, r0 */ - 0xe59f1018, /* ldr r1, [pc, #24] */ - 0xe59f0018, /* ldr r0, [pc, #24] */ + { 0xe59f1018 }, /* ldr r1, [pc, #24] */ + { 0xe59f0018 }, /* ldr r0, [pc, #24] */ - 0xe320f002, /* wfe */ - 0xe5904000, /* ldr r4, [r0] */ - 0xe1520004, /* cmp r2, r4 */ - 0x1afffffb, /* bne */ - 0xe591f000, /* ldr pc, [r1] */ - AST_SMP_MBOX_GOSIGN, - AST_SMP_MBOX_FIELD_ENTRY, - AST_SMP_MBOX_FIELD_GOSIGN, + { 0xe320f002 }, /* wfe */ + { 0xe5904000 }, /* ldr r4, [r0] */ + { 0xe1520004 }, /* cmp r2, r4 */ + { 0x1afffffb }, /* bne */ + { 0xe591f000 }, /* ldr pc, [r1] */ + { AST_SMP_MBOX_GOSIGN }, + { AST_SMP_MBOX_FIELD_ENTRY }, + { AST_SMP_MBOX_FIELD_GOSIGN }, + { 0, FIXUP_TERMINATOR } }; + static const uint32_t fixupcontext[FIXUP_MAX] = { 0 }; - rom_add_blob_fixed("aspeed.smpboot", poll_mailbox_ready, - sizeof(poll_mailbox_ready), - info->smp_loader_start); + arm_write_bootloader("aspeed.smpboot", as, info->smp_loader_start, + poll_mailbox_ready, fixupcontext); } static void aspeed_reset_secondary(ARMCPU *cpu, From 975f12aa528d6cab5cc41efebaf05d7eb7296d94 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Mon, 24 Apr 2023 16:27:17 +0100 Subject: [PATCH 13/76] hw/arm/raspi: Use arm_write_bootloader() to write boot code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When writing the secondary-CPU stub boot loader code to the guest, use arm_write_bootloader() instead of directly calling rom_add_blob_fixed(). This fixes a bug on big-endian hosts, because arm_write_bootloader() will correctly byte-swap the host-byte-order array values into the guest-byte-order to write into the guest memory. Cc: qemu-stable@nongnu.org Signed-off-by: Peter Maydell Tested-by: Cédric Le Goater Reviewed-by: Philippe Mathieu-Daudé Message-id: 20230424152717.1333930-4-peter.maydell@linaro.org (cherry picked from commit 0acbdb4c4ab6b0a09f159bae4899b0737cf64242) Signed-off-by: Michael Tokarev --- hw/arm/raspi.c | 64 +++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/hw/arm/raspi.c b/hw/arm/raspi.c index 92d068d1f9..a7d287b1a8 100644 --- a/hw/arm/raspi.c +++ b/hw/arm/raspi.c @@ -16,6 +16,7 @@ #include "qemu/units.h" #include "qemu/cutils.h" #include "qapi/error.h" +#include "hw/arm/boot.h" #include "hw/arm/bcm2836.h" #include "hw/registerfields.h" #include "qemu/error-report.h" @@ -124,20 +125,22 @@ static const char *board_type(uint32_t board_rev) static void write_smpboot(ARMCPU *cpu, const struct arm_boot_info *info) { - static const uint32_t smpboot[] = { - 0xe1a0e00f, /* mov lr, pc */ - 0xe3a0fe00 + (BOARDSETUP_ADDR >> 4), /* mov pc, BOARDSETUP_ADDR */ - 0xee100fb0, /* mrc p15, 0, r0, c0, c0, 5;get core ID */ - 0xe7e10050, /* ubfx r0, r0, #0, #2 ;extract LSB */ - 0xe59f5014, /* ldr r5, =0x400000CC ;load mbox base */ - 0xe320f001, /* 1: yield */ - 0xe7953200, /* ldr r3, [r5, r0, lsl #4] ;read mbox for our core*/ - 0xe3530000, /* cmp r3, #0 ;spin while zero */ - 0x0afffffb, /* beq 1b */ - 0xe7853200, /* str r3, [r5, r0, lsl #4] ;clear mbox */ - 0xe12fff13, /* bx r3 ;jump to target */ - 0x400000cc, /* (constant: mailbox 3 read/clear base) */ + static const ARMInsnFixup smpboot[] = { + { 0xe1a0e00f }, /* mov lr, pc */ + { 0xe3a0fe00 + (BOARDSETUP_ADDR >> 4) }, /* mov pc, BOARDSETUP_ADDR */ + { 0xee100fb0 }, /* mrc p15, 0, r0, c0, c0, 5;get core ID */ + { 0xe7e10050 }, /* ubfx r0, r0, #0, #2 ;extract LSB */ + { 0xe59f5014 }, /* ldr r5, =0x400000CC ;load mbox base */ + { 0xe320f001 }, /* 1: yield */ + { 0xe7953200 }, /* ldr r3, [r5, r0, lsl #4] ;read mbox for our core */ + { 0xe3530000 }, /* cmp r3, #0 ;spin while zero */ + { 0x0afffffb }, /* beq 1b */ + { 0xe7853200 }, /* str r3, [r5, r0, lsl #4] ;clear mbox */ + { 0xe12fff13 }, /* bx r3 ;jump to target */ + { 0x400000cc }, /* (constant: mailbox 3 read/clear base) */ + { 0, FIXUP_TERMINATOR } }; + static const uint32_t fixupcontext[FIXUP_MAX] = { 0 }; /* check that we don't overrun board setup vectors */ QEMU_BUILD_BUG_ON(SMPBOOT_ADDR + sizeof(smpboot) > MVBAR_ADDR); @@ -145,9 +148,8 @@ static void write_smpboot(ARMCPU *cpu, const struct arm_boot_info *info) QEMU_BUILD_BUG_ON((BOARDSETUP_ADDR & 0xf) != 0 || (BOARDSETUP_ADDR >> 4) >= 0x100); - rom_add_blob_fixed_as("raspi_smpboot", smpboot, sizeof(smpboot), - info->smp_loader_start, - arm_boot_address_space(cpu, info)); + arm_write_bootloader("raspi_smpboot", arm_boot_address_space(cpu, info), + info->smp_loader_start, smpboot, fixupcontext); } static void write_smpboot64(ARMCPU *cpu, const struct arm_boot_info *info) @@ -161,26 +163,28 @@ static void write_smpboot64(ARMCPU *cpu, const struct arm_boot_info *info) * the primary CPU goes into the kernel. We put these variables inside * a rom blob, so that the reset for ROM contents zeroes them for us. */ - static const uint32_t smpboot[] = { - 0xd2801b05, /* mov x5, 0xd8 */ - 0xd53800a6, /* mrs x6, mpidr_el1 */ - 0x924004c6, /* and x6, x6, #0x3 */ - 0xd503205f, /* spin: wfe */ - 0xf86678a4, /* ldr x4, [x5,x6,lsl #3] */ - 0xb4ffffc4, /* cbz x4, spin */ - 0xd2800000, /* mov x0, #0x0 */ - 0xd2800001, /* mov x1, #0x0 */ - 0xd2800002, /* mov x2, #0x0 */ - 0xd2800003, /* mov x3, #0x0 */ - 0xd61f0080, /* br x4 */ + static const ARMInsnFixup smpboot[] = { + { 0xd2801b05 }, /* mov x5, 0xd8 */ + { 0xd53800a6 }, /* mrs x6, mpidr_el1 */ + { 0x924004c6 }, /* and x6, x6, #0x3 */ + { 0xd503205f }, /* spin: wfe */ + { 0xf86678a4 }, /* ldr x4, [x5,x6,lsl #3] */ + { 0xb4ffffc4 }, /* cbz x4, spin */ + { 0xd2800000 }, /* mov x0, #0x0 */ + { 0xd2800001 }, /* mov x1, #0x0 */ + { 0xd2800002 }, /* mov x2, #0x0 */ + { 0xd2800003 }, /* mov x3, #0x0 */ + { 0xd61f0080 }, /* br x4 */ + { 0, FIXUP_TERMINATOR } }; + static const uint32_t fixupcontext[FIXUP_MAX] = { 0 }; static const uint64_t spintables[] = { 0, 0, 0, 0 }; - rom_add_blob_fixed_as("raspi_smpboot", smpboot, sizeof(smpboot), - info->smp_loader_start, as); + arm_write_bootloader("raspi_smpboot", as, info->smp_loader_start, + smpboot, fixupcontext); rom_add_blob_fixed_as("raspi_spintables", spintables, sizeof(spintables), SPINTABLE_ADDR, as); } From af08c70ef5204fedb2b974fbecaf65e1b6cc0a2f Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Mon, 24 Apr 2023 16:28:33 +0100 Subject: [PATCH 14/76] hw/intc/allwinner-a10-pic: Don't use set_bit()/clear_bit() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Allwinner PIC model uses set_bit() and clear_bit() to update the values in its irq_pending[] array when an interrupt arrives. However it is using these functions wrongly: they work on an array of type 'long', and it is passing an array of type 'uint32_t'. Because the code manually figures out the right array element, this works on little-endian hosts and on 32-bit big-endian hosts, where bits 0..31 in a 'long' are in the same place as they are in a 'uint32_t'. However it breaks on 64-bit big-endian hosts. Remove the use of set_bit() and clear_bit() in favour of using deposit32() on the array element. This fixes a bug where on big-endian 64-bit hosts the guest kernel would hang early on in bootup. Cc: qemu-stable@nongnu.org Signed-off-by: Peter Maydell Reviewed-by: Thomas Huth Reviewed-by: Philippe Mathieu-Daudé Message-id: 20230424152833.1334136-1-peter.maydell@linaro.org (cherry picked from commit 2c5fa0778c3b4307f9f3af7f27886c46d129c62f) Signed-off-by: Michael Tokarev --- hw/intc/allwinner-a10-pic.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/hw/intc/allwinner-a10-pic.c b/hw/intc/allwinner-a10-pic.c index 8cca124807..4875e68ba6 100644 --- a/hw/intc/allwinner-a10-pic.c +++ b/hw/intc/allwinner-a10-pic.c @@ -49,12 +49,9 @@ static void aw_a10_pic_update(AwA10PICState *s) static void aw_a10_pic_set_irq(void *opaque, int irq, int level) { AwA10PICState *s = opaque; + uint32_t *pending_reg = &s->irq_pending[irq / 32]; - if (level) { - set_bit(irq % 32, (void *)&s->irq_pending[irq / 32]); - } else { - clear_bit(irq % 32, (void *)&s->irq_pending[irq / 32]); - } + *pending_reg = deposit32(*pending_reg, irq % 32, 1, level); aw_a10_pic_update(s); } From 6944823a6f56ee4e4db557ce77a6d44d5d78ffaf Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Mon, 24 Apr 2023 16:39:08 +0100 Subject: [PATCH 15/76] target/arm: Define and use new load_cpu_field_low32() In several places in the 32-bit Arm translate.c, we try to use load_cpu_field() to load from a CPUARMState field into a TCGv_i32 where the field is actually 64-bit. This works on little-endian hosts, but gives the wrong half of the register on big-endian. Add a new load_cpu_field_low32() which loads the low 32 bits of a 64-bit field into a TCGv_i32. The new macro includes a compile-time check against accidentally using it on a field of the wrong size. Use it to fix the two places in the code where we were using load_cpu_field() on a 64-bit field. This fixes a bug where on big-endian hosts the guest would crash after executing an ERET instruction, and a more corner case one where some UNDEFs for attempted accesses to MSR banked registers from Secure EL1 might go to the wrong EL. Cc: qemu-stable@nongnu.org Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20230424153909.1419369-2-peter.maydell@linaro.org (cherry picked from commit 7f3a3d3dc433dc06c0adb480729af80f9c8e3739) Signed-off-by: Michael Tokarev --- target/arm/tcg/translate.c | 4 ++-- target/arm/translate-a32.h | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c index 3c8401e908..7468476724 100644 --- a/target/arm/tcg/translate.c +++ b/target/arm/tcg/translate.c @@ -2816,7 +2816,7 @@ static bool msr_banked_access_decode(DisasContext *s, int r, int sysm, int rn, if (arm_dc_feature(s, ARM_FEATURE_AARCH64) && dc_isar_feature(aa64_sel2, s)) { /* Target EL is EL<3 minus SCR_EL3.EEL2> */ - tcg_el = load_cpu_field(cp15.scr_el3); + tcg_el = load_cpu_field_low32(cp15.scr_el3); tcg_gen_sextract_i32(tcg_el, tcg_el, ctz32(SCR_EEL2), 1); tcg_gen_addi_i32(tcg_el, tcg_el, 3); } else { @@ -6396,7 +6396,7 @@ static bool trans_ERET(DisasContext *s, arg_ERET *a) } if (s->current_el == 2) { /* ERET from Hyp uses ELR_Hyp, not LR */ - tmp = load_cpu_field(elr_el[2]); + tmp = load_cpu_field_low32(elr_el[2]); } else { tmp = load_reg(s, 14); } diff --git a/target/arm/translate-a32.h b/target/arm/translate-a32.h index 5339c22f1e..99eea85fa8 100644 --- a/target/arm/translate-a32.h +++ b/target/arm/translate-a32.h @@ -61,6 +61,13 @@ static inline TCGv_i32 load_cpu_offset(int offset) #define load_cpu_field(name) load_cpu_offset(offsetof(CPUARMState, name)) +/* Load from the low half of a 64-bit field to a TCGv_i32 */ +#define load_cpu_field_low32(name) \ + ({ \ + QEMU_BUILD_BUG_ON(sizeof_field(CPUARMState, name) != 8); \ + load_cpu_offset(offsetoflow32(CPUARMState, name)); \ + }) + void store_cpu_offset(TCGv_i32 var, int offset, int size); #define store_cpu_field(var, name) \ From 2daa9e4d7e5797dfe191f5e1edc52fe3c1d9ca44 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Mon, 24 Apr 2023 17:50:52 +0100 Subject: [PATCH 16/76] hw/sd/allwinner-sdhost: Correctly byteswap descriptor fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In allwinner_sdhost_process_desc() we just read directly from guest memory into a host TransferDescriptor struct and back. This only works on little-endian hosts. Abstract the reading and writing of descriptors into functions that handle the byte-swapping so that TransferDescriptor structs as seen by the rest of the code are always in host-order. This fixes a failure of one of the avocado tests on s390. Cc: qemu-stable@nongnu.org Signed-off-by: Peter Maydell Reviewed-by: Thomas Huth Reviewed-by: Alex Bennée Reviewed-by: Philippe Mathieu-Daudé Message-id: 20230424165053.1428857-2-peter.maydell@linaro.org (cherry picked from commit 3e20d90824c262de6887aa1bc52af94db69e4310) Signed-off-by: Michael Tokarev --- hw/sd/allwinner-sdhost.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/hw/sd/allwinner-sdhost.c b/hw/sd/allwinner-sdhost.c index 51e5e90830..92a0f42708 100644 --- a/hw/sd/allwinner-sdhost.c +++ b/hw/sd/allwinner-sdhost.c @@ -302,6 +302,30 @@ static void allwinner_sdhost_auto_stop(AwSdHostState *s) } } +static void read_descriptor(AwSdHostState *s, hwaddr desc_addr, + TransferDescriptor *desc) +{ + uint32_t desc_words[4]; + dma_memory_read(&s->dma_as, desc_addr, &desc_words, sizeof(desc_words), + MEMTXATTRS_UNSPECIFIED); + desc->status = le32_to_cpu(desc_words[0]); + desc->size = le32_to_cpu(desc_words[1]); + desc->addr = le32_to_cpu(desc_words[2]); + desc->next = le32_to_cpu(desc_words[3]); +} + +static void write_descriptor(AwSdHostState *s, hwaddr desc_addr, + const TransferDescriptor *desc) +{ + uint32_t desc_words[4]; + desc_words[0] = cpu_to_le32(desc->status); + desc_words[1] = cpu_to_le32(desc->size); + desc_words[2] = cpu_to_le32(desc->addr); + desc_words[3] = cpu_to_le32(desc->next); + dma_memory_write(&s->dma_as, desc_addr, &desc_words, sizeof(desc_words), + MEMTXATTRS_UNSPECIFIED); +} + static uint32_t allwinner_sdhost_process_desc(AwSdHostState *s, hwaddr desc_addr, TransferDescriptor *desc, @@ -312,9 +336,7 @@ static uint32_t allwinner_sdhost_process_desc(AwSdHostState *s, uint32_t num_bytes = max_bytes; uint8_t buf[1024]; - /* Read descriptor */ - dma_memory_read(&s->dma_as, desc_addr, desc, sizeof(*desc), - MEMTXATTRS_UNSPECIFIED); + read_descriptor(s, desc_addr, desc); if (desc->size == 0) { desc->size = klass->max_desc_size; } else if (desc->size > klass->max_desc_size) { @@ -356,8 +378,7 @@ static uint32_t allwinner_sdhost_process_desc(AwSdHostState *s, /* Clear hold flag and flush descriptor */ desc->status &= ~DESC_STATUS_HOLD; - dma_memory_write(&s->dma_as, desc_addr, desc, sizeof(*desc), - MEMTXATTRS_UNSPECIFIED); + write_descriptor(s, desc_addr, desc); return num_done; } From f6227dd60d75f8a709e70263478677ae4f3f1172 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Mon, 24 Apr 2023 17:50:53 +0100 Subject: [PATCH 17/76] hw/net/allwinner-sun8i-emac: Correctly byteswap descriptor fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In allwinner-sun8i-emac we just read directly from guest memory into a host FrameDescriptor struct and back. This only works on little-endian hosts. Reading and writing of descriptors is already abstracted into functions; make those functions also handle the byte-swapping so that TransferDescriptor structs as seen by the rest of the code are always in host-order, and fix two places that were doing ad-hoc descriptor reading without using the functions. Cc: qemu-stable@nongnu.org Signed-off-by: Peter Maydell Reviewed-by: Thomas Huth Reviewed-by: Alex Bennée Reviewed-by: Philippe Mathieu-Daudé Message-id: 20230424165053.1428857-3-peter.maydell@linaro.org (cherry picked from commit a4ae17e5ec512862bf73e40dfbb1e7db71f2c1e7) Signed-off-by: Michael Tokarev --- hw/net/allwinner-sun8i-emac.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/hw/net/allwinner-sun8i-emac.c b/hw/net/allwinner-sun8i-emac.c index b861d8ff35..fac4405f45 100644 --- a/hw/net/allwinner-sun8i-emac.c +++ b/hw/net/allwinner-sun8i-emac.c @@ -350,8 +350,13 @@ static void allwinner_sun8i_emac_get_desc(AwSun8iEmacState *s, FrameDescriptor *desc, uint32_t phys_addr) { - dma_memory_read(&s->dma_as, phys_addr, desc, sizeof(*desc), + uint32_t desc_words[4]; + dma_memory_read(&s->dma_as, phys_addr, &desc_words, sizeof(desc_words), MEMTXATTRS_UNSPECIFIED); + desc->status = le32_to_cpu(desc_words[0]); + desc->status2 = le32_to_cpu(desc_words[1]); + desc->addr = le32_to_cpu(desc_words[2]); + desc->next = le32_to_cpu(desc_words[3]); } static uint32_t allwinner_sun8i_emac_next_desc(AwSun8iEmacState *s, @@ -400,10 +405,15 @@ static uint32_t allwinner_sun8i_emac_tx_desc(AwSun8iEmacState *s, } static void allwinner_sun8i_emac_flush_desc(AwSun8iEmacState *s, - FrameDescriptor *desc, + const FrameDescriptor *desc, uint32_t phys_addr) { - dma_memory_write(&s->dma_as, phys_addr, desc, sizeof(*desc), + uint32_t desc_words[4]; + desc_words[0] = cpu_to_le32(desc->status); + desc_words[1] = cpu_to_le32(desc->status2); + desc_words[2] = cpu_to_le32(desc->addr); + desc_words[3] = cpu_to_le32(desc->next); + dma_memory_write(&s->dma_as, phys_addr, &desc_words, sizeof(desc_words), MEMTXATTRS_UNSPECIFIED); } @@ -638,8 +648,7 @@ static uint64_t allwinner_sun8i_emac_read(void *opaque, hwaddr offset, break; case REG_TX_CUR_BUF: /* Transmit Current Buffer */ if (s->tx_desc_curr != 0) { - dma_memory_read(&s->dma_as, s->tx_desc_curr, &desc, sizeof(desc), - MEMTXATTRS_UNSPECIFIED); + allwinner_sun8i_emac_get_desc(s, &desc, s->tx_desc_curr); value = desc.addr; } else { value = 0; @@ -652,8 +661,7 @@ static uint64_t allwinner_sun8i_emac_read(void *opaque, hwaddr offset, break; case REG_RX_CUR_BUF: /* Receive Current Buffer */ if (s->rx_desc_curr != 0) { - dma_memory_read(&s->dma_as, s->rx_desc_curr, &desc, sizeof(desc), - MEMTXATTRS_UNSPECIFIED); + allwinner_sun8i_emac_get_desc(s, &desc, s->rx_desc_curr); value = desc.addr; } else { value = 0; From a458252c16b26a783aef22767c2a35872b72885d Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 2 May 2023 17:11:19 -0400 Subject: [PATCH 18/76] block/export: call blk_set_dev_ops(blk, NULL, NULL) Most export types install BlockDeviceOps pointers. It is easy to forget to remove them because that happens automatically via the "drive" qdev property in hw/ but not block/export/. Put blk_set_dev_ops(blk, NULL, NULL) calls in the core export.c code so the export types don't need to remember. This fixes the nbd and vhost-user-blk export types. Fixes: fd6afc501a01 ("nbd/server: Use drained block ops to quiesce the server") Fixes: ca858a5fe94c ("vhost-user-blk-server: notify client about disk resize") Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Message-Id: <20230502211119.720647-1-stefanha@redhat.com> Signed-off-by: Eric Blake (cherry picked from commit de79b52604e43fdeba6cee4f5af600b62169f2d2) Signed-off-by: Michael Tokarev --- block/export/export.c | 2 ++ block/export/vduse-blk.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/block/export/export.c b/block/export/export.c index 28a91c9c42..cdef298902 100644 --- a/block/export/export.c +++ b/block/export/export.c @@ -192,6 +192,7 @@ BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp) return exp; fail: + blk_set_dev_ops(exp->blk, NULL, NULL); blk_unref(blk); aio_context_release(ctx); if (exp) { @@ -219,6 +220,7 @@ static void blk_exp_delete_bh(void *opaque) assert(exp->refcount == 0); QLIST_REMOVE(exp, next); exp->drv->delete(exp); + blk_set_dev_ops(exp->blk, NULL, NULL); blk_unref(exp->blk); qapi_event_send_block_export_deleted(exp->id); g_free(exp->id); diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c index f7ae44e3ce..b53ef39da0 100644 --- a/block/export/vduse-blk.c +++ b/block/export/vduse-blk.c @@ -346,7 +346,6 @@ static void vduse_blk_exp_delete(BlockExport *exp) blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach, vblk_exp); - blk_set_dev_ops(exp->blk, NULL, NULL); ret = vduse_dev_destroy(vblk_exp->dev); if (ret != -EBUSY) { unlink(vblk_exp->recon_file); From 4dc5df865c482c6e8894964c7f300fa556c3b78e Mon Sep 17 00:00:00 2001 From: Shivaprasad G Bhat Date: Tue, 2 May 2023 20:55:30 +0530 Subject: [PATCH 19/76] softfloat: Fix the incorrect computation in float32_exp2 The float32_exp2 function is computing wrong exponent of 2. For example, with the following set of values {0.1, 2.0, 2.0, -1.0}, the expected output would be {1.071773, 4.000000, 4.000000, 0.500000}. Instead, the function is computing {1.119102, 3.382044, 3.382044, -0.191022} Looking at the code, the float32_exp2() attempts to do this 2 3 4 5 n x x x x x x x e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 1! 2! 3! 4! 5! n! But because of the typo it ends up doing x x x x x x x e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 1! 2! 3! 4! 5! n! This is because instead of the xnp which holds the numerator, parts_muladd is using the xp which is just 'x'. Commit '572c4d862ff2' refactored this function, and mistakenly used xp instead of xnp. Cc: qemu-stable@nongnu.org Fixes: 572c4d862ff2 "softfloat: Convert float32_exp2 to FloatParts" Partially-Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1623 Reported-By: Luca Barbato (https://gitlab.com/lu-zero) Signed-off-by: Shivaprasad G Bhat Signed-off-by: Vaibhav Jain Message-Id: <168304110865.537992.13059030916325018670.stgit@localhost.localdomain> Reviewed-by: Richard Henderson Signed-off-by: Richard Henderson (cherry picked from commit 1098cc3fcf952763fc9fd72c1c8fda30a18cc8ea) Signed-off-by: Michael Tokarev --- fpu/softfloat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fpu/softfloat.c b/fpu/softfloat.c index c7454c3eb1..108f9cb224 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -5135,7 +5135,7 @@ float32 float32_exp2(float32 a, float_status *status) float64_unpack_canonical(&rp, float64_one, status); for (i = 0 ; i < 15 ; i++) { float64_unpack_canonical(&tp, float32_exp2_coefficients[i], status); - rp = *parts_muladd(&tp, &xp, &rp, 0, status); + rp = *parts_muladd(&tp, &xnp, &rp, 0, status); xnp = *parts_mul(&xnp, &xp, status); } From 8322e5300fb5fa7fbbe27b1882263159f24c5ae9 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 May 2023 10:20:46 +0200 Subject: [PATCH 20/76] meson: leave unnecessary modules out of the build meson.build files choose whether to build modules based on foo.found() expressions. If a feature is enabled (e.g. --enable-gtk), these expressions are true even if the code is not used by any emulator, and this results in an unexpected difference between modular and non-modular builds. For non-modular builds, the files are not included in any binary, and therefore the source files are never processed. For modular builds, however, all .so files are unconditionally built by default, and therefore a normal "make" tries to build them. However, the corresponding trace-*.h files are absent due to this conditional: if have_system trace_events_subdirs += [ ... 'ui', ... ] endif which was added to avoid wasting time running tracetool on unused trace-events files. This causes a compilation failure; fix it by skipping module builds entirely if (depending on the module directory) have_block or have_system are false. Reported-by: Michael Tokarev Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini (cherry picked from commit ef709860ea12ec59c4cd7373bd2fd7a4e50143ee) Signed-off-by: Michael Tokarev --- meson.build | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/meson.build b/meson.build index c44d05a13f..c7e486e087 100644 --- a/meson.build +++ b/meson.build @@ -3213,6 +3213,10 @@ modinfo_files = [] block_mods = [] softmmu_mods = [] foreach d, list : modules + if not (d == 'block' ? have_block : have_system) + continue + endif + foreach m, module_ss : list if enable_modules and targetos != 'windows' module_ss = module_ss.apply(config_all, strict: false) From 2197a94cb461c3bfff9d183ae79d4c99964163c4 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 3 May 2023 16:01:42 +0200 Subject: [PATCH 21/76] block: Fix use after free in blockdev_mark_auto_del() job_cancel_locked() drops the job list lock temporarily and it may call aio_poll(). We must assume that the list has changed after this call. Also, with unlucky timing, it can end up freeing the job during job_completed_txn_abort_locked(), making the job pointer invalid, too. For both reasons, we can't just continue at block_job_next_locked(job). Instead, start at the head of the list again after job_cancel_locked() and skip those jobs that we already cancelled (or that are completing anyway). Cc: qemu-stable@nongnu.org Signed-off-by: Kevin Wolf Message-Id: <20230503140142.474404-1-kwolf@redhat.com> Reviewed-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf (cherry picked from commit e2626874a32602d4e52971c786ef5ffb4430629d) Signed-off-by: Michael Tokarev --- blockdev.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/blockdev.c b/blockdev.c index d7b5c18f0a..2c1752a403 100644 --- a/blockdev.c +++ b/blockdev.c @@ -153,12 +153,22 @@ void blockdev_mark_auto_del(BlockBackend *blk) JOB_LOCK_GUARD(); - for (job = block_job_next_locked(NULL); job; - job = block_job_next_locked(job)) { - if (block_job_has_bdrv(job, blk_bs(blk))) { + do { + job = block_job_next_locked(NULL); + while (job && (job->job.cancelled || + job->job.deferred_to_main_loop || + !block_job_has_bdrv(job, blk_bs(blk)))) + { + job = block_job_next_locked(job); + } + if (job) { + /* + * This drops the job lock temporarily and polls, so we need to + * restart processing the list from the start after this. + */ job_cancel_locked(&job->job, false); } - } + } while (job); dinfo->auto_del = 1; } From 38a598aee3c7d716524f1f7c75e9299a879fcc65 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 4 May 2023 13:57:32 +0200 Subject: [PATCH 22/76] block: Consistently call bdrv_activate() outside coroutine Migration code can call bdrv_activate() in coroutine context, whereas other callers call it outside of coroutines. As it calls other code that is not supposed to run in coroutines, standardise on running outside of coroutines. This adds a no_co_wrapper to switch to the main loop before calling bdrv_activate(). Cc: qemu-stable@nongnu.org Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-Id: <20230504115750.54437-3-kwolf@redhat.com> Signed-off-by: Kevin Wolf (cherry picked from commit da4afaff074e56b0fa0d25abf865784148018895) Signed-off-by: Michael Tokarev --- block/block-backend.c | 10 +++++++++- include/block/block-global-state.h | 6 +++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index 55efc735b4..d59f759daf 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -2018,7 +2018,15 @@ void blk_activate(BlockBackend *blk, Error **errp) return; } - bdrv_activate(bs, errp); + /* + * Migration code can call this function in coroutine context, so leave + * coroutine context if necessary. + */ + if (qemu_in_coroutine()) { + bdrv_co_activate(bs, errp); + } else { + bdrv_activate(bs, errp); + } } bool coroutine_fn blk_co_is_inserted(BlockBackend *blk) diff --git a/include/block/block-global-state.h b/include/block/block-global-state.h index 399200a9a3..2c312cc774 100644 --- a/include/block/block-global-state.h +++ b/include/block/block-global-state.h @@ -166,7 +166,11 @@ int bdrv_amend_options(BlockDriverState *bs_new, QemuOpts *opts, BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs, const char *node_name, Error **errp); -int bdrv_activate(BlockDriverState *bs, Error **errp); +int no_coroutine_fn bdrv_activate(BlockDriverState *bs, Error **errp); + +int coroutine_fn no_co_wrapper +bdrv_co_activate(BlockDriverState *bs, Error **errp); + void bdrv_activate_all(Error **errp); int bdrv_inactivate_all(void); From e0deae4f49c4b4f0e3f74d716d4bbe0f81390e05 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 4 May 2023 13:57:33 +0200 Subject: [PATCH 23/76] block: bdrv/blk_co_unref() for calls in coroutine context These functions must not be called in coroutine context, because they need write access to the graph. Cc: qemu-stable@nongnu.org Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-Id: <20230504115750.54437-4-kwolf@redhat.com> Signed-off-by: Kevin Wolf (cherry picked from commit b2ab5f545fa1eaaf2955dd617bee19a8b3279786) Signed-off-by: Michael Tokarev --- block.c | 2 +- block/crypto.c | 6 +++--- block/parallels.c | 6 +++--- block/qcow.c | 6 +++--- block/qcow2.c | 14 +++++++------- block/qed.c | 6 +++--- block/vdi.c | 6 +++--- block/vhdx.c | 6 +++--- block/vmdk.c | 18 +++++++++--------- block/vpc.c | 6 +++--- include/block/block-global-state.h | 3 ++- include/sysemu/block-backend-global-state.h | 5 ++++- 12 files changed, 44 insertions(+), 40 deletions(-) diff --git a/block.c b/block.c index d79a52ca74..a48112f945 100644 --- a/block.c +++ b/block.c @@ -680,7 +680,7 @@ int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv, ret = 0; out: - blk_unref(blk); + blk_co_unref(blk); return ret; } diff --git a/block/crypto.c b/block/crypto.c index ca67289187..8fd3ad0054 100644 --- a/block/crypto.c +++ b/block/crypto.c @@ -355,7 +355,7 @@ block_crypto_co_create_generic(BlockDriverState *bs, int64_t size, ret = 0; cleanup: qcrypto_block_free(crypto); - blk_unref(blk); + blk_co_unref(blk); return ret; } @@ -661,7 +661,7 @@ block_crypto_co_create_luks(BlockdevCreateOptions *create_options, Error **errp) ret = 0; fail: - bdrv_unref(bs); + bdrv_co_unref(bs); return ret; } @@ -730,7 +730,7 @@ fail: bdrv_co_delete_file_noerr(bs); } - bdrv_unref(bs); + bdrv_co_unref(bs); qapi_free_QCryptoBlockCreateOptions(create_opts); qobject_unref(cryptoopts); return ret; diff --git a/block/parallels.c b/block/parallels.c index 013684801a..b49c35929e 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -613,8 +613,8 @@ static int coroutine_fn parallels_co_create(BlockdevCreateOptions* opts, ret = 0; out: - blk_unref(blk); - bdrv_unref(bs); + blk_co_unref(blk); + bdrv_co_unref(bs); return ret; exit: @@ -691,7 +691,7 @@ parallels_co_create_opts(BlockDriver *drv, const char *filename, done: qobject_unref(qdict); - bdrv_unref(bs); + bdrv_co_unref(bs); qapi_free_BlockdevCreateOptions(create_options); return ret; } diff --git a/block/qcow.c b/block/qcow.c index 490e4f819e..a0c701f578 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -915,8 +915,8 @@ static int coroutine_fn qcow_co_create(BlockdevCreateOptions *opts, g_free(tmp); ret = 0; exit: - blk_unref(qcow_blk); - bdrv_unref(bs); + blk_co_unref(qcow_blk); + bdrv_co_unref(bs); qcrypto_block_free(crypto); return ret; } @@ -1015,7 +1015,7 @@ qcow_co_create_opts(BlockDriver *drv, const char *filename, fail: g_free(backing_fmt); qobject_unref(qdict); - bdrv_unref(bs); + bdrv_co_unref(bs); qapi_free_BlockdevCreateOptions(create_options); return ret; } diff --git a/block/qcow2.c b/block/qcow2.c index 30fd53fa64..6746763c34 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -3705,7 +3705,7 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp) goto out; } - blk_unref(blk); + blk_co_unref(blk); blk = NULL; /* @@ -3785,7 +3785,7 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp) } } - blk_unref(blk); + blk_co_unref(blk); blk = NULL; /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning. @@ -3810,9 +3810,9 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp) ret = 0; out: - blk_unref(blk); - bdrv_unref(bs); - bdrv_unref(data_bs); + blk_co_unref(blk); + bdrv_co_unref(bs); + bdrv_co_unref(data_bs); return ret; } @@ -3943,8 +3943,8 @@ finish: } qobject_unref(qdict); - bdrv_unref(bs); - bdrv_unref(data_bs); + bdrv_co_unref(bs); + bdrv_co_unref(data_bs); qapi_free_BlockdevCreateOptions(create_options); return ret; } diff --git a/block/qed.c b/block/qed.c index 0705a7b4e2..aff2a2076e 100644 --- a/block/qed.c +++ b/block/qed.c @@ -748,8 +748,8 @@ static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts, ret = 0; /* success */ out: g_free(l1_table); - blk_unref(blk); - bdrv_unref(bs); + blk_co_unref(blk); + bdrv_co_unref(bs); return ret; } @@ -819,7 +819,7 @@ bdrv_qed_co_create_opts(BlockDriver *drv, const char *filename, fail: qobject_unref(qdict); - bdrv_unref(bs); + bdrv_co_unref(bs); qapi_free_BlockdevCreateOptions(create_options); return ret; } diff --git a/block/vdi.c b/block/vdi.c index f2434d6153..08331d2dd7 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -886,8 +886,8 @@ static int coroutine_fn vdi_co_do_create(BlockdevCreateOptions *create_options, ret = 0; exit: - blk_unref(blk); - bdrv_unref(bs_file); + blk_co_unref(blk); + bdrv_co_unref(bs_file); g_free(bmap); return ret; } @@ -975,7 +975,7 @@ vdi_co_create_opts(BlockDriver *drv, const char *filename, done: qobject_unref(qdict); qapi_free_BlockdevCreateOptions(create_options); - bdrv_unref(bs_file); + bdrv_co_unref(bs_file); return ret; } diff --git a/block/vhdx.c b/block/vhdx.c index 81420722a1..00777da91a 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -2053,8 +2053,8 @@ static int coroutine_fn vhdx_co_create(BlockdevCreateOptions *opts, ret = 0; delete_and_exit: - blk_unref(blk); - bdrv_unref(bs); + blk_co_unref(blk); + bdrv_co_unref(bs); g_free(creator); return ret; } @@ -2144,7 +2144,7 @@ vhdx_co_create_opts(BlockDriver *drv, const char *filename, fail: qobject_unref(qdict); - bdrv_unref(bs); + bdrv_co_unref(bs); qapi_free_BlockdevCreateOptions(create_options); return ret; } diff --git a/block/vmdk.c b/block/vmdk.c index f5f49018fe..01ca13c82b 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -2306,7 +2306,7 @@ exit: if (pbb) { *pbb = blk; } else { - blk_unref(blk); + blk_co_unref(blk); blk = NULL; } } @@ -2516,12 +2516,12 @@ vmdk_co_do_create(int64_t size, if (strcmp(blk_bs(backing)->drv->format_name, "vmdk")) { error_setg(errp, "Invalid backing file format: %s. Must be vmdk", blk_bs(backing)->drv->format_name); - blk_unref(backing); + blk_co_unref(backing); ret = -EINVAL; goto exit; } ret = vmdk_read_cid(blk_bs(backing), 0, &parent_cid); - blk_unref(backing); + blk_co_unref(backing); if (ret) { error_setg(errp, "Failed to read parent CID"); goto exit; @@ -2542,14 +2542,14 @@ vmdk_co_do_create(int64_t size, blk_bs(extent_blk)->filename); created_size += cur_size; extent_idx++; - blk_unref(extent_blk); + blk_co_unref(extent_blk); } /* Check whether we got excess extents */ extent_blk = extent_fn(-1, extent_idx, flat, split, compress, zeroed_grain, opaque, NULL); if (extent_blk) { - blk_unref(extent_blk); + blk_co_unref(extent_blk); error_setg(errp, "List of extents contains unused extents"); ret = -EINVAL; goto exit; @@ -2590,7 +2590,7 @@ vmdk_co_do_create(int64_t size, ret = 0; exit: if (blk) { - blk_unref(blk); + blk_co_unref(blk); } g_free(desc); g_free(parent_desc_line); @@ -2641,7 +2641,7 @@ vmdk_co_create_opts_cb(int64_t size, int idx, bool flat, bool split, errp)) { goto exit; } - bdrv_unref(bs); + bdrv_co_unref(bs); exit: g_free(ext_filename); return blk; @@ -2797,12 +2797,12 @@ static BlockBackend * coroutine_fn vmdk_co_create_cb(int64_t size, int idx, return NULL; } blk_set_allow_write_beyond_eof(blk, true); - bdrv_unref(bs); + bdrv_co_unref(bs); if (size != -1) { ret = vmdk_init_extent(blk, size, flat, compress, zeroed_grain, errp); if (ret) { - blk_unref(blk); + blk_co_unref(blk); blk = NULL; } } diff --git a/block/vpc.c b/block/vpc.c index b89b0ff8e2..07ddda5b99 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -1082,8 +1082,8 @@ static int coroutine_fn vpc_co_create(BlockdevCreateOptions *opts, } out: - blk_unref(blk); - bdrv_unref(bs); + blk_co_unref(blk); + bdrv_co_unref(bs); return ret; } @@ -1162,7 +1162,7 @@ vpc_co_create_opts(BlockDriver *drv, const char *filename, fail: qobject_unref(qdict); - bdrv_unref(bs); + bdrv_co_unref(bs); qapi_free_BlockdevCreateOptions(create_options); return ret; } diff --git a/include/block/block-global-state.h b/include/block/block-global-state.h index 2c312cc774..ec3ddb17a8 100644 --- a/include/block/block-global-state.h +++ b/include/block/block-global-state.h @@ -218,7 +218,8 @@ void bdrv_img_create(const char *filename, const char *fmt, bool quiet, Error **errp); void bdrv_ref(BlockDriverState *bs); -void bdrv_unref(BlockDriverState *bs); +void no_coroutine_fn bdrv_unref(BlockDriverState *bs); +void coroutine_fn no_co_wrapper bdrv_co_unref(BlockDriverState *bs); void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child); BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, BlockDriverState *child_bs, diff --git a/include/sysemu/block-backend-global-state.h b/include/sysemu/block-backend-global-state.h index 2b6d27db7c..fa83f9389c 100644 --- a/include/sysemu/block-backend-global-state.h +++ b/include/sysemu/block-backend-global-state.h @@ -42,7 +42,10 @@ blk_co_new_open(const char *filename, const char *reference, QDict *options, int blk_get_refcnt(BlockBackend *blk); void blk_ref(BlockBackend *blk); -void blk_unref(BlockBackend *blk); + +void no_coroutine_fn blk_unref(BlockBackend *blk); +void coroutine_fn no_co_wrapper blk_co_unref(BlockBackend *blk); + void blk_remove_all_bs(void); BlockBackend *blk_by_name(const char *name); BlockBackend *blk_next(BlockBackend *blk); From 3b02d0db4a7424324afbec51e810b837e28b55f8 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 4 May 2023 13:57:34 +0200 Subject: [PATCH 24/76] block: Don't call no_coroutine_fns in qmp_block_resize() This QMP handler runs in a coroutine, so it must use the corresponding no_co_wrappers instead. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=2185688 Cc: qemu-stable@nongnu.org Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-Id: <20230504115750.54437-5-kwolf@redhat.com> Signed-off-by: Kevin Wolf (cherry picked from commit 0c7d204f50c382c6baac8c94bd57af4a022b3888) Signed-off-by: Michael Tokarev --- blockdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/blockdev.c b/blockdev.c index 2c1752a403..e464daea58 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2440,7 +2440,7 @@ void coroutine_fn qmp_block_resize(const char *device, const char *node_name, return; } - blk = blk_new_with_bs(bs, BLK_PERM_RESIZE, BLK_PERM_ALL, errp); + blk = blk_co_new_with_bs(bs, BLK_PERM_RESIZE, BLK_PERM_ALL, errp); if (!blk) { return; } @@ -2455,7 +2455,7 @@ void coroutine_fn qmp_block_resize(const char *device, const char *node_name, bdrv_co_lock(bs); bdrv_drained_end(bs); - blk_unref(blk); + blk_co_unref(blk); bdrv_co_unlock(bs); } From f5301431e811fbc45c6c0e8ab8c2965f8134cfd7 Mon Sep 17 00:00:00 2001 From: LIU Zhiwei Date: Fri, 24 Mar 2023 14:40:11 +0800 Subject: [PATCH 25/76] target/riscv: Fix itrigger when icount is used When I boot a ubuntu image, QEMU output a "Bad icount read" message and exit. The reason is that when execute helper_mret or helper_sret, it will cause a call to icount_get_raw_locked (), which needs set can_do_io flag on cpustate. Thus we setting this flag when execute these two instructions. Signed-off-by: LIU Zhiwei Reviewed-by: Weiwei Li Acked-by: Alistair Francis Message-Id: <20230324064011.976-1-zhiwei_liu@linux.alibaba.com> Signed-off-by: Alistair Francis (cherry picked from commit df3ac6da476e346a17bad5bc843de1135a269229) Signed-off-by: Michael Tokarev --- target/riscv/insn_trans/trans_privileged.c.inc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/target/riscv/insn_trans/trans_privileged.c.inc b/target/riscv/insn_trans/trans_privileged.c.inc index 59501b2780..e3bee971c6 100644 --- a/target/riscv/insn_trans/trans_privileged.c.inc +++ b/target/riscv/insn_trans/trans_privileged.c.inc @@ -77,6 +77,9 @@ static bool trans_sret(DisasContext *ctx, arg_sret *a) #ifndef CONFIG_USER_ONLY if (has_ext(ctx, RVS)) { decode_save_opc(ctx); + if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) { + gen_io_start(); + } gen_helper_sret(cpu_pc, cpu_env); exit_tb(ctx); /* no chaining */ ctx->base.is_jmp = DISAS_NORETURN; @@ -93,6 +96,9 @@ static bool trans_mret(DisasContext *ctx, arg_mret *a) { #ifndef CONFIG_USER_ONLY decode_save_opc(ctx); + if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) { + gen_io_start(); + } gen_helper_mret(cpu_pc, cpu_env); exit_tb(ctx); /* no chaining */ ctx->base.is_jmp = DISAS_NORETURN; From f91d0db71ed850d0125b8ed3ba8bbe0759a62767 Mon Sep 17 00:00:00 2001 From: Bin Meng Date: Mon, 17 Apr 2023 12:30:54 +0800 Subject: [PATCH 26/76] target/riscv: Restore the predicate() NULL check behavior When reading a non-existent CSR QEMU should raise illegal instruction exception, but currently it just exits due to the g_assert() check. This actually reverts commit 0ee342256af9205e7388efdf193a6d8f1ba1a617. Some comments are also added to indicate that predicate() must be provided for an implemented CSR. Reported-by: Fei Wu Signed-off-by: Bin Meng Reviewed-by: Daniel Henrique Barboza Reviewed-by: Weiwei Li Reviewed-by: Alistair Francis Reviewed-by: LIU Zhiwei Message-Id: <20230417043054.3125614-1-bmeng@tinylab.org> Signed-off-by: Alistair Francis (cherry picked from commit eae04c4c131a8d95087c8568eb2cac1988262f25) (mjt: context edit after ce3af0bbbcdfa "target/riscv: add support for Zcmt extension") Signed-off-by: Michael Tokarev --- target/riscv/csr.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/target/riscv/csr.c b/target/riscv/csr.c index d522efc0b6..736ab64275 100644 --- a/target/riscv/csr.c +++ b/target/riscv/csr.c @@ -3797,6 +3797,11 @@ static inline RISCVException riscv_csrrw_check(CPURISCVState *env, return RISCV_EXCP_ILLEGAL_INST; } + /* ensure CSR is implemented by checking predicate */ + if (!csr_ops[csrno].predicate) { + return RISCV_EXCP_ILLEGAL_INST; + } + /* privileged spec version check */ if (env->priv_ver < csr_min_priv) { return RISCV_EXCP_ILLEGAL_INST; @@ -3814,7 +3819,6 @@ static inline RISCVException riscv_csrrw_check(CPURISCVState *env, * illegal instruction exception should be triggered instead of virtual * instruction exception. Hence this comes after the read / write check. */ - g_assert(csr_ops[csrno].predicate != NULL); RISCVException ret = csr_ops[csrno].predicate(env, csrno); if (ret != RISCV_EXCP_NONE) { return ret; @@ -3991,7 +3995,10 @@ RISCVException riscv_csrrw_debug(CPURISCVState *env, int csrno, return ret; } -/* Control and Status Register function table */ +/* + * Control and Status Register function table + * riscv_csr_operations::predicate() must be provided for an implemented CSR + */ riscv_csr_operations csr_ops[CSR_TABLE_SIZE] = { /* User Floating-Point CSRs */ [CSR_FFLAGS] = { "fflags", fs, read_fflags, write_fflags }, From 488ad8b302e58add463e5f1f5619f3b7a2f98f23 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Thu, 20 Apr 2023 15:27:49 +0100 Subject: [PATCH 27/76] hw/pci-bridge: pci_expander_bridge fix type in pxb_cxl_dev_reset() Reproduce issue with configure --enable-qom-cast-debug ... qemu-system-x86_64 -display none -machine q35,cxl=on -device pxb-cxl,bus=pcie.0 hw/pci-bridge/pci_expander_bridge.c:54:PXB_DEV: Object 0x5570e0b1ada0 is not an instance of type pxb Aborted The type conversion results in the right state structure, but PXB_DEV is not a parent of PXB_CXL_DEV hence the error. Rather than directly cleaning up the inheritance, this is the minimal fix which will be followed by the cleanup. Fixes: 154070eaf6 ("hw/pxb-cxl: Support passthrough HDM Decoders unless overridden") Reported-by: Peter Maydell Signed-off-by: Jonathan Cameron Message-Id: <20230420142750.6950-2-Jonathan.Cameron@huawei.com> Reviewed-by: Thomas Huth Cc: qemu-stable@nongnu.org Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin (cherry picked from commit 9136f661c7277777a2f85a7e98438f4fe6472fdc) Signed-off-by: Michael Tokarev --- hw/pci-bridge/pci_expander_bridge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/pci-bridge/pci_expander_bridge.c b/hw/pci-bridge/pci_expander_bridge.c index ead33f0c05..a78327b5f2 100644 --- a/hw/pci-bridge/pci_expander_bridge.c +++ b/hw/pci-bridge/pci_expander_bridge.c @@ -311,7 +311,7 @@ static void pxb_cxl_dev_reset(DeviceState *dev) * The CXL specification allows for host bridges with no HDM decoders * if they only have a single root port. */ - if (!PXB_DEV(dev)->hdm_for_passthrough) { + if (!PXB_CXL_DEV(dev)->hdm_for_passthrough) { dsp_count = pcie_count_ds_ports(hb->bus); } /* Initial reset will have 0 dsp so wait until > 0 */ From 4b59b5bd14861817c2d965316da9c180ec8d1364 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 5 May 2023 21:40:49 +0100 Subject: [PATCH 28/76] accel/tcg: Fix atomic_mmu_lookup for reads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A copy-paste bug had us looking at the victim cache for writes. Cc: qemu-stable@nongnu.org Reported-by: Peter Maydell Signed-off-by: Richard Henderson Fixes: 08dff435e2 ("tcg: Probe the proper permissions for atomic ops") Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Peter Maydell Message-Id: <20230505204049.352469-1-richard.henderson@linaro.org> (cherry picked from commit 8c313254e61ed47a1bf4a2db714b25cdd94fbcce) Signed-off-by: Michael Tokarev --- accel/tcg/cputlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c index e984a98dc4..145fba45b2 100644 --- a/accel/tcg/cputlb.c +++ b/accel/tcg/cputlb.c @@ -1830,7 +1830,7 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr, } else /* if (prot & PAGE_READ) */ { tlb_addr = tlbe->addr_read; if (!tlb_hit(tlb_addr, addr)) { - if (!VICTIM_TLB_HIT(addr_write, addr)) { + if (!VICTIM_TLB_HIT(addr_read, addr)) { tlb_fill(env_cpu(env), addr, size, MMU_DATA_LOAD, mmu_idx, retaddr); index = tlb_index(env, mmu_idx, addr); From 3148fe1ac86f3755addc2ace56750641f1bbfa34 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Fri, 12 May 2023 15:43:37 +0100 Subject: [PATCH 29/76] target/arm: Fix handling of SW and NSW bits for stage 2 walks We currently don't correctly handle the VSTCR_EL2.SW and VTCR_EL2.NSW configuration bits. These allow configuration of whether the stage 2 page table walks for Secure IPA and NonSecure IPA should do their descriptor reads from Secure or NonSecure physical addresses. (This is separate from how the translation table base address and other parameters are set: an NS IPA always uses VTTBR_EL2 and VTCR_EL2 for its base address and walk parameters, regardless of the NSW bit, and similarly for Secure.) Provide a new function ptw_idx_for_stage_2() which returns the MMU index to use for descriptor reads, and use it to set up the .in_ptw_idx wherever we call get_phys_addr_lpae(). For a stage 2 walk, wherever we call get_phys_addr_lpae(): * .in_ptw_idx should be ptw_idx_for_stage_2() of the .in_mmu_idx * .in_secure should be true if .in_mmu_idx is Stage2_S This allows us to correct S1_ptw_translate() so that it consistently always sets its (out_secure, out_phys) to the result it gets from the S2 walk (either by calling get_phys_addr_lpae() or by TLB lookup). This makes better conceptual sense because the S2 walk should return us an (address space, address) tuple, not an address that we then randomly assign to S or NS. Our previous handling of SW and NSW was broken, so guest code trying to use these bits to put the s2 page tables in the "other" address space wouldn't work correctly. Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1600 Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20230504135425.2748672-3-peter.maydell@linaro.org (cherry picked from commit fcc0b0418fff655f20fd0cf86a1bbdc41fd2e7c6) Signed-off-by: Michael Tokarev --- target/arm/ptw.c | 76 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 51 insertions(+), 25 deletions(-) diff --git a/target/arm/ptw.c b/target/arm/ptw.c index 6d72950a79..00399a2e9c 100644 --- a/target/arm/ptw.c +++ b/target/arm/ptw.c @@ -103,6 +103,37 @@ ARMMMUIdx arm_stage1_mmu_idx(CPUARMState *env) return stage_1_mmu_idx(arm_mmu_idx(env)); } +/* + * Return where we should do ptw loads from for a stage 2 walk. + * This depends on whether the address we are looking up is a + * Secure IPA or a NonSecure IPA, which we know from whether this is + * Stage2 or Stage2_S. + * If this is the Secure EL1&0 regime we need to check the NSW and SW bits. + */ +static ARMMMUIdx ptw_idx_for_stage_2(CPUARMState *env, ARMMMUIdx stage2idx) +{ + bool s2walk_secure; + + /* + * We're OK to check the current state of the CPU here because + * (1) we always invalidate all TLBs when the SCR_EL3.NS bit changes + * (2) there's no way to do a lookup that cares about Stage 2 for a + * different security state to the current one for AArch64, and AArch32 + * never has a secure EL2. (AArch32 ATS12NSO[UP][RW] allow EL3 to do + * an NS stage 1+2 lookup while the NS bit is 0.) + */ + if (!arm_is_secure_below_el3(env) || !arm_el_is_aa64(env, 3)) { + return ARMMMUIdx_Phys_NS; + } + if (stage2idx == ARMMMUIdx_Stage2_S) { + s2walk_secure = !(env->cp15.vstcr_el2 & VSTCR_SW); + } else { + s2walk_secure = !(env->cp15.vtcr_el2 & VTCR_NSW); + } + return s2walk_secure ? ARMMMUIdx_Phys_S : ARMMMUIdx_Phys_NS; + +} + static bool regime_translation_big_endian(CPUARMState *env, ARMMMUIdx mmu_idx) { return (regime_sctlr(env, mmu_idx) & SCTLR_EE) != 0; @@ -220,7 +251,6 @@ static bool S1_ptw_translate(CPUARMState *env, S1Translate *ptw, ARMMMUIdx mmu_idx = ptw->in_mmu_idx; ARMMMUIdx s2_mmu_idx = ptw->in_ptw_idx; uint8_t pte_attrs; - bool pte_secure; ptw->out_virt = addr; @@ -232,8 +262,8 @@ static bool S1_ptw_translate(CPUARMState *env, S1Translate *ptw, if (regime_is_stage2(s2_mmu_idx)) { S1Translate s2ptw = { .in_mmu_idx = s2_mmu_idx, - .in_ptw_idx = is_secure ? ARMMMUIdx_Phys_S : ARMMMUIdx_Phys_NS, - .in_secure = is_secure, + .in_ptw_idx = ptw_idx_for_stage_2(env, s2_mmu_idx), + .in_secure = s2_mmu_idx == ARMMMUIdx_Stage2_S, .in_debug = true, }; GetPhysAddrResult s2 = { }; @@ -244,12 +274,12 @@ static bool S1_ptw_translate(CPUARMState *env, S1Translate *ptw, } ptw->out_phys = s2.f.phys_addr; pte_attrs = s2.cacheattrs.attrs; - pte_secure = s2.f.attrs.secure; + ptw->out_secure = s2.f.attrs.secure; } else { /* Regime is physical. */ ptw->out_phys = addr; pte_attrs = 0; - pte_secure = is_secure; + ptw->out_secure = s2_mmu_idx == ARMMMUIdx_Phys_S; } ptw->out_host = NULL; ptw->out_rw = false; @@ -270,7 +300,7 @@ static bool S1_ptw_translate(CPUARMState *env, S1Translate *ptw, ptw->out_phys = full->phys_addr | (addr & ~TARGET_PAGE_MASK); ptw->out_rw = full->prot & PAGE_WRITE; pte_attrs = full->pte_attrs; - pte_secure = full->attrs.secure; + ptw->out_secure = full->attrs.secure; #else g_assert_not_reached(); #endif @@ -293,11 +323,6 @@ static bool S1_ptw_translate(CPUARMState *env, S1Translate *ptw, } } - /* Check if page table walk is to secure or non-secure PA space. */ - ptw->out_secure = (is_secure - && !(pte_secure - ? env->cp15.vstcr_el2 & VSTCR_SW - : env->cp15.vtcr_el2 & VTCR_NSW)); ptw->out_be = regime_translation_big_endian(env, mmu_idx); return true; @@ -2713,7 +2738,7 @@ static bool get_phys_addr_twostage(CPUARMState *env, S1Translate *ptw, hwaddr ipa; int s1_prot, s1_lgpgsz; bool is_secure = ptw->in_secure; - bool ret, ipa_secure, s2walk_secure; + bool ret, ipa_secure; ARMCacheAttrs cacheattrs1; bool is_el0; uint64_t hcr; @@ -2727,20 +2752,11 @@ static bool get_phys_addr_twostage(CPUARMState *env, S1Translate *ptw, ipa = result->f.phys_addr; ipa_secure = result->f.attrs.secure; - if (is_secure) { - /* Select TCR based on the NS bit from the S1 walk. */ - s2walk_secure = !(ipa_secure - ? env->cp15.vstcr_el2 & VSTCR_SW - : env->cp15.vtcr_el2 & VTCR_NSW); - } else { - assert(!ipa_secure); - s2walk_secure = false; - } is_el0 = ptw->in_mmu_idx == ARMMMUIdx_Stage1_E0; - ptw->in_mmu_idx = s2walk_secure ? ARMMMUIdx_Stage2_S : ARMMMUIdx_Stage2; - ptw->in_ptw_idx = s2walk_secure ? ARMMMUIdx_Phys_S : ARMMMUIdx_Phys_NS; - ptw->in_secure = s2walk_secure; + ptw->in_mmu_idx = ipa_secure ? ARMMMUIdx_Stage2_S : ARMMMUIdx_Stage2; + ptw->in_secure = ipa_secure; + ptw->in_ptw_idx = ptw_idx_for_stage_2(env, ptw->in_mmu_idx); /* * S1 is done, now do S2 translation. @@ -2848,6 +2864,16 @@ static bool get_phys_addr_with_struct(CPUARMState *env, S1Translate *ptw, ptw->in_ptw_idx = is_secure ? ARMMMUIdx_Stage2_S : ARMMMUIdx_Stage2; break; + case ARMMMUIdx_Stage2: + case ARMMMUIdx_Stage2_S: + /* + * Second stage lookup uses physical for ptw; whether this is S or + * NS may depend on the SW/NSW bits if this is a stage 2 lookup for + * the Secure EL2&0 regime. + */ + ptw->in_ptw_idx = ptw_idx_for_stage_2(env, mmu_idx); + break; + case ARMMMUIdx_E10_0: s1_mmu_idx = ARMMMUIdx_Stage1_E0; goto do_twostage; @@ -2871,7 +2897,7 @@ static bool get_phys_addr_with_struct(CPUARMState *env, S1Translate *ptw, /* fall through */ default: - /* Single stage and second stage uses physical for ptw. */ + /* Single stage uses physical for ptw. */ ptw->in_ptw_idx = is_secure ? ARMMMUIdx_Phys_S : ARMMMUIdx_Phys_NS; break; } From 80a2c1b5feb86c747dbfd580cda059a7b8107983 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Fri, 12 May 2023 15:43:38 +0100 Subject: [PATCH 30/76] ui: Fix pixel colour channel order for PNG screenshots MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we take a PNG screenshot the ordering of the colour channels in the data is not correct, resulting in the image having weird colouring compared to the actual display. (Specifically, on a little-endian host the blue and red channels are swapped; on big-endian everything is wrong.) This happens because the pixman idea of the pixel data and the libpng idea differ. PIXMAN_a8r8g8b8 defines that pixels are 32-bit values, with A in bits 24-31, R in bits 16-23, G in bits 8-15 and B in bits 0-7. This means that on little-endian systems the bytes in memory are B G R A and on big-endian systems they are A R G B libpng, on the other hand, thinks of pixels as being a series of values for each channel, so its format PNG_COLOR_TYPE_RGB_ALPHA always wants bytes in the order R G B A This isn't the same as the pixman order for either big or little endian hosts. The alpha channel is also unnecessary bulk in the output PNG file, because there is no alpha information in a screenshot. To handle the endianness issue, we already define in ui/qemu-pixman.h various PIXMAN_BE_* and PIXMAN_LE_* values that give consistent byte-order pixel channel formats. So we can use PIXMAN_BE_r8g8b8 and PNG_COLOR_TYPE_RGB, which both have an in-memory byte order of R G B and 3 bytes per pixel. (PPM format screenshots get this right; they already use the PIXMAN_BE_r8g8b8 format.) Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1622 Fixes: 9a0a119a382867 ("Added parameter to take screenshot with screendump as PNG") Signed-off-by: Peter Maydell Reviewed-by: Marc-André Lureau Message-id: 20230502135548.2451309-1-peter.maydell@linaro.org (cherry picked from commit cd22a0f520f471e3bd33bc19cf3b2fa772cdb2a8) Signed-off-by: Michael Tokarev --- ui/console.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ui/console.c b/ui/console.c index 6e8a3cdc62..e173731e20 100644 --- a/ui/console.c +++ b/ui/console.c @@ -311,7 +311,7 @@ static bool png_save(int fd, pixman_image_t *image, Error **errp) png_struct *png_ptr; png_info *info_ptr; g_autoptr(pixman_image_t) linebuf = - qemu_pixman_linebuf_create(PIXMAN_a8r8g8b8, width); + qemu_pixman_linebuf_create(PIXMAN_BE_r8g8b8, width); uint8_t *buf = (uint8_t *)pixman_image_get_data(linebuf); FILE *f = fdopen(fd, "wb"); int y; @@ -341,7 +341,7 @@ static bool png_save(int fd, pixman_image_t *image, Error **errp) png_init_io(png_ptr, f); png_set_IHDR(png_ptr, info_ptr, width, height, 8, - PNG_COLOR_TYPE_RGB_ALPHA, PNG_INTERLACE_NONE, + PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE, PNG_FILTER_TYPE_BASE); png_write_info(png_ptr, info_ptr); From e09f912550ef9a8519869005c7e71f1eae869475 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Tue, 9 May 2023 10:20:59 +0100 Subject: [PATCH 31/76] target/arm: Correct AArch64.S2MinTxSZ 32-bit EL1 input size check In check_s2_mmu_setup() we have a check that is attempting to implement the part of AArch64.S2MinTxSZ that is specific to when EL1 is AArch32: if !s1aarch64 then // EL1 is AArch32 min_txsz = Min(min_txsz, 24); Unfortunately we got this wrong in two ways: (1) The minimum txsz corresponds to a maximum inputsize, but we got the sense of the comparison wrong and were faulting for all inputsizes less than 40 bits (2) We try to implement this as an extra check that happens after we've done the same txsz checks we would do for an AArch64 EL1, but in fact the pseudocode is *loosening* the requirements, so that txsz values that would fault for an AArch64 EL1 do not fault for AArch32 EL1, because it does Min(old_min, 24), not Max(old_min, 24). You can see this also in the text of the Arm ARM in table D8-8, which shows that where the implemented PA size is less than 40 bits an AArch32 EL1 is still OK with a configured stage2 T0SZ for a 40 bit IPA, whereas if EL1 is AArch64 then the T0SZ must be big enough to constrain the IPA to the implemented PA size. Because of part (2), we can't do this as a separate check, but have to integrate it into aa64_va_parameters(). Add a new argument to that function to indicate that EL1 is 32-bit. All the existing callsites except the one in get_phys_addr_lpae() can pass 'false', because they are either doing a lookup for a stage 1 regime or else they don't care about the tsz/tsz_oob fields. Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1627 Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20230509092059.3176487-1-peter.maydell@linaro.org (cherry picked from commit 478dccbb99db0bf8f00537dd0b4d0de88d5cb537) Signed-off-by: Michael Tokarev --- target/arm/gdbstub64.c | 2 +- target/arm/helper.c | 15 +++++++++++++-- target/arm/internals.h | 12 +++++++++++- target/arm/ptw.c | 14 ++------------ target/arm/tcg/pauth_helper.c | 6 +++--- 5 files changed, 30 insertions(+), 19 deletions(-) diff --git a/target/arm/gdbstub64.c b/target/arm/gdbstub64.c index c1f7e8c934..d7b79a6589 100644 --- a/target/arm/gdbstub64.c +++ b/target/arm/gdbstub64.c @@ -233,7 +233,7 @@ int aarch64_gdb_get_pauth_reg(CPUARMState *env, GByteArray *buf, int reg) ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env); ARMVAParameters param; - param = aa64_va_parameters(env, -is_high, mmu_idx, is_data); + param = aa64_va_parameters(env, -is_high, mmu_idx, is_data, false); return gdb_get_reg64(buf, pauth_ptr_mask(param)); } default: diff --git a/target/arm/helper.c b/target/arm/helper.c index 2297626bfb..0b7fd2e7e6 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -4904,7 +4904,7 @@ static TLBIRange tlbi_aa64_get_range(CPUARMState *env, ARMMMUIdx mmuidx, unsigned int page_size_granule, page_shift, num, scale, exponent; /* Extract one bit to represent the va selector in use. */ uint64_t select = sextract64(value, 36, 1); - ARMVAParameters param = aa64_va_parameters(env, select, mmuidx, true); + ARMVAParameters param = aa64_va_parameters(env, select, mmuidx, true, false); TLBIRange ret = { }; ARMGranuleSize gran; @@ -11193,7 +11193,8 @@ static ARMGranuleSize sanitize_gran_size(ARMCPU *cpu, ARMGranuleSize gran, } ARMVAParameters aa64_va_parameters(CPUARMState *env, uint64_t va, - ARMMMUIdx mmu_idx, bool data) + ARMMMUIdx mmu_idx, bool data, + bool el1_is_aa32) { uint64_t tcr = regime_tcr(env, mmu_idx); bool epd, hpd, tsz_oob, ds, ha, hd; @@ -11289,6 +11290,16 @@ ARMVAParameters aa64_va_parameters(CPUARMState *env, uint64_t va, } } + if (stage2 && el1_is_aa32) { + /* + * For AArch32 EL1 the min txsz (and thus max IPA size) requirements + * are loosened: a configured IPA of 40 bits is permitted even if + * the implemented PA is less than that (and so a 40 bit IPA would + * fault for an AArch64 EL1). See R_DTLMN. + */ + min_tsz = MIN(min_tsz, 24); + } + if (tsz > max_tsz) { tsz = max_tsz; tsz_oob = true; diff --git a/target/arm/internals.h b/target/arm/internals.h index c2c70d5918..f82b3db411 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -1091,8 +1091,18 @@ typedef struct ARMVAParameters { ARMGranuleSize gran : 2; } ARMVAParameters; +/** + * aa64_va_parameters: Return parameters for an AArch64 virtual address + * @env: CPU + * @va: virtual address to look up + * @mmu_idx: determines translation regime to use + * @data: true if this is a data access + * @el1_is_aa32: true if we are asking about stage 2 when EL1 is AArch32 + * (ignored if @mmu_idx is for a stage 1 regime; only affects tsz/tsz_oob) + */ ARMVAParameters aa64_va_parameters(CPUARMState *env, uint64_t va, - ARMMMUIdx mmu_idx, bool data); + ARMMMUIdx mmu_idx, bool data, + bool el1_is_aa32); int aa64_va_parameter_tbi(uint64_t tcr, ARMMMUIdx mmu_idx); int aa64_va_parameter_tbid(uint64_t tcr, ARMMMUIdx mmu_idx); diff --git a/target/arm/ptw.c b/target/arm/ptw.c index 00399a2e9c..48f5992348 100644 --- a/target/arm/ptw.c +++ b/target/arm/ptw.c @@ -1122,17 +1122,6 @@ static int check_s2_mmu_setup(ARMCPU *cpu, bool is_aa64, uint64_t tcr, sl0 = extract32(tcr, 6, 2); if (is_aa64) { - /* - * AArch64.S2InvalidTxSZ: While we checked tsz_oob near the top of - * get_phys_addr_lpae, that used aa64_va_parameters which apply - * to aarch64. If Stage1 is aarch32, the min_txsz is larger. - * See AArch64.S2MinTxSZ, where min_tsz is 24, translated to - * inputsize is 64 - 24 = 40. - */ - if (iasize < 40 && !arm_el_is_aa64(&cpu->env, 1)) { - goto fail; - } - /* * AArch64.S2InvalidSL: Interpretation of SL depends on the page size, * so interleave AArch64.S2StartLevel. @@ -1272,7 +1261,8 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, int ps; param = aa64_va_parameters(env, address, mmu_idx, - access_type != MMU_INST_FETCH); + access_type != MMU_INST_FETCH, + !arm_el_is_aa64(env, 1)); level = 0; /* diff --git a/target/arm/tcg/pauth_helper.c b/target/arm/tcg/pauth_helper.c index de067fa716..62af569341 100644 --- a/target/arm/tcg/pauth_helper.c +++ b/target/arm/tcg/pauth_helper.c @@ -293,7 +293,7 @@ static uint64_t pauth_addpac(CPUARMState *env, uint64_t ptr, uint64_t modifier, ARMPACKey *key, bool data) { ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env); - ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data); + ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data, false); uint64_t pac, ext_ptr, ext, test; int bot_bit, top_bit; @@ -355,7 +355,7 @@ static uint64_t pauth_auth(CPUARMState *env, uint64_t ptr, uint64_t modifier, ARMPACKey *key, bool data, int keynumber) { ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env); - ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data); + ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data, false); int bot_bit, top_bit; uint64_t pac, orig_ptr, test; @@ -379,7 +379,7 @@ static uint64_t pauth_auth(CPUARMState *env, uint64_t ptr, uint64_t modifier, static uint64_t pauth_strip(CPUARMState *env, uint64_t ptr, bool data) { ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env); - ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data); + ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data, false); return pauth_original_ptr(ptr, param); } From 950882af6769428df019b578579250634a755e09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 20 Apr 2023 22:29:39 +0200 Subject: [PATCH 32/76] async: Suppress GCC13 false positive in aio_bh_poll() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC13 reports an error : ../util/async.c: In function ‘aio_bh_poll’: include/qemu/queue.h:303:22: error: storing the address of local variable ‘slice’ in ‘*ctx.bh_slice_list.sqh_last’ [-Werror=dangling-pointer=] 303 | (head)->sqh_last = &(elm)->field.sqe_next; \ | ~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~ ../util/async.c:169:5: note: in expansion of macro ‘QSIMPLEQ_INSERT_TAIL’ 169 | QSIMPLEQ_INSERT_TAIL(&ctx->bh_slice_list, &slice, next); | ^~~~~~~~~~~~~~~~~~~~ ../util/async.c:161:17: note: ‘slice’ declared here 161 | BHListSlice slice; | ^~~~~ ../util/async.c:161:17: note: ‘ctx’ declared here But the local variable 'slice' is removed from the global context list in following loop of the same routine. Add a pragma to silent GCC. Cc: Stefan Hajnoczi Cc: Paolo Bonzini Cc: Daniel P. Berrangé Signed-off-by: Cédric Le Goater Reviewed-by: Daniel Henrique Barboza Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Stefan Hajnoczi Reviewed-by: Thomas Huth Tested-by: Daniel Henrique Barboza Message-Id: <20230420202939.1982044-1-clg@kaod.org> Signed-off-by: Paolo Bonzini (cherry picked from commit d66ba6dc1cce914673bd8a89fca30a7715ea70d1) Signed-off-by: Michael Tokarev (Mjt: cherry-picked to stable-8.0 to eliminate CI failures on win*) --- util/async.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/util/async.c b/util/async.c index 21016a1ac7..856e1a8a33 100644 --- a/util/async.c +++ b/util/async.c @@ -164,7 +164,21 @@ int aio_bh_poll(AioContext *ctx) /* Synchronizes with QSLIST_INSERT_HEAD_ATOMIC in aio_bh_enqueue(). */ QSLIST_MOVE_ATOMIC(&slice.bh_list, &ctx->bh_list); + + /* + * GCC13 [-Werror=dangling-pointer=] complains that the local variable + * 'slice' is being stored in the global 'ctx->bh_slice_list' but the + * list is emptied before this function returns. + */ +#if !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpragmas" +#pragma GCC diagnostic ignored "-Wdangling-pointer=" +#endif QSIMPLEQ_INSERT_TAIL(&ctx->bh_slice_list, &slice, next); +#if !defined(__clang__) +#pragma GCC diagnostic pop +#endif while ((s = QSIMPLEQ_FIRST(&ctx->bh_slice_list))) { QEMUBH *bh; From 7ceebe3f90476d8a40b1172364bbdb393eb2ae68 Mon Sep 17 00:00:00 2001 From: Shivaprasad G Bhat Date: Thu, 4 May 2023 05:35:39 -0400 Subject: [PATCH 33/76] tcg: ppc64: Fix mask generation for vextractdm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In function do_extractm() the mask is calculated as dup_const(1 << (element_width - 1)). '1' being signed int works fine for MO_8,16,32. For MO_64, on PPC64 host this ends up becoming 0 on compilation. The vextractdm uses MO_64, and it ends up having mask as 0. Explicitly use 1ULL instead of signed int 1 like its used everywhere else. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1536 Signed-off-by: Shivaprasad G Bhat Reviewed-by: Alex Bennée Reviewed-by: Lucas Mateus Castro Reviewed-by: Richard Henderson Reviewed-by: Cédric Le Goater Message-Id: <168319292809.1159309.5817546227121323288.stgit@ltc-boston1.aus.stglabs.ibm.com> Signed-off-by: Daniel Henrique Barboza (cherry picked from commit 6a5d81b17201ab8a95539bad94c8a6c08a42e076) Signed-off-by: Michael Tokarev --- target/ppc/translate/vmx-impl.c.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index 112233b541..c8712dd7d8 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -2058,7 +2058,7 @@ static bool trans_VEXPANDQM(DisasContext *ctx, arg_VX_tb *a) static bool do_vextractm(DisasContext *ctx, arg_VX_tb *a, unsigned vece) { const uint64_t elem_width = 8 << vece, elem_count_half = 8 >> vece, - mask = dup_const(vece, 1 << (elem_width - 1)); + mask = dup_const(vece, 1ULL << (elem_width - 1)); uint64_t i, j; TCGv_i64 lo, hi, t0, t1; From b858c53ef632b80f3269773a18b17639b1eec62c Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 27 Apr 2023 01:58:12 +0200 Subject: [PATCH 34/76] target/s390x: Fix EXECUTE of relative branches Fix a problem similar to the one fixed by commit 703d03a4aaf3 ("target/s390x: Fix EXECUTE of relative long instructions"), but now for relative branches. Reported-by: Nina Schoetterl-Glausch Signed-off-by: Ilya Leoshkevich Reviewed-by: Richard Henderson Message-Id: <20230426235813.198183-2-iii@linux.ibm.com> Signed-off-by: Thomas Huth (cherry picked from commit e8ecdfeb30f087574191cde523e846e023911c8d) Signed-off-by: Michael Tokarev --- target/s390x/tcg/translate.c | 81 ++++++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 23 deletions(-) diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c index 2d9b4bbb1f..056594300a 100644 --- a/target/s390x/tcg/translate.c +++ b/target/s390x/tcg/translate.c @@ -1534,18 +1534,51 @@ static DisasJumpType op_bal(DisasContext *s, DisasOps *o) } } +/* + * Disassemble the target of a branch. The results are returned in a form + * suitable for passing into help_branch(): + * + * - bool IS_IMM reflects whether the target is fixed or computed. Non-EXECUTEd + * branches, whose DisasContext *S contains the relative immediate field RI, + * are considered fixed. All the other branches are considered computed. + * - int IMM is the value of RI. + * - TCGv_i64 CDEST is the address of the computed target. + */ +#define disas_jdest(s, ri, is_imm, imm, cdest) do { \ + if (have_field(s, ri)) { \ + if (unlikely(s->ex_value)) { \ + cdest = tcg_temp_new_i64(); \ + tcg_gen_ld_i64(cdest, cpu_env, offsetof(CPUS390XState, ex_target));\ + tcg_gen_addi_i64(cdest, cdest, (int64_t)get_field(s, ri) * 2); \ + is_imm = false; \ + } else { \ + is_imm = true; \ + } \ + } else { \ + is_imm = false; \ + } \ + imm = is_imm ? get_field(s, ri) : 0; \ +} while (false) + static DisasJumpType op_basi(DisasContext *s, DisasOps *o) { + DisasCompare c; + bool is_imm; + int imm; + pc_to_link_info(o->out, s, s->pc_tmp); - return help_goto_direct(s, s->base.pc_next + (int64_t)get_field(s, i2) * 2); + + disas_jdest(s, i2, is_imm, imm, o->in2); + disas_jcc(s, &c, 0xf); + return help_branch(s, &c, is_imm, imm, o->in2); } static DisasJumpType op_bc(DisasContext *s, DisasOps *o) { int m1 = get_field(s, m1); - bool is_imm = have_field(s, i2); - int imm = is_imm ? get_field(s, i2) : 0; DisasCompare c; + bool is_imm; + int imm; /* BCR with R2 = 0 causes no branching */ if (have_field(s, r2) && get_field(s, r2) == 0) { @@ -1562,6 +1595,7 @@ static DisasJumpType op_bc(DisasContext *s, DisasOps *o) return DISAS_NEXT; } + disas_jdest(s, i2, is_imm, imm, o->in2); disas_jcc(s, &c, m1); return help_branch(s, &c, is_imm, imm, o->in2); } @@ -1569,10 +1603,10 @@ static DisasJumpType op_bc(DisasContext *s, DisasOps *o) static DisasJumpType op_bct32(DisasContext *s, DisasOps *o) { int r1 = get_field(s, r1); - bool is_imm = have_field(s, i2); - int imm = is_imm ? get_field(s, i2) : 0; DisasCompare c; + bool is_imm; TCGv_i64 t; + int imm; c.cond = TCG_COND_NE; c.is_64 = false; @@ -1584,6 +1618,7 @@ static DisasJumpType op_bct32(DisasContext *s, DisasOps *o) c.u.s32.b = tcg_constant_i32(0); tcg_gen_extrl_i64_i32(c.u.s32.a, t); + disas_jdest(s, i2, is_imm, imm, o->in2); return help_branch(s, &c, is_imm, imm, o->in2); } @@ -1611,9 +1646,9 @@ static DisasJumpType op_bcth(DisasContext *s, DisasOps *o) static DisasJumpType op_bct64(DisasContext *s, DisasOps *o) { int r1 = get_field(s, r1); - bool is_imm = have_field(s, i2); - int imm = is_imm ? get_field(s, i2) : 0; DisasCompare c; + bool is_imm; + int imm; c.cond = TCG_COND_NE; c.is_64 = true; @@ -1622,6 +1657,7 @@ static DisasJumpType op_bct64(DisasContext *s, DisasOps *o) c.u.s64.a = regs[r1]; c.u.s64.b = tcg_constant_i64(0); + disas_jdest(s, i2, is_imm, imm, o->in2); return help_branch(s, &c, is_imm, imm, o->in2); } @@ -1629,10 +1665,10 @@ static DisasJumpType op_bx32(DisasContext *s, DisasOps *o) { int r1 = get_field(s, r1); int r3 = get_field(s, r3); - bool is_imm = have_field(s, i2); - int imm = is_imm ? get_field(s, i2) : 0; DisasCompare c; + bool is_imm; TCGv_i64 t; + int imm; c.cond = (s->insn->data ? TCG_COND_LE : TCG_COND_GT); c.is_64 = false; @@ -1645,6 +1681,7 @@ static DisasJumpType op_bx32(DisasContext *s, DisasOps *o) tcg_gen_extrl_i64_i32(c.u.s32.b, regs[r3 | 1]); store_reg32_i64(r1, t); + disas_jdest(s, i2, is_imm, imm, o->in2); return help_branch(s, &c, is_imm, imm, o->in2); } @@ -1652,9 +1689,9 @@ static DisasJumpType op_bx64(DisasContext *s, DisasOps *o) { int r1 = get_field(s, r1); int r3 = get_field(s, r3); - bool is_imm = have_field(s, i2); - int imm = is_imm ? get_field(s, i2) : 0; DisasCompare c; + bool is_imm; + int imm; c.cond = (s->insn->data ? TCG_COND_LE : TCG_COND_GT); c.is_64 = true; @@ -1668,6 +1705,7 @@ static DisasJumpType op_bx64(DisasContext *s, DisasOps *o) tcg_gen_add_i64(regs[r1], regs[r1], regs[r3]); c.u.s64.a = regs[r1]; + disas_jdest(s, i2, is_imm, imm, o->in2); return help_branch(s, &c, is_imm, imm, o->in2); } @@ -1685,10 +1723,9 @@ static DisasJumpType op_cj(DisasContext *s, DisasOps *o) c.u.s64.a = o->in1; c.u.s64.b = o->in2; - is_imm = have_field(s, i4); - if (is_imm) { - imm = get_field(s, i4); - } else { + o->out = NULL; + disas_jdest(s, i4, is_imm, imm, o->out); + if (!is_imm && !o->out) { imm = 0; o->out = get_address(s, 0, get_field(s, b4), get_field(s, d4)); @@ -5774,15 +5811,13 @@ static void in2_a2(DisasContext *s, DisasOps *o) static TCGv gen_ri2(DisasContext *s) { - int64_t delta = (int64_t)get_field(s, i2) * 2; - TCGv ri2; + TCGv ri2 = NULL; + bool is_imm; + int imm; - if (unlikely(s->ex_value)) { - ri2 = tcg_temp_new_i64(); - tcg_gen_ld_i64(ri2, cpu_env, offsetof(CPUS390XState, ex_target)); - tcg_gen_addi_i64(ri2, ri2, delta); - } else { - ri2 = tcg_constant_i64(s->base.pc_next + delta); + disas_jdest(s, i2, is_imm, imm, ri2); + if (is_imm) { + ri2 = tcg_constant_i64(s->base.pc_next + imm * 2); } return ri2; From e347aa89dd516db439c1f0b7b525eda5fb4507f9 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 11 May 2023 15:47:26 +0200 Subject: [PATCH 35/76] s390x/tcg: Fix LDER instruction format It's RRE, not RXE. Found by running valgrind's none/tests/s390x/bfp-2. Fixes: 86b59624c4aa ("s390x/tcg: Implement LOAD LENGTHENED short HFP to long HFP") Reviewed-by: David Hildenbrand Signed-off-by: Ilya Leoshkevich Message-Id: <20230511134726.469651-1-iii@linux.ibm.com> Reviewed-by: Richard Henderson Signed-off-by: Thomas Huth (cherry picked from commit 970641de01908dd09b569965e78f13842e5854bc) Signed-off-by: Michael Tokarev --- target/s390x/tcg/insn-data.h.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/s390x/tcg/insn-data.h.inc b/target/s390x/tcg/insn-data.h.inc index 597d968b0e..1f1ac742a9 100644 --- a/target/s390x/tcg/insn-data.h.inc +++ b/target/s390x/tcg/insn-data.h.inc @@ -606,7 +606,7 @@ F(0xed04, LDEB, RXE, Z, 0, m2_32u, new, f1, ldeb, 0, IF_BFP) F(0xed05, LXDB, RXE, Z, 0, m2_64, new_x, x1, lxdb, 0, IF_BFP) F(0xed06, LXEB, RXE, Z, 0, m2_32u, new_x, x1, lxeb, 0, IF_BFP) - F(0xb324, LDER, RXE, Z, 0, e2, new, f1, lde, 0, IF_AFP1) + F(0xb324, LDER, RRE, Z, 0, e2, new, f1, lde, 0, IF_AFP1) F(0xed24, LDE, RXE, Z, 0, m2_32u, new, f1, lde, 0, IF_AFP1) /* LOAD ROUNDED */ F(0xb344, LEDBR, RRF_e, Z, 0, f2, new, e1, ledb, 0, IF_BFP) From 117f33c9a76791f436624ef21c2562c928b4c86f Mon Sep 17 00:00:00 2001 From: Jason Andryuk Date: Tue, 2 May 2023 10:37:22 -0400 Subject: [PATCH 36/76] 9pfs/xen: Fix segfault on shutdown xen_9pfs_free can't use gnttabdev since it is already closed and NULL-ed out when free is called. Do the teardown in _disconnect(). This matches the setup done in _connect(). trace-events are also added for the XenDevOps functions. Signed-off-by: Jason Andryuk Reviewed-by: Stefano Stabellini Message-Id: <20230502143722.15613-1-jandryuk@gmail.com> [C.S.: - Remove redundant return in xen_9pfs_free(). - Add comment to trace-events. ] Signed-off-by: Christian Schoenebeck (cherry picked from commit 92e667f6fd5806a6a705a2a43e572bd9ec6819da) Signed-off-by: Michael Tokarev --- hw/9pfs/trace-events | 6 ++++++ hw/9pfs/xen-9p-backend.c | 35 ++++++++++++++++++++++------------- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/hw/9pfs/trace-events b/hw/9pfs/trace-events index 6c77966c0b..a12e55c165 100644 --- a/hw/9pfs/trace-events +++ b/hw/9pfs/trace-events @@ -48,3 +48,9 @@ v9fs_readlink(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d" v9fs_readlink_return(uint16_t tag, uint8_t id, char* target) "tag %d id %d name %s" v9fs_setattr(uint16_t tag, uint8_t id, int32_t fid, int32_t valid, int32_t mode, int32_t uid, int32_t gid, int64_t size, int64_t atime_sec, int64_t mtime_sec) "tag %u id %u fid %d iattr={valid %d mode %d uid %d gid %d size %"PRId64" atime=%"PRId64" mtime=%"PRId64" }" v9fs_setattr_return(uint16_t tag, uint8_t id) "tag %u id %u" + +# xen-9p-backend.c +xen_9pfs_alloc(char *name) "name %s" +xen_9pfs_connect(char *name) "name %s" +xen_9pfs_disconnect(char *name) "name %s" +xen_9pfs_free(char *name) "name %s" diff --git a/hw/9pfs/xen-9p-backend.c b/hw/9pfs/xen-9p-backend.c index 74f3a05f88..382be1ab11 100644 --- a/hw/9pfs/xen-9p-backend.c +++ b/hw/9pfs/xen-9p-backend.c @@ -25,6 +25,8 @@ #include "qemu/iov.h" #include "fsdev/qemu-fsdev.h" +#include "trace.h" + #define VERSIONS "1" #define MAX_RINGS 8 #define MAX_RING_ORDER 9 @@ -336,6 +338,8 @@ static void xen_9pfs_disconnect(struct XenLegacyDevice *xendev) Xen9pfsDev *xen_9pdev = container_of(xendev, Xen9pfsDev, xendev); int i; + trace_xen_9pfs_disconnect(xendev->name); + for (i = 0; i < xen_9pdev->num_rings; i++) { if (xen_9pdev->rings[i].evtchndev != NULL) { qemu_set_fd_handler(qemu_xen_evtchn_fd(xen_9pdev->rings[i].evtchndev), @@ -344,40 +348,41 @@ static void xen_9pfs_disconnect(struct XenLegacyDevice *xendev) xen_9pdev->rings[i].local_port); xen_9pdev->rings[i].evtchndev = NULL; } - } -} - -static int xen_9pfs_free(struct XenLegacyDevice *xendev) -{ - Xen9pfsDev *xen_9pdev = container_of(xendev, Xen9pfsDev, xendev); - int i; - - if (xen_9pdev->rings[0].evtchndev != NULL) { - xen_9pfs_disconnect(xendev); - } - - for (i = 0; i < xen_9pdev->num_rings; i++) { if (xen_9pdev->rings[i].data != NULL) { xen_be_unmap_grant_refs(&xen_9pdev->xendev, xen_9pdev->rings[i].data, xen_9pdev->rings[i].intf->ref, (1 << xen_9pdev->rings[i].ring_order)); + xen_9pdev->rings[i].data = NULL; } if (xen_9pdev->rings[i].intf != NULL) { xen_be_unmap_grant_ref(&xen_9pdev->xendev, xen_9pdev->rings[i].intf, xen_9pdev->rings[i].ref); + xen_9pdev->rings[i].intf = NULL; } if (xen_9pdev->rings[i].bh != NULL) { qemu_bh_delete(xen_9pdev->rings[i].bh); + xen_9pdev->rings[i].bh = NULL; } } g_free(xen_9pdev->id); + xen_9pdev->id = NULL; g_free(xen_9pdev->tag); + xen_9pdev->tag = NULL; g_free(xen_9pdev->path); + xen_9pdev->path = NULL; g_free(xen_9pdev->security_model); + xen_9pdev->security_model = NULL; g_free(xen_9pdev->rings); + xen_9pdev->rings = NULL; +} + +static int xen_9pfs_free(struct XenLegacyDevice *xendev) +{ + trace_xen_9pfs_free(xendev->name); + return 0; } @@ -389,6 +394,8 @@ static int xen_9pfs_connect(struct XenLegacyDevice *xendev) V9fsState *s = &xen_9pdev->state; QemuOpts *fsdev; + trace_xen_9pfs_connect(xendev->name); + if (xenstore_read_fe_int(&xen_9pdev->xendev, "num-rings", &xen_9pdev->num_rings) == -1 || xen_9pdev->num_rings > MAX_RINGS || xen_9pdev->num_rings < 1) { @@ -496,6 +503,8 @@ out: static void xen_9pfs_alloc(struct XenLegacyDevice *xendev) { + trace_xen_9pfs_alloc(xendev->name); + xenstore_write_be_str(xendev, "versions", VERSIONS); xenstore_write_be_int(xendev, "max-rings", MAX_RINGS); xenstore_write_be_int(xendev, "max-ring-page-order", MAX_RING_ORDER); From 36cd9bc8e2817dd5ae9cc7c427ec3a822eb1169d Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 12 May 2023 18:12:43 +0100 Subject: [PATCH 37/76] tcg/i386: Set P_REXW in tcg_out_addi_ptr The REXW bit must be set to produce a 64-bit pointer result; the bit is disabled in 32-bit mode, so we can do this unconditionally. Fixes: 7d9e1ee424b0 ("tcg/i386: Adjust assert in tcg_out_addi_ptr") Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1592 Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1642 Signed-off-by: Richard Henderson (cherry picked from commit 988998503bc6d8c03fbea001a0513e8372fddf28) Signed-off-by: Michael Tokarev --- tcg/i386/tcg-target.c.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc index 5a151fe64a..5c7c180799 100644 --- a/tcg/i386/tcg-target.c.inc +++ b/tcg/i386/tcg-target.c.inc @@ -1083,7 +1083,7 @@ static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs, { /* This function is only used for passing structs by reference. */ tcg_debug_assert(imm == (int32_t)imm); - tcg_out_modrm_offset(s, OPC_LEA, rd, rs, imm); + tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm); } static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val) From 21b54a683d14c0c6f9af35536d9059c60b7449ca Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 10 May 2023 12:55:31 +0200 Subject: [PATCH 38/76] s390x/pv: Fix spurious warning with asynchronous teardown Kernel commit 292a7d6fca33 ("KVM: s390: pv: fix asynchronous teardown for small VMs") causes the KVM_PV_ASYNC_CLEANUP_PREPARE ioctl to fail if the VM is not larger than 2GiB. QEMU would attempt it and fail, print an error message, and then proceed with a normal teardown. Avoid attempting to use asynchronous teardown altogether when the VM is not larger than 2 GiB. This will avoid triggering the error message and also avoid pointless overhead; normal teardown is fast enough for small VMs. Reported-by: Marc Hartmayer Fixes: c3a073c610 ("s390x/pv: Add support for asynchronous teardown for reboot") Link: https://lore.kernel.org/all/20230421085036.52511-2-imbrenda@linux.ibm.com/ Signed-off-by: Claudio Imbrenda Message-Id: <20230510105531.30623-2-imbrenda@linux.ibm.com> Reviewed-by: Thomas Huth [thuth: Fix inline function parameter in pv.h] Signed-off-by: Thomas Huth (cherry picked from commit 88693ab2a53f2f3d25cb39a7b5034ab391bc5a81) Signed-off-by: Michael Tokarev --- hw/s390x/pv.c | 10 ++++++++-- hw/s390x/s390-virtio-ccw.c | 2 +- include/hw/s390x/pv.h | 6 +++--- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c index 49ea38236c..b63f3784c6 100644 --- a/hw/s390x/pv.c +++ b/hw/s390x/pv.c @@ -13,6 +13,7 @@ #include +#include "qemu/units.h" #include "qapi/error.h" #include "qemu/error-report.h" #include "sysemu/kvm.h" @@ -115,7 +116,7 @@ static void *s390_pv_do_unprot_async_fn(void *p) return NULL; } -bool s390_pv_vm_try_disable_async(void) +bool s390_pv_vm_try_disable_async(S390CcwMachineState *ms) { /* * t is only needed to create the thread; once qemu_thread_create @@ -123,7 +124,12 @@ bool s390_pv_vm_try_disable_async(void) */ QemuThread t; - if (!kvm_check_extension(kvm_state, KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) { + /* + * If the feature is not present or if the VM is not larger than 2 GiB, + * KVM_PV_ASYNC_CLEANUP_PREPARE fill fail; no point in attempting it. + */ + if ((MACHINE(ms)->maxram_size <= 2 * GiB) || + !kvm_check_extension(kvm_state, KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) { return false; } if (s390_pv_cmd(KVM_PV_ASYNC_CLEANUP_PREPARE, NULL) != 0) { diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index 503f212a31..0daf445d60 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -330,7 +330,7 @@ static inline void s390_do_cpu_ipl(CPUState *cs, run_on_cpu_data arg) static void s390_machine_unprotect(S390CcwMachineState *ms) { - if (!s390_pv_vm_try_disable_async()) { + if (!s390_pv_vm_try_disable_async(ms)) { s390_pv_vm_disable(); } ms->pv = false; diff --git a/include/hw/s390x/pv.h b/include/hw/s390x/pv.h index 966306a9db..7b935e2246 100644 --- a/include/hw/s390x/pv.h +++ b/include/hw/s390x/pv.h @@ -14,10 +14,10 @@ #include "qapi/error.h" #include "sysemu/kvm.h" +#include "hw/s390x/s390-virtio-ccw.h" #ifdef CONFIG_KVM #include "cpu.h" -#include "hw/s390x/s390-virtio-ccw.h" static inline bool s390_is_pv(void) { @@ -41,7 +41,7 @@ static inline bool s390_is_pv(void) int s390_pv_query_info(void); int s390_pv_vm_enable(void); void s390_pv_vm_disable(void); -bool s390_pv_vm_try_disable_async(void); +bool s390_pv_vm_try_disable_async(S390CcwMachineState *ms); int s390_pv_set_sec_parms(uint64_t origin, uint64_t length); int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak); void s390_pv_prep_reset(void); @@ -61,7 +61,7 @@ static inline bool s390_is_pv(void) { return false; } static inline int s390_pv_query_info(void) { return 0; } static inline int s390_pv_vm_enable(void) { return 0; } static inline void s390_pv_vm_disable(void) {} -static inline bool s390_pv_vm_try_disable_async(void) { return false; } +static inline bool s390_pv_vm_try_disable_async(S390CcwMachineState *ms) { return false; } static inline int s390_pv_set_sec_parms(uint64_t origin, uint64_t length) { return 0; } static inline int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak) { return 0; } static inline void s390_pv_prep_reset(void) {} From 8ad637881f6ddf404bd554e149450a98ba8f5446 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Fri, 5 May 2023 14:00:51 +0200 Subject: [PATCH 39/76] util/async-teardown: wire up query-command-line-options Add new -run-with option with an async-teardown=on|off parameter. It is visible in the output of query-command-line-options QMP command, so it can be discovered and used by libvirt. The option -async-teardown is now redundant, deprecate it. Reported-by: Boris Fiuczynski Fixes: c891c24b1a ("os-posix: asynchronous teardown for shutdown on Linux") Signed-off-by: Claudio Imbrenda Message-Id: <20230505120051.36605-2-imbrenda@linux.ibm.com> [thuth: Add curly braces to fix error with GCC 8.5, fix bug in deprecated.rst] Signed-off-by: Thomas Huth (cherry picked from commit 80bd81cadd127c1e2fc784612a52abe392670ba4) Signed-off-by: Michael Tokarev (Mjt: context tweak in docs/about/deprecated.rst) --- docs/about/deprecated.rst | 4 ++++ os-posix.c | 14 ++++++++++++++ qemu-options.hx | 34 +++++++++++++++++++++++----------- util/async-teardown.c | 21 +++++++++++++++++++++ 4 files changed, 62 insertions(+), 11 deletions(-) diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst index 914938fd76..2823362791 100644 --- a/docs/about/deprecated.rst +++ b/docs/about/deprecated.rst @@ -111,6 +111,10 @@ Use ``-machine acpi=off`` instead. The HAXM project has been retired (see https://github.com/intel/haxm#status). Use "whpx" (on Windows) or "hvf" (on macOS) instead. +``-async-teardown`` (since 8.1) +''''''''''''''''''''''''''''''' + +Use ``-run-with async-teardown=on`` instead. QEMU Machine Protocol (QMP) commands ------------------------------------ diff --git a/os-posix.c b/os-posix.c index 5adc69f560..90ea71725f 100644 --- a/os-posix.c +++ b/os-posix.c @@ -36,6 +36,8 @@ #include "qemu/log.h" #include "sysemu/runstate.h" #include "qemu/cutils.h" +#include "qemu/config-file.h" +#include "qemu/option.h" #ifdef CONFIG_LINUX #include @@ -152,9 +154,21 @@ int os_parse_cmd_args(int index, const char *optarg) daemonize = 1; break; #if defined(CONFIG_LINUX) + /* deprecated */ case QEMU_OPTION_asyncteardown: init_async_teardown(); break; + case QEMU_OPTION_run_with: { + QemuOpts *opts = qemu_opts_parse_noisily(qemu_find_opts("run-with"), + optarg, false); + if (!opts) { + exit(1); + } + if (qemu_opt_get_bool(opts, "async-teardown", false)) { + init_async_teardown(); + } + break; + } #endif default: return -1; diff --git a/qemu-options.hx b/qemu-options.hx index 4b8855a4f7..fdddfab6ff 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -4786,20 +4786,32 @@ DEF("qtest-log", HAS_ARG, QEMU_OPTION_qtest_log, "", QEMU_ARCH_ALL) DEF("async-teardown", 0, QEMU_OPTION_asyncteardown, "-async-teardown enable asynchronous teardown\n", QEMU_ARCH_ALL) -#endif SRST ``-async-teardown`` - Enable asynchronous teardown. A new process called "cleanup/" - will be created at startup sharing the address space with the main qemu - process, using clone. It will wait for the main qemu process to - terminate completely, and then exit. - This allows qemu to terminate very quickly even if the guest was - huge, leaving the teardown of the address space to the cleanup - process. Since the cleanup process shares the same cgroups as the - main qemu process, accounting is performed correctly. This only - works if the cleanup process is not forcefully killed with SIGKILL - before the main qemu process has terminated completely. + This option is deprecated and should no longer be used. The new option + ``-run-with async-teardown=on`` is a replacement. ERST +DEF("run-with", HAS_ARG, QEMU_OPTION_run_with, + "-run-with async-teardown[=on|off]\n" + " misc QEMU process lifecycle options\n" + " async-teardown=on enables asynchronous teardown\n", + QEMU_ARCH_ALL) +SRST +``-run-with`` + Set QEMU process lifecycle options. + + ``async-teardown=on`` enables asynchronous teardown. A new process called + "cleanup/" will be created at startup sharing the address + space with the main QEMU process, using clone. It will wait for the + main QEMU process to terminate completely, and then exit. This allows + QEMU to terminate very quickly even if the guest was huge, leaving the + teardown of the address space to the cleanup process. Since the cleanup + process shares the same cgroups as the main QEMU process, accounting is + performed correctly. This only works if the cleanup process is not + forcefully killed with SIGKILL before the main QEMU process has + terminated completely. +ERST +#endif DEF("msg", HAS_ARG, QEMU_OPTION_msg, "-msg [timestamp[=on|off]][,guest-name=[on|off]]\n" diff --git a/util/async-teardown.c b/util/async-teardown.c index 62cdeb0f20..3ab19c8740 100644 --- a/util/async-teardown.c +++ b/util/async-teardown.c @@ -12,6 +12,9 @@ */ #include "qemu/osdep.h" +#include "qemu/config-file.h" +#include "qemu/option.h" +#include "qemu/module.h" #include #include #include @@ -144,3 +147,21 @@ void init_async_teardown(void) clone(async_teardown_fn, new_stack_for_clone(), CLONE_VM, NULL); sigprocmask(SIG_SETMASK, &old_signals, NULL); } + +static QemuOptsList qemu_run_with_opts = { + .name = "run-with", + .head = QTAILQ_HEAD_INITIALIZER(qemu_run_with_opts.head), + .desc = { + { + .name = "async-teardown", + .type = QEMU_OPT_BOOL, + }, + { /* end of list */ } + }, +}; + +static void register_teardown(void) +{ + qemu_add_opts(&qemu_run_with_opts); +} +opts_init(register_teardown); From eb82a80f512a15d3fec3cb3ffb2e0b93e3bc5b5f Mon Sep 17 00:00:00 2001 From: Lizhi Yang Date: Thu, 11 May 2023 16:01:19 +0800 Subject: [PATCH 40/76] docs/about/emulation: fix typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Duplicated word "are". Signed-off-by: Lizhi Yang Reviewed-by: Philippe Mathieu-Daudé Message-Id: <20230511080119.99018-1-sledgeh4w@gmail.com> Signed-off-by: Thomas Huth (cherry picked from commit c70bb9a771d467302d1c7df5c5bd56b48f42716e) Signed-off-by: Michael Tokarev --- docs/about/emulation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/about/emulation.rst b/docs/about/emulation.rst index b510a54418..0ad0b86f0d 100644 --- a/docs/about/emulation.rst +++ b/docs/about/emulation.rst @@ -99,7 +99,7 @@ depending on the guest architecture. - Yes - A configurable 32 bit soft core now owned by Cadence -A number of features are are only available when running under +A number of features are only available when running under emulation including :ref:`Record/Replay` and :ref:`TCG Plugins`. .. _Semihosting: From 0b1b5a420428c24b106b97c454125ac7e346bca2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Tue, 2 May 2023 15:20:59 +0100 Subject: [PATCH 41/76] tests/docker: bump the xtensa base to debian:11-slim MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stretch is going out of support so things like security updates will fail. As the toolchain itself is binary it hopefully won't mind the underlying OS being updated. Message-Id: <20230503091244.1450613-3-alex.bennee@linaro.org> Reviewed-by: Thomas Huth Reviewed-by: Juan Quintela Signed-off-by: Alex Bennée Reported-by: Richard Henderson (cherry picked from commit 3217b84f3cd813a7daffc64b26543c313f3a042a) Signed-off-by: Michael Tokarev --- tests/docker/dockerfiles/debian-xtensa-cross.docker | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/docker/dockerfiles/debian-xtensa-cross.docker b/tests/docker/dockerfiles/debian-xtensa-cross.docker index 082b50da19..72c25d63d9 100644 --- a/tests/docker/dockerfiles/debian-xtensa-cross.docker +++ b/tests/docker/dockerfiles/debian-xtensa-cross.docker @@ -5,7 +5,7 @@ # using a prebuilt toolchains for Xtensa cores from: # https://github.com/foss-xtensa/toolchain/releases # -FROM docker.io/library/debian:stretch-slim +FROM docker.io/library/debian:11-slim RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt install -yy eatmydata && \ From 69a6ea7c4bd2696b21b1644c40370c654a324c42 Mon Sep 17 00:00:00 2001 From: Daniil Kovalev Date: Tue, 4 Apr 2023 08:21:54 +0300 Subject: [PATCH 42/76] linux-user: Fix mips fp64 executables loading If a program requires fr1, we should set the FR bit of CP0 control status register and add F64 hardware flag. The corresponding `else if` branch statement is copied from the linux kernel sources (see `arch_check_elf` function in linux/arch/mips/kernel/elf.c). Signed-off-by: Daniil Kovalev Reviewed-by: Jiaxun Yang Message-Id: <20230404052153.16617-1-dkovalev@compiler-toolchain-for.me> Signed-off-by: Laurent Vivier (cherry picked from commit a0f8d2701b205d9d7986aa555e0566b13dc18fa0) Signed-off-by: Michael Tokarev --- linux-user/mips/cpu_loop.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/linux-user/mips/cpu_loop.c b/linux-user/mips/cpu_loop.c index d5c1c7941d..8735e58bad 100644 --- a/linux-user/mips/cpu_loop.c +++ b/linux-user/mips/cpu_loop.c @@ -290,7 +290,10 @@ void target_cpu_copy_regs(CPUArchState *env, struct target_pt_regs *regs) env->CP0_Status |= (1 << CP0St_FR); env->hflags |= MIPS_HFLAG_F64; } - } else if (!prog_req.fre && !prog_req.frdefault && + } else if (prog_req.fr1) { + env->CP0_Status |= (1 << CP0St_FR); + env->hflags |= MIPS_HFLAG_F64; + } else if (!prog_req.fre && !prog_req.frdefault && !prog_req.fr1 && !prog_req.single && !prog_req.soft) { fprintf(stderr, "qemu: Can't find a matching FPU mode\n"); exit(1); From 45a67df841d0d67b466fdb31a63f57d934444b5e Mon Sep 17 00:00:00 2001 From: Michael Tokarev Date: Sun, 9 Apr 2023 13:53:27 +0300 Subject: [PATCH 43/76] linux-user: fix getgroups/setgroups allocations linux-user getgroups(), setgroups(), getgroups32() and setgroups32() used alloca() to allocate grouplist arrays, with unchecked gidsetsize coming from the "guest". With NGROUPS_MAX being 65536 (linux, and it is common for an application to allocate NGROUPS_MAX for getgroups()), this means a typical allocation is half the megabyte on the stack. Which just overflows stack, which leads to immediate SIGSEGV in actual system getgroups() implementation. An example of such issue is aptitude, eg https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=811087#72 Cap gidsetsize to NGROUPS_MAX (return EINVAL if it is larger than that), and use heap allocation for grouplist instead of alloca(). While at it, fix coding style and make all 4 implementations identical. Try to not impose random limits - for example, allow gidsetsize to be negative for getgroups() - just do not allocate negative-sized grouplist in this case but still do actual getgroups() call. But do not allow negative gidsetsize for setgroups() since its argument is unsigned. Capping by NGROUPS_MAX seems a bit arbitrary, - we can do more, it is not an error if set size will be NGROUPS_MAX+1. But we should not allow integer overflow for the array being allocated. Maybe it is enough to just call g_try_new() and return ENOMEM if it fails. Maybe there's also no need to convert setgroups() since this one is usually smaller and known beforehand (KERN_NGROUPS_MAX is actually 63, - this is apparently a kernel-imposed limit for runtime group set). The patch fixes aptitude segfault mentioned above. Signed-off-by: Michael Tokarev Message-Id: <20230409105327.1273372-1-mjt@msgid.tls.msk.ru> Signed-off-by: Laurent Vivier (cherry picked from commit 1e35d327890bdd117a67f79c52e637fb12bb1bf4) Signed-off-by: Michael Tokarev --- linux-user/syscall.c | 105 +++++++++++++++++++++++++++++-------------- 1 file changed, 71 insertions(+), 34 deletions(-) diff --git a/linux-user/syscall.c b/linux-user/syscall.c index 69f740ff98..333e6b7026 100644 --- a/linux-user/syscall.c +++ b/linux-user/syscall.c @@ -11475,39 +11475,58 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1, { int gidsetsize = arg1; target_id *target_grouplist; - gid_t *grouplist; + g_autofree gid_t *grouplist = NULL; int i; - grouplist = alloca(gidsetsize * sizeof(gid_t)); - ret = get_errno(getgroups(gidsetsize, grouplist)); - if (gidsetsize == 0) - return ret; - if (!is_error(ret)) { - target_grouplist = lock_user(VERIFY_WRITE, arg2, gidsetsize * sizeof(target_id), 0); - if (!target_grouplist) - return -TARGET_EFAULT; - for(i = 0;i < ret; i++) - target_grouplist[i] = tswapid(high2lowgid(grouplist[i])); - unlock_user(target_grouplist, arg2, gidsetsize * sizeof(target_id)); + if (gidsetsize > NGROUPS_MAX) { + return -TARGET_EINVAL; } + if (gidsetsize > 0) { + grouplist = g_try_new(gid_t, gidsetsize); + if (!grouplist) { + return -TARGET_ENOMEM; + } + } + ret = get_errno(getgroups(gidsetsize, grouplist)); + if (!is_error(ret) && gidsetsize > 0) { + target_grouplist = lock_user(VERIFY_WRITE, arg2, + gidsetsize * sizeof(target_id), 0); + if (!target_grouplist) { + return -TARGET_EFAULT; + } + for (i = 0; i < ret; i++) { + target_grouplist[i] = tswapid(high2lowgid(grouplist[i])); + } + unlock_user(target_grouplist, arg2, + gidsetsize * sizeof(target_id)); + } + return ret; } - return ret; case TARGET_NR_setgroups: { int gidsetsize = arg1; target_id *target_grouplist; - gid_t *grouplist = NULL; + g_autofree gid_t *grouplist = NULL; int i; - if (gidsetsize) { - grouplist = alloca(gidsetsize * sizeof(gid_t)); - target_grouplist = lock_user(VERIFY_READ, arg2, gidsetsize * sizeof(target_id), 1); + + if (gidsetsize > NGROUPS_MAX || gidsetsize < 0) { + return -TARGET_EINVAL; + } + if (gidsetsize > 0) { + grouplist = g_try_new(gid_t, gidsetsize); + if (!grouplist) { + return -TARGET_ENOMEM; + } + target_grouplist = lock_user(VERIFY_READ, arg2, + gidsetsize * sizeof(target_id), 1); if (!target_grouplist) { return -TARGET_EFAULT; } for (i = 0; i < gidsetsize; i++) { grouplist[i] = low2highgid(tswapid(target_grouplist[i])); } - unlock_user(target_grouplist, arg2, 0); + unlock_user(target_grouplist, arg2, + gidsetsize * sizeof(target_id)); } return get_errno(setgroups(gidsetsize, grouplist)); } @@ -11792,41 +11811,59 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1, { int gidsetsize = arg1; uint32_t *target_grouplist; - gid_t *grouplist; + g_autofree gid_t *grouplist = NULL; int i; - grouplist = alloca(gidsetsize * sizeof(gid_t)); + if (gidsetsize > NGROUPS_MAX) { + return -TARGET_EINVAL; + } + if (gidsetsize > 0) { + grouplist = g_try_new(gid_t, gidsetsize); + if (!grouplist) { + return -TARGET_ENOMEM; + } + } ret = get_errno(getgroups(gidsetsize, grouplist)); - if (gidsetsize == 0) - return ret; - if (!is_error(ret)) { - target_grouplist = lock_user(VERIFY_WRITE, arg2, gidsetsize * 4, 0); + if (!is_error(ret) && gidsetsize > 0) { + target_grouplist = lock_user(VERIFY_WRITE, arg2, + gidsetsize * 4, 0); if (!target_grouplist) { return -TARGET_EFAULT; } - for(i = 0;i < ret; i++) + for (i = 0; i < ret; i++) { target_grouplist[i] = tswap32(grouplist[i]); + } unlock_user(target_grouplist, arg2, gidsetsize * 4); } + return ret; } - return ret; #endif #ifdef TARGET_NR_setgroups32 case TARGET_NR_setgroups32: { int gidsetsize = arg1; uint32_t *target_grouplist; - gid_t *grouplist; + g_autofree gid_t *grouplist = NULL; int i; - grouplist = alloca(gidsetsize * sizeof(gid_t)); - target_grouplist = lock_user(VERIFY_READ, arg2, gidsetsize * 4, 1); - if (!target_grouplist) { - return -TARGET_EFAULT; + if (gidsetsize > NGROUPS_MAX || gidsetsize < 0) { + return -TARGET_EINVAL; + } + if (gidsetsize > 0) { + grouplist = g_try_new(gid_t, gidsetsize); + if (!grouplist) { + return -TARGET_ENOMEM; + } + target_grouplist = lock_user(VERIFY_READ, arg2, + gidsetsize * 4, 1); + if (!target_grouplist) { + return -TARGET_EFAULT; + } + for (i = 0; i < gidsetsize; i++) { + grouplist[i] = tswap32(target_grouplist[i]); + } + unlock_user(target_grouplist, arg2, 0); } - for(i = 0;i < gidsetsize; i++) - grouplist[i] = tswap32(target_grouplist[i]); - unlock_user(target_grouplist, arg2, 0); return get_errno(setgroups(gidsetsize, grouplist)); } #endif From cb898262a455a65e55c3b2821b26b7e93929c74d Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 14 Apr 2023 10:33:58 -0500 Subject: [PATCH 44/76] migration: Handle block device inactivation failures better Consider what happens when performing a migration between two host machines connected to an NFS server serving multiple block devices to the guest, when the NFS server becomes unavailable. The migration attempts to inactivate all block devices on the source (a necessary step before the destination can take over); but if the NFS server is non-responsive, the attempt to inactivate can itself fail. When that happens, the destination fails to get the migrated guest (good, because the source wasn't able to flush everything properly): (qemu) qemu-kvm: load of migration failed: Input/output error at which point, our only hope for the guest is for the source to take back control. With the current code base, the host outputs a message, but then appears to resume: (qemu) qemu-kvm: qemu_savevm_state_complete_precopy_non_iterable: bdrv_inactivate_all() failed (-1) (src qemu)info status VM status: running but a second migration attempt now asserts: (src qemu) qemu-kvm: ../block.c:6738: int bdrv_inactivate_recurse(BlockDriverState *): Assertion `!(bs->open_flags & BDRV_O_INACTIVE)' failed. Whether the guest is recoverable on the source after the first failure is debatable, but what we do not want is to have qemu itself fail due to an assertion. It looks like the problem is as follows: In migration.c:migration_completion(), the source sets 'inactivate' to true (since COLO is not enabled), then tries savevm.c:qemu_savevm_state_complete_precopy() with a request to inactivate block devices. In turn, this calls block.c:bdrv_inactivate_all(), which fails when flushing runs up against the non-responsive NFS server. With savevm failing, we are now left in a state where some, but not all, of the block devices have been inactivated; but migration_completion() then jumps to 'fail' rather than 'fail_invalidate' and skips an attempt to reclaim those those disks by calling bdrv_activate_all(). Even if we do attempt to reclaim disks, we aren't taking note of failure there, either. Thus, we have reached a state where the migration engine has forgotten all state about whether a block device is inactive, because we did not set s->block_inactive in enough places; so migration allows the source to reach vm_start() and resume execution, violating the block layer invariant that the guest CPUs should not be restarted while a device is inactive. Note that the code in migration.c:migrate_fd_cancel() will also try to reactivate all block devices if s->block_inactive was set, but because we failed to set that flag after the first failure, the source assumes it has reclaimed all devices, even though it still has remaining inactivated devices and does not try again. Normally, qmp_cont() will also try to reactivate all disks (or correctly fail if the disks are not reclaimable because NFS is not yet back up), but the auto-resumption of the source after a migration failure does not go through qmp_cont(). And because we have left the block layer in an inconsistent state with devices still inactivated, the later migration attempt is hitting the assertion failure. Since it is important to not resume the source with inactive disks, this patch marks s->block_inactive before attempting inactivation, rather than after succeeding, in order to prevent any vm_start() until it has successfully reactivated all devices. See also https://bugzilla.redhat.com/show_bug.cgi?id=2058982 Signed-off-by: Eric Blake Reviewed-by: Juan Quintela Acked-by: Lukas Straub Tested-by: Lukas Straub Signed-off-by: Juan Quintela (cherry picked from commit 403d18ae384239876764bbfa111d6cc5dcb673d1) Signed-off-by: Michael Tokarev --- migration/migration.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index bda4789193..cb0d42c061 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3444,13 +3444,11 @@ static void migration_completion(MigrationState *s) MIGRATION_STATUS_DEVICE); } if (ret >= 0) { + s->block_inactive = inactivate; qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX); ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false, inactivate); } - if (inactivate && ret >= 0) { - s->block_inactive = true; - } } qemu_mutex_unlock_iothread(); @@ -3522,6 +3520,7 @@ fail_invalidate: bdrv_activate_all(&local_err); if (local_err) { error_report_err(local_err); + s->block_inactive = true; } else { s->block_inactive = false; } From d2a811dd7d95e3c35271edad066504d76e83dea4 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Thu, 20 Apr 2023 09:35:51 -0500 Subject: [PATCH 45/76] migration: Minor control flow simplification No need to declare a temporary variable. Suggested-by: Juan Quintela Fixes: 1df36e8c6289 ("migration: Handle block device inactivation failures better") Signed-off-by: Eric Blake Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela (cherry picked from commit 5d39f44d7ac5c63f53d4d0900ceba9521bc27e49) Signed-off-by: Michael Tokarev --- migration/migration.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index cb0d42c061..08007cef4e 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3436,7 +3436,6 @@ static void migration_completion(MigrationState *s) ret = global_state_store(); if (!ret) { - bool inactivate = !migrate_colo_enabled(); ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); trace_migration_completion_vm_stop(ret); if (ret >= 0) { @@ -3444,10 +3443,10 @@ static void migration_completion(MigrationState *s) MIGRATION_STATUS_DEVICE); } if (ret >= 0) { - s->block_inactive = inactivate; + s->block_inactive = !migrate_colo_enabled(); qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX); ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false, - inactivate); + s->block_inactive); } } qemu_mutex_unlock_iothread(); From c0ad2a91914751819ddbf49d19043f9cbcbe2651 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 2 May 2023 15:52:12 -0500 Subject: [PATCH 46/76] migration: Attempt disk reactivation in more failure scenarios Commit fe904ea824 added a fail_inactivate label, which tries to reactivate disks on the source after a failure while s->state == MIGRATION_STATUS_ACTIVE, but didn't actually use the label if qemu_savevm_state_complete_precopy() failed. This failure to reactivate is also present in commit 6039dd5b1c (also covering the new s->state == MIGRATION_STATUS_DEVICE state) and 403d18ae (ensuring s->block_inactive is set more reliably). Consolidate the two labels back into one - no matter HOW migration is failed, if there is any chance we can reach vm_start() after having attempted inactivation, it is essential that we have tried to restart disks before then. This also makes the cleanup more like migrate_fd_cancel(). Suggested-by: Kevin Wolf Signed-off-by: Eric Blake Message-Id: <20230502205212.134680-1-eblake@redhat.com> Acked-by: Peter Xu Reviewed-by: Juan Quintela Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf (cherry picked from commit 6dab4c93ecfae48e2e67b984d1032c1e988d3005) Signed-off-by: Michael Tokarev (Mjt: minor context tweak near added comment in migration/migration.c) --- migration/migration.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 08007cef4e..99f86bd6c2 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3443,6 +3443,11 @@ static void migration_completion(MigrationState *s) MIGRATION_STATUS_DEVICE); } if (ret >= 0) { + /* + * Inactivate disks except in COLO, and track that we + * have done so in order to remember to reactivate + * them if migration fails or is cancelled. + */ s->block_inactive = !migrate_colo_enabled(); qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX); ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false, @@ -3487,13 +3492,13 @@ static void migration_completion(MigrationState *s) rp_error = await_return_path_close_on_source(s); trace_migration_return_path_end_after(rp_error); if (rp_error) { - goto fail_invalidate; + goto fail; } } if (qemu_file_get_error(s->to_dst_file)) { trace_migration_completion_file_err(); - goto fail_invalidate; + goto fail; } if (migrate_colo_enabled() && s->state == MIGRATION_STATUS_ACTIVE) { @@ -3507,26 +3512,25 @@ static void migration_completion(MigrationState *s) return; -fail_invalidate: - /* If not doing postcopy, vm_start() will be called: let's regain - * control on images. - */ - if (s->state == MIGRATION_STATUS_ACTIVE || - s->state == MIGRATION_STATUS_DEVICE) { +fail: + if (s->block_inactive && (s->state == MIGRATION_STATUS_ACTIVE || + s->state == MIGRATION_STATUS_DEVICE)) { + /* + * If not doing postcopy, vm_start() will be called: let's + * regain control on images. + */ Error *local_err = NULL; qemu_mutex_lock_iothread(); bdrv_activate_all(&local_err); if (local_err) { error_report_err(local_err); - s->block_inactive = true; } else { s->block_inactive = false; } qemu_mutex_unlock_iothread(); } -fail: migrate_set_state(&s->state, current_active_state, MIGRATION_STATUS_FAILED); } From c283a4bc7621b37fc62fd0eac1932796d2de84ab Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Thu, 18 May 2023 10:31:43 +0100 Subject: [PATCH 47/76] target/arm: Fix vd == vm overlap in sve_ldff1_z If vd == vm, copy vm to scratch, so that we can pre-zero the output and still access the gather indicies. Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1612 Signed-off-by: Richard Henderson Message-id: 20230504104232.1877774-1-richard.henderson@linaro.org Reviewed-by: Peter Maydell Signed-off-by: Peter Maydell (cherry picked from commit a6771f2f5cbfbf312e2fb5b1627f38a6bf6321d0) Signed-off-by: Michael Tokarev --- target/arm/tcg/sve_helper.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/target/arm/tcg/sve_helper.c b/target/arm/tcg/sve_helper.c index ccf5e5beca..0097522470 100644 --- a/target/arm/tcg/sve_helper.c +++ b/target/arm/tcg/sve_helper.c @@ -6727,6 +6727,7 @@ void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, intptr_t reg_off; SVEHostPage info; target_ulong addr, in_page; + ARMVectorReg scratch; /* Skip to the first true predicate. */ reg_off = find_next_active(vg, 0, reg_max, esz); @@ -6736,6 +6737,11 @@ void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, return; } + /* Protect against overlap between vd and vm. */ + if (unlikely(vd == vm)) { + vm = memcpy(&scratch, vm, reg_max); + } + /* * Probe the first element, allowing faults. */ From 1e029102e6b6b176d692944d9df94c6b46185912 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 10 May 2023 18:15:25 +0200 Subject: [PATCH 48/76] scsi-generic: fix buffer overflow on block limits inquiry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using linux 6.x guest, at boot time, an inquiry on a scsi-generic device makes qemu crash. This is caused by a buffer overflow when scsi-generic patches the block limits VPD page. Do the operations on a temporary on-stack buffer that is guaranteed to be large enough. Reported-by: Théo Maillart Analyzed-by: Théo Maillart Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini (cherry picked from commit 9bd634b2f5e2f10fe35d7609eb83f30583f2e15a) Signed-off-by: Michael Tokarev --- hw/scsi/scsi-generic.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c index ac9fa662b4..2417f0ad84 100644 --- a/hw/scsi/scsi-generic.c +++ b/hw/scsi/scsi-generic.c @@ -191,12 +191,16 @@ static int scsi_handle_inquiry_reply(SCSIGenericReq *r, SCSIDevice *s, int len) if ((s->type == TYPE_DISK || s->type == TYPE_ZBC) && (r->req.cmd.buf[1] & 0x01)) { page = r->req.cmd.buf[2]; - if (page == 0xb0) { + if (page == 0xb0 && r->buflen >= 8) { + uint8_t buf[16] = {}; + uint8_t buf_used = MIN(r->buflen, 16); uint64_t max_transfer = calculate_max_transfer(s); - stl_be_p(&r->buf[8], max_transfer); - /* Also take care of the opt xfer len. */ - stl_be_p(&r->buf[12], - MIN_NON_ZERO(max_transfer, ldl_be_p(&r->buf[12]))); + + memcpy(buf, r->buf, buf_used); + stl_be_p(&buf[8], max_transfer); + stl_be_p(&buf[12], MIN_NON_ZERO(max_transfer, ldl_be_p(&buf[12]))); + memcpy(r->buf + 8, buf + 8, buf_used - 8); + } else if (s->needs_vpd_bl_emulation && page == 0x00 && r->buflen >= 4) { /* * Now we're capable of supplying the VPD Block Limits From db8051ad59f416a91ad4121d3d11f52e2879c429 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 9 May 2023 16:17:15 +0200 Subject: [PATCH 49/76] target/i386: fix operand size for VCOMI/VUCOMI instructions Compared to other SSE instructions, VUCOMISx and VCOMISx are different: the single and double precision versions are distinguished through a prefix, however they use no-prefix and 0x66 for SS and SD respectively. Scalar values usually are associated with 0xF2 and 0xF3. Because of these, they incorrectly perform a 128-bit memory load instead of a 32- or 64-bit load. Fix this by writing a custom decoding function. I tested that the reproducer is fixed and the test-avx output does not change. Reported-by: Gabriele Svelto Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1637 Fixes: f8d19eec0d53 ("target/i386: reimplement 0x0f 0x28-0x2f, add AVX", 2022-10-18) Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini (cherry picked from commit 2b55e479e6fcbb466585fd25077a50c32e10dc3a) Signed-off-by: Michael Tokarev --- target/i386/tcg/decode-new.c.inc | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc index 4fdd87750b..48fefaffdf 100644 --- a/target/i386/tcg/decode-new.c.inc +++ b/target/i386/tcg/decode-new.c.inc @@ -783,6 +783,17 @@ static void decode_0F2D(DisasContext *s, CPUX86State *env, X86OpEntry *entry, ui *entry = *decode_by_prefix(s, opcodes_0F2D); } +static void decode_VxCOMISx(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + /* + * VUCOMISx and VCOMISx are different and use no-prefix and 0x66 for SS and SD + * respectively. Scalar values usually are associated with 0xF2 and 0xF3, for + * which X86_VEX_REPScalar exists, but here it has to be decoded by hand. + */ + entry->s1 = entry->s2 = (s->prefix & PREFIX_DATA ? X86_SIZE_sd : X86_SIZE_ss); + entry->gen = (*b == 0x2E ? gen_VUCOMI : gen_VCOMI); +} + static void decode_sse_unary(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) { if (!(s->prefix & (PREFIX_REPZ | PREFIX_REPNZ))) { @@ -871,8 +882,8 @@ static const X86OpEntry opcodes_0F[256] = { [0x2B] = X86_OP_GROUP0(0F2B), [0x2C] = X86_OP_GROUP0(0F2C), [0x2D] = X86_OP_GROUP0(0F2D), - [0x2E] = X86_OP_ENTRY3(VUCOMI, None,None, V,x, W,x, vex4 p_00_66), - [0x2F] = X86_OP_ENTRY3(VCOMI, None,None, V,x, W,x, vex4 p_00_66), + [0x2E] = X86_OP_GROUP3(VxCOMISx, None,None, V,x, W,x, vex3 p_00_66), /* VUCOMISS/SD */ + [0x2F] = X86_OP_GROUP3(VxCOMISx, None,None, V,x, W,x, vex3 p_00_66), /* VCOMISS/SD */ [0x38] = X86_OP_GROUP0(0F38), [0x3a] = X86_OP_GROUP0(0F3A), From 0de51178193afc8bf614cbb2af2d69508408bbcb Mon Sep 17 00:00:00 2001 From: Xinyu Li Date: Wed, 10 May 2023 22:52:22 +0800 Subject: [PATCH 50/76] target/i386: fix avx2 instructions vzeroall and vpermdq vzeroall: xmm_regs should be used instead of xmm_t0 vpermdq: bit 3 and 7 of imm should be considered Signed-off-by: Xinyu Li Message-Id: <20230510145222.586487-1-lixinyu20s@ict.ac.cn> Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini (cherry picked from commit 056d649007bc9fdae9f1d576e77c1316e9a34468) Signed-off-by: Michael Tokarev --- target/i386/ops_sse.h | 8 ++++++++ target/i386/tcg/emit.c.inc | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 0bd6bfad8a..fb63af7afa 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -2497,6 +2497,14 @@ void helper_vpermdq_ymm(Reg *d, Reg *v, Reg *s, uint32_t order) d->Q(1) = r1; d->Q(2) = r2; d->Q(3) = r3; + if (order & 0x8) { + d->Q(0) = 0; + d->Q(1) = 0; + } + if (order & 0x80) { + d->Q(2) = 0; + d->Q(3) = 0; + } } void helper_vpermq_ymm(Reg *d, Reg *s, uint32_t order) diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc index 95fb4f52fa..4fe8dec427 100644 --- a/target/i386/tcg/emit.c.inc +++ b/target/i386/tcg/emit.c.inc @@ -2285,7 +2285,7 @@ static void gen_VZEROALL(DisasContext *s, CPUX86State *env, X86DecodedInsn *deco { TCGv_ptr ptr = tcg_temp_new_ptr(); - tcg_gen_addi_ptr(ptr, cpu_env, offsetof(CPUX86State, xmm_t0)); + tcg_gen_addi_ptr(ptr, cpu_env, offsetof(CPUX86State, xmm_regs)); gen_helper_memset(ptr, ptr, tcg_constant_i32(0), tcg_constant_ptr(CPU_NB_REGS * sizeof(ZMMReg))); } From a9144eed6c431ed760247388acff2b601f3721d9 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Tue, 9 May 2023 16:48:17 +0800 Subject: [PATCH 51/76] vhost: fix possible wrap in SVQ descriptor ring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QEMU invokes vhost_svq_add() when adding a guest's element into SVQ. In vhost_svq_add(), it uses vhost_svq_available_slots() to check whether QEMU can add the element into SVQ. If there is enough space, then QEMU combines some out descriptors and some in descriptors into one descriptor chain, and adds it into `svq->vring.desc` by vhost_svq_vring_write_descs(). Yet the problem is that, `svq->shadow_avail_idx - svq->shadow_used_idx` in vhost_svq_available_slots() returns the number of occupied elements, or the number of descriptor chains, instead of the number of occupied descriptors, which may cause wrapping in SVQ descriptor ring. Here is an example. In vhost_handle_guest_kick(), QEMU forwards as many available buffers to device by virtqueue_pop() and vhost_svq_add_element(). virtqueue_pop() returns a guest's element, and then this element is added into SVQ by vhost_svq_add_element(), a wrapper to vhost_svq_add(). If QEMU invokes virtqueue_pop() and vhost_svq_add_element() `svq->vring.num` times, vhost_svq_available_slots() thinks QEMU just ran out of slots and everything should work fine. But in fact, virtqueue_pop() returns `svq->vring.num` elements or descriptor chains, more than `svq->vring.num` descriptors due to guest memory fragmentation, and this causes wrapping in SVQ descriptor ring. This bug is valid even before marking the descriptors used. If the guest memory is fragmented, SVQ must add chains so it can try to add more descriptors than possible. This patch solves it by adding `num_free` field in VhostShadowVirtqueue structure and updating this field in vhost_svq_add() and vhost_svq_get_buf(), to record the number of free descriptors. Fixes: 100890f7ca ("vhost: Shadow virtqueue buffers forwarding") Signed-off-by: Hawkins Jiawei Acked-by: Eugenio Pérez Message-Id: <20230509084817.3973-1-yin31149@gmail.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Tested-by: Lei Yang (cherry picked from commit 5d410557dea452f6231a7c66155e29a37e168528) Signed-off-by: Michael Tokarev --- hw/virtio/vhost-shadow-virtqueue.c | 5 ++++- hw/virtio/vhost-shadow-virtqueue.h | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index 8361e70d1b..bd7c12b6d3 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -68,7 +68,7 @@ bool vhost_svq_valid_features(uint64_t features, Error **errp) */ static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq) { - return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx); + return svq->num_free; } /** @@ -263,6 +263,7 @@ int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, return -EINVAL; } + svq->num_free -= ndescs; svq->desc_state[qemu_head].elem = elem; svq->desc_state[qemu_head].ndescs = ndescs; vhost_svq_kick(svq); @@ -449,6 +450,7 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id); svq->desc_next[last_used_chain] = svq->free_head; svq->free_head = used_elem.id; + svq->num_free += num; *len = used_elem.len; return g_steal_pointer(&svq->desc_state[used_elem.id].elem); @@ -659,6 +661,7 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, svq->iova_tree = iova_tree; svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq)); + svq->num_free = svq->vring.num; driver_size = vhost_svq_driver_area_size(svq); device_size = vhost_svq_device_area_size(svq); svq->vring.desc = qemu_memalign(qemu_real_host_page_size(), driver_size); diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h index 926a4897b1..6efe051a70 100644 --- a/hw/virtio/vhost-shadow-virtqueue.h +++ b/hw/virtio/vhost-shadow-virtqueue.h @@ -107,6 +107,9 @@ typedef struct VhostShadowVirtqueue { /* Next head to consume from the device */ uint16_t last_used_idx; + + /* Size of SVQ vring free descriptors */ + uint16_t num_free; } VhostShadowVirtqueue; bool vhost_svq_valid_features(uint64_t features, Error **errp); From adc49750d2949b72240067a38b0a548b9949277b Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 2 May 2023 21:27:02 -0300 Subject: [PATCH 52/76] hw/pci: Disable PCI_ERR_UNCOR_MASK register for machine type < 8.0 Since it's implementation on v8.0.0-rc0, having the PCI_ERR_UNCOR_MASK set for machine types < 8.0 will cause migration to fail if the target QEMU version is < 8.0.0 : qemu-system-x86_64: get_pci_config_device: Bad config data: i=0x10a read: 40 device: 0 cmask: ff wmask: 0 w1cmask:0 qemu-system-x86_64: Failed to load PCIDevice:config qemu-system-x86_64: Failed to load e1000e:parent_obj qemu-system-x86_64: error while loading state for instance 0x0 of device '0000:00:02.0/e1000e' qemu-system-x86_64: load of migration failed: Invalid argument The above test migrated a 7.2 machine type from QEMU master to QEMU 7.2.0, with this cmdline: ./qemu-system-x86_64 -M pc-q35-7.2 [-incoming XXX] In order to fix this, property x-pcie-err-unc-mask was introduced to control when PCI_ERR_UNCOR_MASK is enabled. This property is enabled by default, but is disabled if machine type <= 7.2. Fixes: 010746ae1d ("hw/pci/aer: Implement PCI_ERR_UNCOR_MASK register") Suggested-by: Michael S. Tsirkin Signed-off-by: Leonardo Bras Message-Id: <20230503002701.854329-1-leobras@redhat.com> Reviewed-by: Jonathan Cameron Reviewed-by: Peter Xu Reviewed-by: Juan Quintela Fixes: https://gitlab.com/qemu-project/qemu/-/issues/1576 Tested-by: Fiona Ebner Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin (cherry picked from commit 5ed3dabe57dd9f4c007404345e5f5bf0e347317f) Signed-off-by: Michael Tokarev --- hw/core/machine.c | 1 + hw/pci/pci.c | 2 ++ hw/pci/pcie_aer.c | 11 +++++++---- include/hw/pci/pci.h | 2 ++ 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index cd13b8b0a3..5060119952 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -43,6 +43,7 @@ GlobalProperty hw_compat_7_2[] = { { "e1000e", "migrate-timadj", "off" }, { "virtio-mem", "x-early-migration", "false" }, { "migration", "x-preempt-pre-7-2", "true" }, + { TYPE_PCI_DEVICE, "x-pcie-err-unc-mask", "off" }, }; const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2); diff --git a/hw/pci/pci.c b/hw/pci/pci.c index def5000e7b..8ad4349e96 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -79,6 +79,8 @@ static Property pci_props[] = { DEFINE_PROP_STRING("failover_pair_id", PCIDevice, failover_pair_id), DEFINE_PROP_UINT32("acpi-index", PCIDevice, acpi_index, 0), + DEFINE_PROP_BIT("x-pcie-err-unc-mask", PCIDevice, cap_present, + QEMU_PCIE_ERR_UNC_MASK_BITNR, true), DEFINE_PROP_END_OF_LIST() }; diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c index 103667c368..374d593ead 100644 --- a/hw/pci/pcie_aer.c +++ b/hw/pci/pcie_aer.c @@ -112,10 +112,13 @@ int pcie_aer_init(PCIDevice *dev, uint8_t cap_ver, uint16_t offset, pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, PCI_ERR_UNC_SUPPORTED); - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, - PCI_ERR_UNC_MASK_DEFAULT); - pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, - PCI_ERR_UNC_SUPPORTED); + + if (dev->cap_present & QEMU_PCIE_ERR_UNC_MASK) { + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, + PCI_ERR_UNC_MASK_DEFAULT); + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, + PCI_ERR_UNC_SUPPORTED); + } pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, PCI_ERR_UNC_SEVERITY_DEFAULT); diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index d5a40cd058..6dc6742fc4 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -207,6 +207,8 @@ enum { QEMU_PCIE_EXTCAP_INIT = (1 << QEMU_PCIE_EXTCAP_INIT_BITNR), #define QEMU_PCIE_CXL_BITNR 10 QEMU_PCIE_CAP_CXL = (1 << QEMU_PCIE_CXL_BITNR), +#define QEMU_PCIE_ERR_UNC_MASK_BITNR 11 + QEMU_PCIE_ERR_UNC_MASK = (1 << QEMU_PCIE_ERR_UNC_MASK_BITNR), }; typedef struct PCIINTxRoute { From 302ac06ab9f0465e33912d6f24ec0f99e879caea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Thu, 4 May 2023 12:14:47 +0200 Subject: [PATCH 53/76] virtio-net: not enable vq reset feature unconditionally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 93a97dc5200a ("virtio-net: enable vq reset feature") enables unconditionally vq reset feature as long as the device is emulated. This makes impossible to actually disable the feature, and it causes migration problems from qemu version previous than 7.2. The entire final commit is unneeded as device system already enable or disable the feature properly. This reverts commit 93a97dc5200a95e63b99cb625f20b7ae802ba413. Fixes: 93a97dc5200a ("virtio-net: enable vq reset feature") Signed-off-by: Eugenio Pérez Message-Id: <20230504101447.389398-1-eperezma@redhat.com> Reviewed-by: Xuan Zhuo Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin (cherry picked from commit 1fac00f70b3261050af5564b20ca55c1b2a3059a) Signed-off-by: Michael Tokarev --- hw/net/virtio-net.c | 1 - 1 file changed, 1 deletion(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 53e1c32643..4ea33b6e2e 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -805,7 +805,6 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, } if (!get_vhost_net(nc->peer)) { - virtio_add_feature(&features, VIRTIO_F_RING_RESET); return features; } From 81d13aa5e0cafe771f9ec4c0db9145c379f330fb Mon Sep 17 00:00:00 2001 From: Mauro Matteo Cascella Date: Tue, 9 May 2023 09:53:17 +0200 Subject: [PATCH 54/76] virtio-crypto: fix NULL pointer dereference in virtio_crypto_free_request Ensure op_info is not NULL in case of QCRYPTODEV_BACKEND_ALG_SYM algtype. Fixes: 0e660a6f90a ("crypto: Introduce RSA algorithm") Signed-off-by: Mauro Matteo Cascella Reported-by: Yiming Tao Message-Id: <20230509075317.1132301-1-mcascell@redhat.com> Reviewed-by: Gonglei Reviewed-by: zhenwei pi Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin (cherry picked from commit 3e69908907f8d3dd20d5753b0777a6e3824ba824) Signed-off-by: Michael Tokarev --- hw/virtio/virtio-crypto.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c index 802e1b9659..a1d122b9aa 100644 --- a/hw/virtio/virtio-crypto.c +++ b/hw/virtio/virtio-crypto.c @@ -476,15 +476,17 @@ static void virtio_crypto_free_request(VirtIOCryptoReq *req) size_t max_len; CryptoDevBackendSymOpInfo *op_info = req->op_info.u.sym_op_info; - max_len = op_info->iv_len + - op_info->aad_len + - op_info->src_len + - op_info->dst_len + - op_info->digest_result_len; + if (op_info) { + max_len = op_info->iv_len + + op_info->aad_len + + op_info->src_len + + op_info->dst_len + + op_info->digest_result_len; - /* Zeroize and free request data structure */ - memset(op_info, 0, sizeof(*op_info) + max_len); - g_free(op_info); + /* Zeroize and free request data structure */ + memset(op_info, 0, sizeof(*op_info) + max_len); + g_free(op_info); + } } else if (req->flags == QCRYPTODEV_BACKEND_ALG_ASYM) { CryptoDevBackendAsymOpInfo *op_info = req->op_info.u.asym_op_info; if (op_info) { From a91defe16b52787d1b738e83d71eb525b12257d8 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 2 May 2023 14:41:33 -0400 Subject: [PATCH 55/76] aio-posix: do not nest poll handlers QEMU's event loop supports nesting, which means that event handler functions may themselves call aio_poll(). The condition that triggered a handler must be reset before the nested aio_poll() call, otherwise the same handler will be called and immediately re-enter aio_poll. This leads to an infinite loop and stack exhaustion. Poll handlers are especially prone to this issue, because they typically reset their condition by finishing the processing of pending work. Unfortunately it is during the processing of pending work that nested aio_poll() calls typically occur and the condition has not yet been reset. Disable a poll handler during ->io_poll_ready() so that a nested aio_poll() call cannot invoke ->io_poll_ready() again. As a result, the disabled poll handler and its associated fd handler do not run during the nested aio_poll(). Calling aio_set_fd_handler() from inside nested aio_poll() could cause it to run again. If the fd handler is pending inside nested aio_poll(), then it will also run again. In theory fd handlers can be affected by the same issue, but they are more likely to reset the condition before calling nested aio_poll(). This is a special case and it's somewhat complex, but I don't see a way around it as long as nested aio_poll() is supported. Cc: qemu-stable@nongnu.org Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2186181 Fixes: c38270692593 ("block: Mark bdrv_co_io_(un)plug() and callers GRAPH_RDLOCK") Cc: Kevin Wolf Cc: Emanuele Giuseppe Esposito Cc: Paolo Bonzini Signed-off-by: Stefan Hajnoczi Message-Id: <20230502184134.534703-2-stefanha@redhat.com> Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf (cherry picked from commit 6d740fb01b9f0f5ea7a82f4d5e458d91940a19ee) Signed-off-by: Michael Tokarev --- util/aio-posix.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/util/aio-posix.c b/util/aio-posix.c index a8be940f76..34bc2a64d8 100644 --- a/util/aio-posix.c +++ b/util/aio-posix.c @@ -353,8 +353,19 @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node) poll_ready && revents == 0 && aio_node_check(ctx, node->is_external) && node->io_poll_ready) { + /* + * Remove temporarily to avoid infinite loops when ->io_poll_ready() + * calls aio_poll() before clearing the condition that made the poll + * handler become ready. + */ + QLIST_SAFE_REMOVE(node, node_poll); + node->io_poll_ready(node->opaque); + if (!QLIST_IS_INSERTED(node, node_poll)) { + QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll); + } + /* * Return early since revents was zero. aio_notify() does not count as * progress. From a0b89ba8458072677d5ab5f0ebafbf6bbb2606c9 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 2 May 2023 14:41:34 -0400 Subject: [PATCH 56/76] tested: add test for nested aio_poll() in poll handlers Cc: qemu-stable@nongnu.org Signed-off-by: Stefan Hajnoczi Message-Id: <20230502184134.534703-3-stefanha@redhat.com> [kwolf: Restrict to CONFIG_POSIX, Windows doesn't support polling] Tested-by: Kevin Wolf Signed-off-by: Kevin Wolf (cherry picked from commit 844a12a63e12b1235a8fc17f9b278929dc6eb00e) Signed-off-by: Michael Tokarev --- tests/unit/meson.build | 5 +- tests/unit/test-nested-aio-poll.c | 130 ++++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 tests/unit/test-nested-aio-poll.c diff --git a/tests/unit/meson.build b/tests/unit/meson.build index 3bc78d8660..8ed81786ee 100644 --- a/tests/unit/meson.build +++ b/tests/unit/meson.build @@ -114,7 +114,10 @@ if have_block tests += {'test-crypto-xts': [crypto, io]} endif if 'CONFIG_POSIX' in config_host - tests += {'test-image-locking': [testblock]} + tests += { + 'test-image-locking': [testblock], + 'test-nested-aio-poll': [testblock], + } endif if config_host_data.get('CONFIG_REPLICATION') tests += {'test-replication': [testblock]} diff --git a/tests/unit/test-nested-aio-poll.c b/tests/unit/test-nested-aio-poll.c new file mode 100644 index 0000000000..9bbe18b839 --- /dev/null +++ b/tests/unit/test-nested-aio-poll.c @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Test that poll handlers are not re-entrant in nested aio_poll() + * + * Copyright Red Hat + * + * Poll handlers are usually level-triggered. That means they continue firing + * until the condition is reset (e.g. a virtqueue becomes empty). If a poll + * handler calls nested aio_poll() before the condition is reset, then infinite + * recursion occurs. + * + * aio_poll() is supposed to prevent this by disabling poll handlers in nested + * aio_poll() calls. This test case checks that this is indeed what happens. + */ +#include "qemu/osdep.h" +#include "block/aio.h" +#include "qapi/error.h" + +typedef struct { + AioContext *ctx; + + /* This is the EventNotifier that drives the test */ + EventNotifier poll_notifier; + + /* This EventNotifier is only used to wake aio_poll() */ + EventNotifier dummy_notifier; + + bool nested; +} TestData; + +static void io_read(EventNotifier *notifier) +{ + fprintf(stderr, "%s %p\n", __func__, notifier); + event_notifier_test_and_clear(notifier); +} + +static bool io_poll_true(void *opaque) +{ + fprintf(stderr, "%s %p\n", __func__, opaque); + return true; +} + +static bool io_poll_false(void *opaque) +{ + fprintf(stderr, "%s %p\n", __func__, opaque); + return false; +} + +static void io_poll_ready(EventNotifier *notifier) +{ + TestData *td = container_of(notifier, TestData, poll_notifier); + + fprintf(stderr, "> %s\n", __func__); + + g_assert(!td->nested); + td->nested = true; + + /* Wake the following nested aio_poll() call */ + event_notifier_set(&td->dummy_notifier); + + /* This nested event loop must not call io_poll()/io_poll_ready() */ + g_assert(aio_poll(td->ctx, true)); + + td->nested = false; + + fprintf(stderr, "< %s\n", __func__); +} + +/* dummy_notifier never triggers */ +static void io_poll_never_ready(EventNotifier *notifier) +{ + g_assert_not_reached(); +} + +static void test(void) +{ + TestData td = { + .ctx = aio_context_new(&error_abort), + }; + + qemu_set_current_aio_context(td.ctx); + + /* Enable polling */ + aio_context_set_poll_params(td.ctx, 1000000, 2, 2, &error_abort); + + /* + * The GSource is unused but this has the side-effect of changing the fdmon + * that AioContext uses. + */ + aio_get_g_source(td.ctx); + + /* Make the event notifier active (set) right away */ + event_notifier_init(&td.poll_notifier, 1); + aio_set_event_notifier(td.ctx, &td.poll_notifier, false, + io_read, io_poll_true, io_poll_ready); + + /* This event notifier will be used later */ + event_notifier_init(&td.dummy_notifier, 0); + aio_set_event_notifier(td.ctx, &td.dummy_notifier, false, + io_read, io_poll_false, io_poll_never_ready); + + /* Consume aio_notify() */ + g_assert(!aio_poll(td.ctx, false)); + + /* + * Run the io_read() handler. This has the side-effect of activating + * polling in future aio_poll() calls. + */ + g_assert(aio_poll(td.ctx, true)); + + /* The second time around the io_poll()/io_poll_ready() handler runs */ + g_assert(aio_poll(td.ctx, true)); + + /* Run io_poll()/io_poll_ready() one more time to show it keeps working */ + g_assert(aio_poll(td.ctx, true)); + + aio_set_event_notifier(td.ctx, &td.dummy_notifier, false, + NULL, NULL, NULL); + aio_set_event_notifier(td.ctx, &td.poll_notifier, false, NULL, NULL, NULL); + event_notifier_cleanup(&td.dummy_notifier); + event_notifier_cleanup(&td.poll_notifier); + aio_context_unref(td.ctx); +} + +int main(int argc, char **argv) +{ + g_test_init(&argc, &argv, NULL); + g_test_add_func("/nested-aio-poll", test); + return g_test_run(); +} From 84d839e4998bf3845a2e16ea482a3da3d1598346 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 1 May 2023 13:34:43 -0400 Subject: [PATCH 57/76] block: compile out assert_bdrv_graph_readable() by default reader_count() is a performance bottleneck because the global aio_context_list_lock mutex causes thread contention. Put this debugging assertion behind a new ./configure --enable-debug-graph-lock option and disable it by default. The --enable-debug-graph-lock option is also enabled by the more general --enable-debug option. Signed-off-by: Stefan Hajnoczi Message-Id: <20230501173443.153062-1-stefanha@redhat.com> Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf (cherry picked from commit 58a2e3f5c37be02dac3086b81bdda9414b931edf) Signed-off-by: Michael Tokarev (Mjt: pick this one up so the next patch which disables this applies cleanly) --- block/graph-lock.c | 3 +++ configure | 1 + meson.build | 2 ++ meson_options.txt | 2 ++ scripts/meson-buildoptions.sh | 4 ++++ 5 files changed, 12 insertions(+) diff --git a/block/graph-lock.c b/block/graph-lock.c index 454c31e691..259a7a0bde 100644 --- a/block/graph-lock.c +++ b/block/graph-lock.c @@ -265,7 +265,10 @@ void bdrv_graph_rdunlock_main_loop(void) void assert_bdrv_graph_readable(void) { + /* reader_count() is slow due to aio_context_list_lock lock contention */ +#ifdef CONFIG_DEBUG_GRAPH_LOCK assert(qemu_in_main_thread() || reader_count()); +#endif } void assert_bdrv_graph_writable(void) diff --git a/configure b/configure index 800b5850f4..a62a3e6be9 100755 --- a/configure +++ b/configure @@ -806,6 +806,7 @@ for opt do --enable-debug) # Enable debugging options that aren't excessively noisy debug_tcg="yes" + meson_option_parse --enable-debug-graph-lock "" meson_option_parse --enable-debug-mutex "" meson_option_add -Doptimization=0 fortify_source="no" diff --git a/meson.build b/meson.build index c7e486e087..30447cfaef 100644 --- a/meson.build +++ b/meson.build @@ -1956,6 +1956,7 @@ if get_option('debug_stack_usage') and have_coroutine_pool have_coroutine_pool = false endif config_host_data.set10('CONFIG_COROUTINE_POOL', have_coroutine_pool) +config_host_data.set('CONFIG_DEBUG_GRAPH_LOCK', get_option('debug_graph_lock')) config_host_data.set('CONFIG_DEBUG_MUTEX', get_option('debug_mutex')) config_host_data.set('CONFIG_DEBUG_STACK_USAGE', get_option('debug_stack_usage')) config_host_data.set('CONFIG_GPROF', get_option('gprof')) @@ -3837,6 +3838,7 @@ summary_info += {'PIE': get_option('b_pie')} summary_info += {'static build': config_host.has_key('CONFIG_STATIC')} summary_info += {'malloc trim support': has_malloc_trim} summary_info += {'membarrier': have_membarrier} +summary_info += {'debug graph lock': get_option('debug_graph_lock')} summary_info += {'debug stack usage': get_option('debug_stack_usage')} summary_info += {'mutex debugging': get_option('debug_mutex')} summary_info += {'memory allocator': get_option('malloc')} diff --git a/meson_options.txt b/meson_options.txt index fc9447d267..bc857fe68b 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -311,6 +311,8 @@ option('rng_none', type: 'boolean', value: false, description: 'dummy RNG, avoid using /dev/(u)random and getrandom()') option('coroutine_pool', type: 'boolean', value: true, description: 'coroutine freelist (better performance)') +option('debug_graph_lock', type: 'boolean', value: false, + description: 'graph lock debugging support') option('debug_mutex', type: 'boolean', value: false, description: 'mutex debugging support') option('debug_stack_usage', type: 'boolean', value: false, diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index 009fab1515..30e1f25259 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -21,6 +21,8 @@ meson_options_help() { printf "%s\n" ' QEMU' printf "%s\n" ' --enable-cfi Control-Flow Integrity (CFI)' printf "%s\n" ' --enable-cfi-debug Verbose errors in case of CFI violation' + printf "%s\n" ' --enable-debug-graph-lock' + printf "%s\n" ' graph lock debugging support' printf "%s\n" ' --enable-debug-mutex mutex debugging support' printf "%s\n" ' --enable-debug-stack-usage' printf "%s\n" ' measure coroutine stack usage' @@ -249,6 +251,8 @@ _meson_option_parse() { --datadir=*) quote_sh "-Ddatadir=$2" ;; --enable-dbus-display) printf "%s" -Ddbus_display=enabled ;; --disable-dbus-display) printf "%s" -Ddbus_display=disabled ;; + --enable-debug-graph-lock) printf "%s" -Ddebug_graph_lock=true ;; + --disable-debug-graph-lock) printf "%s" -Ddebug_graph_lock=false ;; --enable-debug-mutex) printf "%s" -Ddebug_mutex=true ;; --disable-debug-mutex) printf "%s" -Ddebug_mutex=false ;; --enable-debug-stack-usage) printf "%s" -Ddebug_stack_usage=true ;; From d001f222e37d51aa8d691ada53b58725010e6837 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 17 May 2023 17:28:32 +0200 Subject: [PATCH 58/76] graph-lock: Disable locking for now In QEMU 8.0, we've been seeing deadlocks in bdrv_graph_wrlock(). They come from callers that hold an AioContext lock, which is not allowed during polling. In theory, we could temporarily release the lock, but callers are inconsistent about whether they hold a lock, and if they do, some are also confused about which one they hold. While all of this is fixable, it's not trivial, and the best course of action for 8.0.1 is probably just disabling the graph locking code temporarily. We don't currently rely on graph locking yet. It is supposed to replace the AioContext lock eventually to enable multiqueue support, but as long as we still have the AioContext lock, it is sufficient without the graph lock. Once the AioContext lock goes away, the deadlock doesn't exist any more either and this commit can be reverted. (Of course, it can also be reverted while the AioContext lock still exists if the callers have been fixed.) Cc: qemu-stable@nongnu.org Signed-off-by: Kevin Wolf Message-Id: <20230517152834.277483-2-kwolf@redhat.com> Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf (cherry picked from commit 80fc5d260002432628710f8b0c7cfc7d9b97bb9d) Signed-off-by: Michael Tokarev --- block/graph-lock.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/block/graph-lock.c b/block/graph-lock.c index 259a7a0bde..2490926c90 100644 --- a/block/graph-lock.c +++ b/block/graph-lock.c @@ -30,8 +30,10 @@ BdrvGraphLock graph_lock; /* Protects the list of aiocontext and orphaned_reader_count */ static QemuMutex aio_context_list_lock; +#if 0 /* Written and read with atomic operations. */ static int has_writer; +#endif /* * A reader coroutine could move from an AioContext to another. @@ -88,6 +90,7 @@ void unregister_aiocontext(AioContext *ctx) g_free(ctx->bdrv_graph); } +#if 0 static uint32_t reader_count(void) { BdrvGraphRWlock *brdv_graph; @@ -105,10 +108,17 @@ static uint32_t reader_count(void) assert((int32_t)rd >= 0); return rd; } +#endif void bdrv_graph_wrlock(void) { GLOBAL_STATE_CODE(); + /* + * TODO Some callers hold an AioContext lock when this is called, which + * causes deadlocks. Reenable once the AioContext locking is cleaned up (or + * AioContext locks are gone). + */ +#if 0 assert(!qatomic_read(&has_writer)); /* Make sure that constantly arriving new I/O doesn't cause starvation */ @@ -139,11 +149,13 @@ void bdrv_graph_wrlock(void) } while (reader_count() >= 1); bdrv_drain_all_end(); +#endif } void bdrv_graph_wrunlock(void) { GLOBAL_STATE_CODE(); +#if 0 QEMU_LOCK_GUARD(&aio_context_list_lock); assert(qatomic_read(&has_writer)); @@ -155,10 +167,13 @@ void bdrv_graph_wrunlock(void) /* Wake up all coroutine that are waiting to read the graph */ qemu_co_enter_all(&reader_queue, &aio_context_list_lock); +#endif } void coroutine_fn bdrv_graph_co_rdlock(void) { + /* TODO Reenable when wrlock is reenabled */ +#if 0 BdrvGraphRWlock *bdrv_graph; bdrv_graph = qemu_get_current_aio_context()->bdrv_graph; @@ -223,10 +238,12 @@ void coroutine_fn bdrv_graph_co_rdlock(void) qemu_co_queue_wait(&reader_queue, &aio_context_list_lock); } } +#endif } void coroutine_fn bdrv_graph_co_rdunlock(void) { +#if 0 BdrvGraphRWlock *bdrv_graph; bdrv_graph = qemu_get_current_aio_context()->bdrv_graph; @@ -249,6 +266,7 @@ void coroutine_fn bdrv_graph_co_rdunlock(void) if (qatomic_read(&has_writer)) { aio_wait_kick(); } +#endif } void bdrv_graph_rdlock_main_loop(void) @@ -266,13 +284,19 @@ void bdrv_graph_rdunlock_main_loop(void) void assert_bdrv_graph_readable(void) { /* reader_count() is slow due to aio_context_list_lock lock contention */ + /* TODO Reenable when wrlock is reenabled */ +#if 0 #ifdef CONFIG_DEBUG_GRAPH_LOCK assert(qemu_in_main_thread() || reader_count()); #endif +#endif } void assert_bdrv_graph_writable(void) { assert(qemu_in_main_thread()); + /* TODO Reenable when wrlock is reenabled */ +#if 0 assert(qatomic_read(&has_writer)); +#endif } From a7002f15c8635bae3f45a7e675920164df42b77d Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 17 May 2023 17:28:33 +0200 Subject: [PATCH 59/76] nbd/server: Fix drained_poll to wake coroutine in right AioContext nbd_drained_poll() generally runs in the main thread, not whatever iothread the NBD server coroutine is meant to run in, so it can't directly reenter the coroutines to wake them up. The code seems to have the right intention, it specifies the correct AioContext when it calls qemu_aio_coroutine_enter(). However, this functions doesn't schedule the coroutine to run in that AioContext, but it assumes it is already called in the home thread of the AioContext. To fix this, add a new thread-safe qio_channel_wake_read() that can be called in the main thread to wake up the coroutine in its AioContext, and use this in nbd_drained_poll(). Cc: qemu-stable@nongnu.org Signed-off-by: Kevin Wolf Message-Id: <20230517152834.277483-3-kwolf@redhat.com> Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf (cherry picked from commit 7c1f51bf38de8cea4ed5030467646c37b46edeb7) Signed-off-by: Michael Tokarev --- include/io/channel.h | 10 ++++++++++ io/channel.c | 33 +++++++++++++++++++++++++++------ nbd/server.c | 3 +-- 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/include/io/channel.h b/include/io/channel.h index 153fbd2904..2b905423a9 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -757,6 +757,16 @@ void qio_channel_detach_aio_context(QIOChannel *ioc); void coroutine_fn qio_channel_yield(QIOChannel *ioc, GIOCondition condition); +/** + * qio_channel_wake_read: + * @ioc: the channel object + * + * If qio_channel_yield() is currently waiting for the channel to become + * readable, interrupt it and reenter immediately. This function is safe to call + * from any thread. + */ +void qio_channel_wake_read(QIOChannel *ioc); + /** * qio_channel_wait: * @ioc: the channel object diff --git a/io/channel.c b/io/channel.c index a8c7f11649..3c9b7beb65 100644 --- a/io/channel.c +++ b/io/channel.c @@ -19,6 +19,7 @@ */ #include "qemu/osdep.h" +#include "block/aio-wait.h" #include "io/channel.h" #include "qapi/error.h" #include "qemu/main-loop.h" @@ -514,7 +515,11 @@ int qio_channel_flush(QIOChannel *ioc, static void qio_channel_restart_read(void *opaque) { QIOChannel *ioc = opaque; - Coroutine *co = ioc->read_coroutine; + Coroutine *co = qatomic_xchg(&ioc->read_coroutine, NULL); + + if (!co) { + return; + } /* Assert that aio_co_wake() reenters the coroutine directly */ assert(qemu_get_current_aio_context() == @@ -525,7 +530,11 @@ static void qio_channel_restart_read(void *opaque) static void qio_channel_restart_write(void *opaque) { QIOChannel *ioc = opaque; - Coroutine *co = ioc->write_coroutine; + Coroutine *co = qatomic_xchg(&ioc->write_coroutine, NULL); + + if (!co) { + return; + } /* Assert that aio_co_wake() reenters the coroutine directly */ assert(qemu_get_current_aio_context() == @@ -568,7 +577,11 @@ void qio_channel_detach_aio_context(QIOChannel *ioc) void coroutine_fn qio_channel_yield(QIOChannel *ioc, GIOCondition condition) { + AioContext *ioc_ctx = ioc->ctx ?: qemu_get_aio_context(); + assert(qemu_in_coroutine()); + assert(in_aio_context_home_thread(ioc_ctx)); + if (condition == G_IO_IN) { assert(!ioc->read_coroutine); ioc->read_coroutine = qemu_coroutine_self(); @@ -580,18 +593,26 @@ void coroutine_fn qio_channel_yield(QIOChannel *ioc, } qio_channel_set_aio_fd_handlers(ioc); qemu_coroutine_yield(); + assert(in_aio_context_home_thread(ioc_ctx)); /* Allow interrupting the operation by reentering the coroutine other than * through the aio_fd_handlers. */ - if (condition == G_IO_IN && ioc->read_coroutine) { - ioc->read_coroutine = NULL; + if (condition == G_IO_IN) { + assert(ioc->read_coroutine == NULL); qio_channel_set_aio_fd_handlers(ioc); - } else if (condition == G_IO_OUT && ioc->write_coroutine) { - ioc->write_coroutine = NULL; + } else if (condition == G_IO_OUT) { + assert(ioc->write_coroutine == NULL); qio_channel_set_aio_fd_handlers(ioc); } } +void qio_channel_wake_read(QIOChannel *ioc) +{ + Coroutine *co = qatomic_xchg(&ioc->read_coroutine, NULL); + if (co) { + aio_co_wake(co); + } +} static gboolean qio_channel_wait_complete(QIOChannel *ioc, GIOCondition condition, diff --git a/nbd/server.c b/nbd/server.c index 3d8d0d81df..ea47522e8f 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -1599,8 +1599,7 @@ static bool nbd_drained_poll(void *opaque) * enter it here so we don't depend on the client to wake it up. */ if (client->recv_coroutine != NULL && client->read_yielding) { - qemu_aio_coroutine_enter(exp->common.ctx, - client->recv_coroutine); + qio_channel_wake_read(client->ioc); } return true; From eb134d1d589f3ed693b40f2654ff478424b5d9cb Mon Sep 17 00:00:00 2001 From: "timothee.cocault@gmail.com" Date: Mon, 10 Apr 2023 17:27:48 +0200 Subject: [PATCH 60/76] e1000e: Fix tx/rx counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bytes and packets counter registers are cleared on read. Copying the "total counter" registers to the "good counter" registers has side effects. If the "total" register is never read by the OS, it only gets incremented. This leads to exponential growth of the "good" register. This commit increments the counters individually to avoid this. Signed-off-by: Timothée Cocault Signed-off-by: Jason Wang (cherry picked from commit 8d689f6aae8be096b4a1859be07c1b083865f755) Signed-off-by: Michael Tokarev --- hw/net/e1000.c | 5 ++--- hw/net/e1000e_core.c | 5 ++--- hw/net/e1000x_common.c | 5 ++--- hw/net/igb_core.c | 5 ++--- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/hw/net/e1000.c b/hw/net/e1000.c index 23d660619f..59bacb5d3b 100644 --- a/hw/net/e1000.c +++ b/hw/net/e1000.c @@ -637,9 +637,8 @@ xmit_seg(E1000State *s) e1000x_inc_reg_if_not_full(s->mac_reg, TPT); e1000x_grow_8reg_if_not_full(s->mac_reg, TOTL, s->tx.size + 4); - s->mac_reg[GPTC] = s->mac_reg[TPT]; - s->mac_reg[GOTCL] = s->mac_reg[TOTL]; - s->mac_reg[GOTCH] = s->mac_reg[TOTH]; + e1000x_inc_reg_if_not_full(s->mac_reg, GPTC); + e1000x_grow_8reg_if_not_full(s->mac_reg, GOTCL, s->tx.size + 4); } static void diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index c0c09b6965..cfa3f55e96 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -711,9 +711,8 @@ e1000e_on_tx_done_update_stats(E1000ECore *core, struct NetTxPkt *tx_pkt) g_assert_not_reached(); } - core->mac[GPTC] = core->mac[TPT]; - core->mac[GOTCL] = core->mac[TOTL]; - core->mac[GOTCH] = core->mac[TOTH]; + e1000x_inc_reg_if_not_full(core->mac, GPTC); + e1000x_grow_8reg_if_not_full(core->mac, GOTCL, tot_len); } static void diff --git a/hw/net/e1000x_common.c b/hw/net/e1000x_common.c index b844af590a..4c8e7dcf70 100644 --- a/hw/net/e1000x_common.c +++ b/hw/net/e1000x_common.c @@ -220,15 +220,14 @@ e1000x_update_rx_total_stats(uint32_t *mac, e1000x_increase_size_stats(mac, PRCregs, data_fcs_size); e1000x_inc_reg_if_not_full(mac, TPR); - mac[GPRC] = mac[TPR]; + e1000x_inc_reg_if_not_full(mac, GPRC); /* TOR - Total Octets Received: * This register includes bytes received in a packet from the field through the field, inclusively. * Always include FCS length (4) in size. */ e1000x_grow_8reg_if_not_full(mac, TORL, data_size + 4); - mac[GORCL] = mac[TORL]; - mac[GORCH] = mac[TORH]; + e1000x_grow_8reg_if_not_full(mac, GORCL, data_size + 4); } void diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index d733fed6cf..826e7a6cf1 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -538,9 +538,8 @@ igb_on_tx_done_update_stats(IGBCore *core, struct NetTxPkt *tx_pkt, int qn) g_assert_not_reached(); } - core->mac[GPTC] = core->mac[TPT]; - core->mac[GOTCL] = core->mac[TOTL]; - core->mac[GOTCH] = core->mac[TOTH]; + e1000x_inc_reg_if_not_full(core->mac, GPTC); + e1000x_grow_8reg_if_not_full(core->mac, GOTCL, tot_len); if (core->mac[MRQC] & 1) { uint16_t pool = qn % IGB_NUM_VM_POOLS; From 0f7ca2bf2c6476244113132539ff2d6ef1d47b34 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:42:54 +0900 Subject: [PATCH 61/76] e1000x: Fix BPRC and MPRC Before this change, e1000 and the common code updated BPRC and MPRC depending on the matched filter, but e1000e and igb decided to update those counters by deriving the packet type independently. This inconsistency caused a multicast packet to be counted twice. Updating BPRC and MPRC depending on are fundamentally flawed anyway as a filter can be used for different types of packets. For example, it is possible to filter broadcast packets with MTA. Always determine what counters to update by inspecting the packets. Fixes: 3b27430177 ("e1000: Implementing various counters") Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang (cherry picked from commit f3f9b726afba1f53663768603189e574f80b5907) Signed-off-by: Michael Tokarev --- hw/net/e1000.c | 6 +++--- hw/net/e1000e_core.c | 20 +++----------------- hw/net/e1000x_common.c | 25 +++++++++++++++++++------ hw/net/e1000x_common.h | 5 +++-- hw/net/igb_core.c | 22 +++++----------------- 5 files changed, 33 insertions(+), 45 deletions(-) diff --git a/hw/net/e1000.c b/hw/net/e1000.c index 59bacb5d3b..18eb6d8876 100644 --- a/hw/net/e1000.c +++ b/hw/net/e1000.c @@ -826,12 +826,10 @@ receive_filter(E1000State *s, const uint8_t *buf, int size) } if (ismcast && (rctl & E1000_RCTL_MPE)) { /* promiscuous mcast */ - e1000x_inc_reg_if_not_full(s->mac_reg, MPRC); return 1; } if (isbcast && (rctl & E1000_RCTL_BAM)) { /* broadcast enabled */ - e1000x_inc_reg_if_not_full(s->mac_reg, BPRC); return 1; } @@ -922,6 +920,7 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt) size_t desc_offset; size_t desc_size; size_t total_size; + eth_pkt_types_e pkt_type; if (!e1000x_hw_rx_enabled(s->mac_reg)) { return -1; @@ -971,6 +970,7 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt) size -= 4; } + pkt_type = get_eth_packet_type(PKT_GET_ETH_HDR(filter_buf)); rdh_start = s->mac_reg[RDH]; desc_offset = 0; total_size = size + e1000x_fcs_len(s->mac_reg); @@ -1036,7 +1036,7 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt) } } while (desc_offset < total_size); - e1000x_update_rx_total_stats(s->mac_reg, size, total_size); + e1000x_update_rx_total_stats(s->mac_reg, pkt_type, size, total_size); n = E1000_ICS_RXT0; if ((rdt = s->mac_reg[RDT]) < s->mac_reg[RDH]) diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index cfa3f55e96..a74f1bc245 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -1487,24 +1487,10 @@ e1000e_write_to_rx_buffers(E1000ECore *core, } static void -e1000e_update_rx_stats(E1000ECore *core, - size_t data_size, - size_t data_fcs_size) +e1000e_update_rx_stats(E1000ECore *core, size_t pkt_size, size_t pkt_fcs_size) { - e1000x_update_rx_total_stats(core->mac, data_size, data_fcs_size); - - switch (net_rx_pkt_get_packet_type(core->rx_pkt)) { - case ETH_PKT_BCAST: - e1000x_inc_reg_if_not_full(core->mac, BPRC); - break; - - case ETH_PKT_MCAST: - e1000x_inc_reg_if_not_full(core->mac, MPRC); - break; - - default: - break; - } + eth_pkt_types_e pkt_type = net_rx_pkt_get_packet_type(core->rx_pkt); + e1000x_update_rx_total_stats(core->mac, pkt_type, pkt_size, pkt_fcs_size); } static inline bool diff --git a/hw/net/e1000x_common.c b/hw/net/e1000x_common.c index 4c8e7dcf70..7694673bcc 100644 --- a/hw/net/e1000x_common.c +++ b/hw/net/e1000x_common.c @@ -80,7 +80,6 @@ bool e1000x_rx_group_filter(uint32_t *mac, const uint8_t *buf) f = mta_shift[(rctl >> E1000_RCTL_MO_SHIFT) & 3]; f = (((buf[5] << 8) | buf[4]) >> f) & 0xfff; if (mac[MTA + (f >> 5)] & (1 << (f & 0x1f))) { - e1000x_inc_reg_if_not_full(mac, MPRC); return true; } @@ -212,13 +211,14 @@ e1000x_rxbufsize(uint32_t rctl) void e1000x_update_rx_total_stats(uint32_t *mac, - size_t data_size, - size_t data_fcs_size) + eth_pkt_types_e pkt_type, + size_t pkt_size, + size_t pkt_fcs_size) { static const int PRCregs[6] = { PRC64, PRC127, PRC255, PRC511, PRC1023, PRC1522 }; - e1000x_increase_size_stats(mac, PRCregs, data_fcs_size); + e1000x_increase_size_stats(mac, PRCregs, pkt_fcs_size); e1000x_inc_reg_if_not_full(mac, TPR); e1000x_inc_reg_if_not_full(mac, GPRC); /* TOR - Total Octets Received: @@ -226,8 +226,21 @@ e1000x_update_rx_total_stats(uint32_t *mac, * Address> field through the field, inclusively. * Always include FCS length (4) in size. */ - e1000x_grow_8reg_if_not_full(mac, TORL, data_size + 4); - e1000x_grow_8reg_if_not_full(mac, GORCL, data_size + 4); + e1000x_grow_8reg_if_not_full(mac, TORL, pkt_size + 4); + e1000x_grow_8reg_if_not_full(mac, GORCL, pkt_size + 4); + + switch (pkt_type) { + case ETH_PKT_BCAST: + e1000x_inc_reg_if_not_full(mac, BPRC); + break; + + case ETH_PKT_MCAST: + e1000x_inc_reg_if_not_full(mac, MPRC); + break; + + default: + break; + } } void diff --git a/hw/net/e1000x_common.h b/hw/net/e1000x_common.h index 911abd8a90..0298e06283 100644 --- a/hw/net/e1000x_common.h +++ b/hw/net/e1000x_common.h @@ -91,8 +91,9 @@ e1000x_update_regs_on_link_up(uint32_t *mac, uint16_t *phy) } void e1000x_update_rx_total_stats(uint32_t *mac, - size_t data_size, - size_t data_fcs_size); + eth_pkt_types_e pkt_type, + size_t pkt_size, + size_t pkt_fcs_size); void e1000x_core_prepare_eeprom(uint16_t *eeprom, const uint16_t *templ, diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 826e7a6cf1..8a9fd1f729 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -1437,29 +1437,17 @@ igb_write_to_rx_buffers(IGBCore *core, static void igb_update_rx_stats(IGBCore *core, const E1000E_RingInfo *rxi, - size_t data_size, size_t data_fcs_size) + size_t pkt_size, size_t pkt_fcs_size) { - e1000x_update_rx_total_stats(core->mac, data_size, data_fcs_size); - - switch (net_rx_pkt_get_packet_type(core->rx_pkt)) { - case ETH_PKT_BCAST: - e1000x_inc_reg_if_not_full(core->mac, BPRC); - break; - - case ETH_PKT_MCAST: - e1000x_inc_reg_if_not_full(core->mac, MPRC); - break; - - default: - break; - } + eth_pkt_types_e pkt_type = net_rx_pkt_get_packet_type(core->rx_pkt); + e1000x_update_rx_total_stats(core->mac, pkt_type, pkt_size, pkt_fcs_size); if (core->mac[MRQC] & 1) { uint16_t pool = rxi->idx % IGB_NUM_VM_POOLS; - core->mac[PVFGORC0 + (pool * 64)] += data_size + 4; + core->mac[PVFGORC0 + (pool * 64)] += pkt_size + 4; core->mac[PVFGPRC0 + (pool * 64)]++; - if (net_rx_pkt_get_packet_type(core->rx_pkt) == ETH_PKT_MCAST) { + if (pkt_type == ETH_PKT_MCAST) { core->mac[PVFMPRC0 + (pool * 64)]++; } } From 9ff3fe63fc54fdc53599cc258c1f7150644f86d3 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:42:55 +0900 Subject: [PATCH 62/76] igb: Fix Rx packet type encoding igb's advanced descriptor uses a packet type encoding different from one used in e1000e's extended descriptor. Fix the logic to encode Rx packet type accordingly. Fixes: 3a977deebe ("Intrdocue igb device emulation") Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang (cherry picked from commit ed447c60b341f1714b3c800d7f9c68898e873f78) Signed-off-by: Michael Tokarev --- hw/net/igb_core.c | 40 ++++++++++++++++++++-------------------- hw/net/igb_regs.h | 5 +++++ 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 8a9fd1f729..1c7f4eaf76 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -1226,7 +1226,6 @@ igb_build_rx_metadata(IGBCore *core, struct virtio_net_hdr *vhdr; bool hasip4, hasip6; EthL4HdrProto l4hdr_proto; - uint32_t pkt_type; *status_flags = E1000_RXD_STAT_DD; @@ -1265,28 +1264,29 @@ igb_build_rx_metadata(IGBCore *core, trace_e1000e_rx_metadata_ack(); } - if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_DIS)) { - trace_e1000e_rx_metadata_ipv6_filtering_disabled(); - pkt_type = E1000_RXD_PKT_MAC; - } else if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP || - l4hdr_proto == ETH_L4_HDR_PROTO_UDP) { - pkt_type = hasip4 ? E1000_RXD_PKT_IP4_XDP : E1000_RXD_PKT_IP6_XDP; - } else if (hasip4 || hasip6) { - pkt_type = hasip4 ? E1000_RXD_PKT_IP4 : E1000_RXD_PKT_IP6; - } else { - pkt_type = E1000_RXD_PKT_MAC; - } - - trace_e1000e_rx_metadata_pkt_type(pkt_type); - if (pkt_info) { - if (rss_info->enabled) { - *pkt_info = rss_info->type; + *pkt_info = rss_info->enabled ? rss_info->type : 0; + + if (hasip4) { + *pkt_info |= E1000_ADVRXD_PKT_IP4; } - *pkt_info |= (pkt_type << 4); - } else { - *status_flags |= E1000_RXD_PKT_TYPE(pkt_type); + if (hasip6) { + *pkt_info |= E1000_ADVRXD_PKT_IP6; + } + + switch (l4hdr_proto) { + case ETH_L4_HDR_PROTO_TCP: + *pkt_info |= E1000_ADVRXD_PKT_TCP; + break; + + case ETH_L4_HDR_PROTO_UDP: + *pkt_info |= E1000_ADVRXD_PKT_UDP; + break; + + default: + break; + } } if (hdr_info) { diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h index c5c5b3c3b8..21ee9a3b2d 100644 --- a/hw/net/igb_regs.h +++ b/hw/net/igb_regs.h @@ -641,6 +641,11 @@ union e1000_adv_rx_desc { #define E1000_STATUS_NUM_VFS_SHIFT 14 +#define E1000_ADVRXD_PKT_IP4 BIT(4) +#define E1000_ADVRXD_PKT_IP6 BIT(6) +#define E1000_ADVRXD_PKT_TCP BIT(8) +#define E1000_ADVRXD_PKT_UDP BIT(9) + static inline uint8_t igb_ivar_entry_rx(uint8_t i) { return i < 8 ? i * 4 : (i - 8) * 4 + 2; From 6e260100d074bf48f4502ce55ba65680324ffc0e Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:42:56 +0900 Subject: [PATCH 63/76] igb: Do not require CTRL.VME for tx VLAN tagging While the datasheet of e1000e says it checks CTRL.VME for tx VLAN tagging, igb's datasheet has no such statements. It also says for "CTRL.VLE": > This register only affects the VLAN Strip in Rx it does not have any > influence in the Tx path in the 82576. (Appendix A. Changes from the 82575) There is no "CTRL.VLE" so it is more likely that it is a mistake of CTRL.VME. Fixes: fba7c3b788 ("igb: respect VMVIR and VMOLR for VLAN") Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang (cherry picked from commit e209716749cda1581cfc8e582591c0216c30ab0d) Signed-off-by: Michael Tokarev --- hw/net/igb_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 1c7f4eaf76..bc7af7963a 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -402,7 +402,7 @@ igb_tx_insert_vlan(IGBCore *core, uint16_t qn, struct igb_tx *tx, } } - if (insert_vlan && e1000x_vlan_enabled(core->mac)) { + if (insert_vlan) { net_tx_pkt_setup_vlan_header_ex(tx->tx_pkt, vlan, core->mac[VET] & 0xffff); } From ba3c7bf178c8a94b2635a655918ee7ae991a9cf7 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:42:57 +0900 Subject: [PATCH 64/76] igb: Clear IMS bits when committing ICR access The datasheet says contradicting statements regarding ICR accesses so it is not reliable to determine the behavior of ICR accesses. However, e1000e does clear IMS bits when reading ICR accesses and Linux also expects ICR accesses will clear IMS bits according to: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/net/ethernet/intel/igb/igb_main.c?h=v6.2#n8048 Fixes: 3a977deebe ("Intrdocue igb device emulation") Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang (cherry picked from commit f0b1df5c4502b5ec89f83417924935ab201511d0) Signed-off-by: Michael Tokarev --- hw/net/igb_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index bc7af7963a..e0ee70d1d7 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -2451,16 +2451,16 @@ igb_set_ims(IGBCore *core, int index, uint32_t val) static void igb_commit_icr(IGBCore *core) { /* - * If GPIE.NSICR = 0, then the copy of IAM to IMS will occur only if at + * If GPIE.NSICR = 0, then the clear of IMS will occur only if at * least one bit is set in the IMS and there is a true interrupt as * reflected in ICR.INTA. */ if ((core->mac[GPIE] & E1000_GPIE_NSICR) || (core->mac[IMS] && (core->mac[ICR] & E1000_ICR_INT_ASSERTED))) { - igb_set_ims(core, IMS, core->mac[IAM]); - } else { - igb_update_interrupt_state(core); + igb_clear_ims_bits(core, core->mac[IAM]); } + + igb_update_interrupt_state(core); } static void igb_set_icr(IGBCore *core, int index, uint32_t val) From 5c4f2f1b60e7d95ccc16262c3040d4c5f2f62126 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:42:58 +0900 Subject: [PATCH 65/76] net/net_rx_pkt: Use iovec for net_rx_pkt_set_protocols() igb does not properly ensure the buffer passed to net_rx_pkt_set_protocols() is contiguous for the entire L2/L3/L4 header. Allow it to pass scattered data to net_rx_pkt_set_protocols(). Fixes: 3a977deebe ("Intrdocue igb device emulation") Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang (cherry picked from commit 2f0fa232b8c330df029120a6824c8be3d4eb5cae) Signed-off-by: Michael Tokarev --- hw/net/igb_core.c | 2 +- hw/net/net_rx_pkt.c | 14 +++++--------- hw/net/net_rx_pkt.h | 10 ++++++---- hw/net/virtio-net.c | 7 +++++-- hw/net/vmxnet3.c | 7 ++++++- include/net/eth.h | 6 +++--- net/eth.c | 18 ++++++++---------- 7 files changed, 34 insertions(+), 30 deletions(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index e0ee70d1d7..adcb8ac6f3 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -1649,7 +1649,7 @@ igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, ehdr = PKT_GET_ETH_HDR(filter_buf); net_rx_pkt_set_packet_type(core->rx_pkt, get_eth_packet_type(ehdr)); - net_rx_pkt_set_protocols(core->rx_pkt, filter_buf, size); + net_rx_pkt_set_protocols(core->rx_pkt, iov, iovcnt, iov_ofs); queues = igb_receive_assign(core, ehdr, size, &rss_info, external_tx); if (!queues) { diff --git a/hw/net/net_rx_pkt.c b/hw/net/net_rx_pkt.c index 39cdea06de..63be6e05ad 100644 --- a/hw/net/net_rx_pkt.c +++ b/hw/net/net_rx_pkt.c @@ -103,7 +103,7 @@ net_rx_pkt_pull_data(struct NetRxPkt *pkt, iov, iovcnt, ploff, pkt->tot_len); } - eth_get_protocols(pkt->vec, pkt->vec_len, &pkt->hasip4, &pkt->hasip6, + eth_get_protocols(pkt->vec, pkt->vec_len, 0, &pkt->hasip4, &pkt->hasip6, &pkt->l3hdr_off, &pkt->l4hdr_off, &pkt->l5hdr_off, &pkt->ip6hdr_info, &pkt->ip4hdr_info, &pkt->l4hdr_info); @@ -186,17 +186,13 @@ size_t net_rx_pkt_get_total_len(struct NetRxPkt *pkt) return pkt->tot_len; } -void net_rx_pkt_set_protocols(struct NetRxPkt *pkt, const void *data, - size_t len) +void net_rx_pkt_set_protocols(struct NetRxPkt *pkt, + const struct iovec *iov, size_t iovcnt, + size_t iovoff) { - const struct iovec iov = { - .iov_base = (void *)data, - .iov_len = len - }; - assert(pkt); - eth_get_protocols(&iov, 1, &pkt->hasip4, &pkt->hasip6, + eth_get_protocols(iov, iovcnt, iovoff, &pkt->hasip4, &pkt->hasip6, &pkt->l3hdr_off, &pkt->l4hdr_off, &pkt->l5hdr_off, &pkt->ip6hdr_info, &pkt->ip4hdr_info, &pkt->l4hdr_info); } diff --git a/hw/net/net_rx_pkt.h b/hw/net/net_rx_pkt.h index d00b484900..a06f5c2675 100644 --- a/hw/net/net_rx_pkt.h +++ b/hw/net/net_rx_pkt.h @@ -55,12 +55,14 @@ size_t net_rx_pkt_get_total_len(struct NetRxPkt *pkt); * parse and set packet analysis results * * @pkt: packet - * @data: pointer to the data buffer to be parsed - * @len: data length + * @iov: received data scatter-gather list + * @iovcnt: number of elements in iov + * @iovoff: data start offset in the iov * */ -void net_rx_pkt_set_protocols(struct NetRxPkt *pkt, const void *data, - size_t len); +void net_rx_pkt_set_protocols(struct NetRxPkt *pkt, + const struct iovec *iov, size_t iovcnt, + size_t iovoff); /** * fetches packet analysis results diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 4ea33b6e2e..af1e89706c 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1834,9 +1834,12 @@ static int virtio_net_process_rss(NetClientState *nc, const uint8_t *buf, VIRTIO_NET_HASH_REPORT_UDPv6, VIRTIO_NET_HASH_REPORT_UDPv6_EX }; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = size + }; - net_rx_pkt_set_protocols(pkt, buf + n->host_hdr_len, - size - n->host_hdr_len); + net_rx_pkt_set_protocols(pkt, &iov, 1, n->host_hdr_len); net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto); net_hash_type = virtio_net_get_hash_type(hasip4, hasip6, l4hdr_proto, n->rss_data.hash_types); diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c index f7b874c139..cb52db96a2 100644 --- a/hw/net/vmxnet3.c +++ b/hw/net/vmxnet3.c @@ -2001,7 +2001,12 @@ vmxnet3_receive(NetClientState *nc, const uint8_t *buf, size_t size) get_eth_packet_type(PKT_GET_ETH_HDR(buf))); if (vmxnet3_rx_filter_may_indicate(s, buf, size)) { - net_rx_pkt_set_protocols(s->rx_pkt, buf, size); + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = size + }; + + net_rx_pkt_set_protocols(s->rx_pkt, &iov, 1, 0); vmxnet3_rx_need_csum_calculate(s->rx_pkt, buf, size); net_rx_pkt_attach_data(s->rx_pkt, buf, size, s->rx_vlan_stripping); bytes_indicated = vmxnet3_indicate_packet(s) ? size : -1; diff --git a/include/net/eth.h b/include/net/eth.h index c5ae4493b4..9f19c3a695 100644 --- a/include/net/eth.h +++ b/include/net/eth.h @@ -312,10 +312,10 @@ eth_get_l2_hdr_length(const void *p) } static inline uint32_t -eth_get_l2_hdr_length_iov(const struct iovec *iov, int iovcnt) +eth_get_l2_hdr_length_iov(const struct iovec *iov, size_t iovcnt, size_t iovoff) { uint8_t p[sizeof(struct eth_header) + sizeof(struct vlan_header)]; - size_t copied = iov_to_buf(iov, iovcnt, 0, p, ARRAY_SIZE(p)); + size_t copied = iov_to_buf(iov, iovcnt, iovoff, p, ARRAY_SIZE(p)); if (copied < ARRAY_SIZE(p)) { return copied; @@ -397,7 +397,7 @@ typedef struct eth_l4_hdr_info_st { bool has_tcp_data; } eth_l4_hdr_info; -void eth_get_protocols(const struct iovec *iov, int iovcnt, +void eth_get_protocols(const struct iovec *iov, size_t iovcnt, size_t iovoff, bool *hasip4, bool *hasip6, size_t *l3hdr_off, size_t *l4hdr_off, diff --git a/net/eth.c b/net/eth.c index 70bcd8e355..d7b30df79f 100644 --- a/net/eth.c +++ b/net/eth.c @@ -136,7 +136,7 @@ _eth_tcp_has_data(bool is_ip4, return l4len > TCP_HEADER_DATA_OFFSET(tcp); } -void eth_get_protocols(const struct iovec *iov, int iovcnt, +void eth_get_protocols(const struct iovec *iov, size_t iovcnt, size_t iovoff, bool *hasip4, bool *hasip6, size_t *l3hdr_off, size_t *l4hdr_off, @@ -147,26 +147,24 @@ void eth_get_protocols(const struct iovec *iov, int iovcnt, { int proto; bool fragment = false; - size_t l2hdr_len = eth_get_l2_hdr_length_iov(iov, iovcnt); size_t input_size = iov_size(iov, iovcnt); size_t copied; uint8_t ip_p; *hasip4 = *hasip6 = false; + *l3hdr_off = iovoff + eth_get_l2_hdr_length_iov(iov, iovcnt, iovoff); l4hdr_info->proto = ETH_L4_HDR_PROTO_INVALID; - proto = eth_get_l3_proto(iov, iovcnt, l2hdr_len); - - *l3hdr_off = l2hdr_len; + proto = eth_get_l3_proto(iov, iovcnt, *l3hdr_off); if (proto == ETH_P_IP) { struct ip_header *iphdr = &ip4hdr_info->ip4_hdr; - if (input_size < l2hdr_len) { + if (input_size < *l3hdr_off) { return; } - copied = iov_to_buf(iov, iovcnt, l2hdr_len, iphdr, sizeof(*iphdr)); + copied = iov_to_buf(iov, iovcnt, *l3hdr_off, iphdr, sizeof(*iphdr)); if (copied < sizeof(*iphdr) || IP_HEADER_VERSION(iphdr) != IP_HEADER_VERSION_4) { return; @@ -175,17 +173,17 @@ void eth_get_protocols(const struct iovec *iov, int iovcnt, *hasip4 = true; ip_p = iphdr->ip_p; ip4hdr_info->fragment = IP4_IS_FRAGMENT(iphdr); - *l4hdr_off = l2hdr_len + IP_HDR_GET_LEN(iphdr); + *l4hdr_off = *l3hdr_off + IP_HDR_GET_LEN(iphdr); fragment = ip4hdr_info->fragment; } else if (proto == ETH_P_IPV6) { - if (!eth_parse_ipv6_hdr(iov, iovcnt, l2hdr_len, ip6hdr_info)) { + if (!eth_parse_ipv6_hdr(iov, iovcnt, *l3hdr_off, ip6hdr_info)) { return; } *hasip6 = true; ip_p = ip6hdr_info->l4proto; - *l4hdr_off = l2hdr_len + ip6hdr_info->full_hdr_len; + *l4hdr_off = *l3hdr_off + ip6hdr_info->full_hdr_len; fragment = ip6hdr_info->fragment; } else { return; From c84bcff3d3877658adf0bca3d8087488ecf0f3d9 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:42:59 +0900 Subject: [PATCH 66/76] e1000e: Always copy ethernet header e1000e_receive_internal() used to check the iov length to determine copy the iovs to a contiguous buffer, but the check is flawed in two ways: - It does not ensure that iovcnt > 0. - It does not take virtio-net header into consideration. The size of this copy is just 18 octets, which can be even less than the code size required for checks. This (wrong) optimization is probably not worth so just remove it. Fixes: 6f3fbe4ed0 ("net: Introduce e1000e device emulation") Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang (cherry picked from commit 310a128eae12339f97f6c940a7ddf92f40d283e4) Signed-off-by: Michael Tokarev --- hw/net/e1000e_core.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index a74f1bc245..85b9eb799c 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -1685,12 +1685,9 @@ static ssize_t e1000e_receive_internal(E1000ECore *core, const struct iovec *iov, int iovcnt, bool has_vnet) { - static const int maximum_ethernet_hdr_len = (ETH_HLEN + 4); - uint32_t n = 0; - uint8_t min_buf[ETH_ZLEN]; + uint8_t buf[ETH_ZLEN]; struct iovec min_iov; - uint8_t *filter_buf; size_t size, orig_size; size_t iov_ofs = 0; E1000E_RxRing rxr; @@ -1713,24 +1710,21 @@ e1000e_receive_internal(E1000ECore *core, const struct iovec *iov, int iovcnt, net_rx_pkt_unset_vhdr(core->rx_pkt); } - filter_buf = iov->iov_base + iov_ofs; orig_size = iov_size(iov, iovcnt); size = orig_size - iov_ofs; /* Pad to minimum Ethernet frame length */ - if (size < sizeof(min_buf)) { - iov_to_buf(iov, iovcnt, iov_ofs, min_buf, size); - memset(&min_buf[size], 0, sizeof(min_buf) - size); + if (size < sizeof(buf)) { + iov_to_buf(iov, iovcnt, iov_ofs, buf, size); + memset(&buf[size], 0, sizeof(buf) - size); e1000x_inc_reg_if_not_full(core->mac, RUC); - min_iov.iov_base = filter_buf = min_buf; - min_iov.iov_len = size = sizeof(min_buf); + min_iov.iov_base = buf; + min_iov.iov_len = size = sizeof(buf); iovcnt = 1; iov = &min_iov; iov_ofs = 0; - } else if (iov->iov_len < maximum_ethernet_hdr_len) { - /* This is very unlikely, but may happen. */ - iov_to_buf(iov, iovcnt, iov_ofs, min_buf, maximum_ethernet_hdr_len); - filter_buf = min_buf; + } else { + iov_to_buf(iov, iovcnt, iov_ofs, buf, ETH_HLEN + 4); } /* Discard oversized packets if !LPE and !SBP. */ @@ -1739,9 +1733,9 @@ e1000e_receive_internal(E1000ECore *core, const struct iovec *iov, int iovcnt, } net_rx_pkt_set_packet_type(core->rx_pkt, - get_eth_packet_type(PKT_GET_ETH_HDR(filter_buf))); + get_eth_packet_type(PKT_GET_ETH_HDR(buf))); - if (!e1000e_receive_filter(core, filter_buf, size)) { + if (!e1000e_receive_filter(core, buf, size)) { trace_e1000e_rx_flt_dropped(); return orig_size; } From 02bd13ae3acb3ff3c3c2bcea142d5aee880e3a88 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:00 +0900 Subject: [PATCH 67/76] igb: Always copy ethernet header igb_receive_internal() used to check the iov length to determine copy the iovs to a contiguous buffer, but the check is flawed in two ways: - It does not ensure that iovcnt > 0. - It does not take virtio-net header into consideration. The size of this copy is just 22 octets, which can be even less than the code size required for checks. This (wrong) optimization is probably not worth so just remove it. Removing this also allows igb to assume aligned accesses for the ethernet header. Fixes: 3a977deebe ("Intrdocue igb device emulation") Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang (cherry picked from commit dc9ef1bf454811646b3ee6387f1b96f63f538a18) Signed-off-by: Michael Tokarev --- hw/net/igb_core.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index adcb8ac6f3..ce2f2186ab 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -67,6 +67,11 @@ typedef struct IGBTxPktVmdqCallbackContext { NetClientState *nc; } IGBTxPktVmdqCallbackContext; +typedef struct L2Header { + struct eth_header eth; + struct vlan_header vlan; +} L2Header; + static ssize_t igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, bool has_vnet, bool *external_tx); @@ -960,15 +965,16 @@ igb_rx_is_oversized(IGBCore *core, uint16_t qn, size_t size) return size > (lpe ? max_ethernet_lpe_size : max_ethernet_vlan_size); } -static uint16_t igb_receive_assign(IGBCore *core, const struct eth_header *ehdr, +static uint16_t igb_receive_assign(IGBCore *core, const L2Header *l2_header, size_t size, E1000E_RSSInfo *rss_info, bool *external_tx) { static const int ta_shift[] = { 4, 3, 2, 0 }; + const struct eth_header *ehdr = &l2_header->eth; uint32_t f, ra[2], *macp, rctl = core->mac[RCTL]; uint16_t queues = 0; uint16_t oversized = 0; - uint16_t vid = lduw_be_p(&PKT_GET_VLAN_HDR(ehdr)->h_tci) & VLAN_VID_MASK; + uint16_t vid = be16_to_cpu(l2_header->vlan.h_tci) & VLAN_VID_MASK; bool accepted = false; int i; @@ -1589,14 +1595,13 @@ static ssize_t igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, bool has_vnet, bool *external_tx) { - static const int maximum_ethernet_hdr_len = (ETH_HLEN + 4); - uint16_t queues = 0; uint32_t n = 0; - uint8_t min_buf[ETH_ZLEN]; + union { + L2Header l2_header; + uint8_t octets[ETH_ZLEN]; + } buf; struct iovec min_iov; - struct eth_header *ehdr; - uint8_t *filter_buf; size_t size, orig_size; size_t iov_ofs = 0; E1000E_RxRing rxr; @@ -1622,24 +1627,21 @@ igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, net_rx_pkt_unset_vhdr(core->rx_pkt); } - filter_buf = iov->iov_base + iov_ofs; orig_size = iov_size(iov, iovcnt); size = orig_size - iov_ofs; /* Pad to minimum Ethernet frame length */ - if (size < sizeof(min_buf)) { - iov_to_buf(iov, iovcnt, iov_ofs, min_buf, size); - memset(&min_buf[size], 0, sizeof(min_buf) - size); + if (size < sizeof(buf)) { + iov_to_buf(iov, iovcnt, iov_ofs, &buf, size); + memset(&buf.octets[size], 0, sizeof(buf) - size); e1000x_inc_reg_if_not_full(core->mac, RUC); - min_iov.iov_base = filter_buf = min_buf; - min_iov.iov_len = size = sizeof(min_buf); + min_iov.iov_base = &buf; + min_iov.iov_len = size = sizeof(buf); iovcnt = 1; iov = &min_iov; iov_ofs = 0; - } else if (iov->iov_len < maximum_ethernet_hdr_len) { - /* This is very unlikely, but may happen. */ - iov_to_buf(iov, iovcnt, iov_ofs, min_buf, maximum_ethernet_hdr_len); - filter_buf = min_buf; + } else { + iov_to_buf(iov, iovcnt, iov_ofs, &buf, sizeof(buf.l2_header)); } /* Discard oversized packets if !LPE and !SBP. */ @@ -1647,11 +1649,12 @@ igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, return orig_size; } - ehdr = PKT_GET_ETH_HDR(filter_buf); - net_rx_pkt_set_packet_type(core->rx_pkt, get_eth_packet_type(ehdr)); + net_rx_pkt_set_packet_type(core->rx_pkt, + get_eth_packet_type(&buf.l2_header.eth)); net_rx_pkt_set_protocols(core->rx_pkt, iov, iovcnt, iov_ofs); - queues = igb_receive_assign(core, ehdr, size, &rss_info, external_tx); + queues = igb_receive_assign(core, &buf.l2_header, size, + &rss_info, external_tx); if (!queues) { trace_e1000e_rx_flt_dropped(); return orig_size; From fae9449998e73cb52ddfc6453bee49cf3a518372 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 13 Apr 2023 13:19:46 -0400 Subject: [PATCH 68/76] rtl8139: fix large_send_mss divide-by-zero MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the driver sets large_send_mss to 0 then a divide-by-zero occurs. Even if the division wasn't a problem, the for loop that emits MSS-sized packets would never terminate. Solve these issues by skipping offloading when large_send_mss=0. This issue was found by OSS-Fuzz as part of Alexander Bulekov's device fuzzing work. The reproducer is: $ cat << EOF | ./qemu-system-i386 -display none -machine accel=qtest, -m \ 512M,slots=1,maxmem=0xffff000000000000 -machine q35 -nodefaults -device \ rtl8139,netdev=net0 -netdev user,id=net0 -device \ pc-dimm,id=nv1,memdev=mem1,addr=0xb800a64602800000 -object \ memory-backend-ram,id=mem1,size=2M -qtest stdio outl 0xcf8 0x80000814 outl 0xcfc 0xe0000000 outl 0xcf8 0x80000804 outw 0xcfc 0x06 write 0xe0000037 0x1 0x04 write 0xe00000e0 0x2 0x01 write 0x1 0x1 0x04 write 0x3 0x1 0x98 write 0xa 0x1 0x8c write 0xb 0x1 0x02 write 0xc 0x1 0x46 write 0xd 0x1 0xa6 write 0xf 0x1 0xb8 write 0xb800a646028c000c 0x1 0x08 write 0xb800a646028c000e 0x1 0x47 write 0xb800a646028c0010 0x1 0x02 write 0xb800a646028c0017 0x1 0x06 write 0xb800a646028c0036 0x1 0x80 write 0xe00000d9 0x1 0x40 EOF Buglink: https://gitlab.com/qemu-project/qemu/-/issues/1582 Closes: https://gitlab.com/qemu-project/qemu/-/issues/1582 Cc: qemu-stable@nongnu.org Cc: Peter Maydell Fixes: 6d71357a3b65 ("rtl8139: honor large send MSS value") Reported-by: Alexander Bulekov Reviewed-by: Philippe Mathieu-Daudé Tested-by: Alexander Bulekov Signed-off-by: Stefan Hajnoczi Signed-off-by: Jason Wang (cherry picked from commit 792676c165159c11412346870fd58fd243ab2166) Signed-off-by: Michael Tokarev --- hw/net/rtl8139.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c index 5a5aaf868d..5f1a4d359b 100644 --- a/hw/net/rtl8139.c +++ b/hw/net/rtl8139.c @@ -2154,6 +2154,9 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s) int large_send_mss = (txdw0 >> CP_TC_LGSEN_MSS_SHIFT) & CP_TC_LGSEN_MSS_MASK; + if (large_send_mss == 0) { + goto skip_offload; + } DPRINTF("+++ C+ mode offloaded task TSO IP data %d " "frame data %d specified MSS=%d\n", From 668aeea0ecb651ce3d8e42cc4fa59e29dd9f009e Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:39:12 +0900 Subject: [PATCH 69/76] util/vfio-helpers: Use g_file_read_link() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When _FORTIFY_SOURCE=2, glibc version is 2.35, and GCC version is 12.1.0, the compiler complains as follows: In file included from /usr/include/features.h:490, from /usr/include/bits/libc-header-start.h:33, from /usr/include/stdint.h:26, from /usr/lib/gcc/aarch64-unknown-linux-gnu/12.1.0/include/stdint.h:9, from /home/alarm/q/var/qemu/include/qemu/osdep.h:94, from ../util/vfio-helpers.c:13: In function 'readlink', inlined from 'sysfs_find_group_file' at ../util/vfio-helpers.c:116:9, inlined from 'qemu_vfio_init_pci' at ../util/vfio-helpers.c:326:18, inlined from 'qemu_vfio_open_pci' at ../util/vfio-helpers.c:517:9: /usr/include/bits/unistd.h:119:10: error: argument 2 is null but the corresponding size argument 3 value is 4095 [-Werror=nonnull] 119 | return __glibc_fortify (readlink, __len, sizeof (char), | ^~~~~~~~~~~~~~~ This error implies the allocated buffer can be NULL. Use g_file_read_link(), which allocates buffer automatically to avoid the error. Signed-off-by: Akihiko Odaki Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater (cherry picked from commit dbdea0dbfe2cef9ef6c752e9077e4fc98724194c) Signed-off-by: Michael Tokarev --- util/vfio-helpers.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c index 2d8af38f88..f8bab46c68 100644 --- a/util/vfio-helpers.c +++ b/util/vfio-helpers.c @@ -106,15 +106,17 @@ struct QEMUVFIOState { */ static char *sysfs_find_group_file(const char *device, Error **errp) { + g_autoptr(GError) gerr = NULL; char *sysfs_link; char *sysfs_group; char *p; char *path = NULL; sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device); - sysfs_group = g_malloc0(PATH_MAX); - if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) { - error_setg_errno(errp, errno, "Failed to find iommu group sysfs path"); + sysfs_group = g_file_read_link(sysfs_link, &gerr); + if (gerr) { + error_setg(errp, "Failed to find iommu group sysfs path: %s", + gerr->message); goto out; } p = strrchr(sysfs_group, '/'); From 9d622451fdf9693a2265d5c04b041627f81d8c1d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 23 May 2023 17:58:40 +0200 Subject: [PATCH 70/76] usb/ohci: Set pad to 0 after frame update When the OHCI controller's framenumber is incremented, HccaPad1 register should be set to zero (Ref OHCI Spec 4.4) ReactOS uses hccaPad1 to determine if the OHCI hardware is running, consequently it fails this check in current qemu master. Signed-off-by: Ryan Wendland Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1048 Signed-off-by: Paolo Bonzini (cherry picked from commit 6301460ce9f59885e8feb65185bcfb6b128c8eff) Signed-off-by: Michael Tokarev --- hw/usb/hcd-ohci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/usb/hcd-ohci.c b/hw/usb/hcd-ohci.c index 88d2b4b13c..cc5cde6983 100644 --- a/hw/usb/hcd-ohci.c +++ b/hw/usb/hcd-ohci.c @@ -1239,6 +1239,8 @@ static void ohci_frame_boundary(void *opaque) /* Increment frame number and take care of endianness. */ ohci->frame_number = (ohci->frame_number + 1) & 0xffff; hcca.frame = cpu_to_le16(ohci->frame_number); + /* When the HC updates frame number, set pad to 0. Ref OHCI Spec 4.4.1*/ + hcca.pad = 0; if (ohci->done_count == 0 && !(ohci->intr_status & OHCI_INTR_WD)) { if (!ohci->done) { From e49884a90987744ddb54b2fadc770633eb6a4d62 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Mon, 22 May 2023 11:10:11 +0200 Subject: [PATCH 71/76] hw/scsi/lsi53c895a: Fix reentrancy issues in the LSI controller (CVE-2023-0330) We cannot use the generic reentrancy guard in the LSI code, so we have to manually prevent endless reentrancy here. The problematic lsi_execute_script() function has already a way to detect whether too many instructions have been executed - we just have to slightly change the logic here that it also takes into account if the function has been called too often in a reentrant way. The code in fuzz-lsi53c895a-test.c has been taken from an earlier patch by Mauro Matteo Cascella. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1563 Message-Id: <20230522091011.1082574-1-thuth@redhat.com> Reviewed-by: Stefan Hajnoczi Reviewed-by: Alexander Bulekov Signed-off-by: Thomas Huth (cherry picked from commit b987718bbb1d0eabf95499b976212dd5f0120d75) Signed-off-by: Michael Tokarev --- hw/scsi/lsi53c895a.c | 23 +++++++++++++++------ tests/qtest/fuzz-lsi53c895a-test.c | 33 ++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c index af93557a9a..bbf32d3f73 100644 --- a/hw/scsi/lsi53c895a.c +++ b/hw/scsi/lsi53c895a.c @@ -1134,15 +1134,24 @@ static void lsi_execute_script(LSIState *s) uint32_t addr, addr_high; int opcode; int insn_processed = 0; + static int reentrancy_level; + + reentrancy_level++; s->istat1 |= LSI_ISTAT1_SRUN; again: - if (++insn_processed > LSI_MAX_INSN) { - /* Some windows drivers make the device spin waiting for a memory - location to change. If we have been executed a lot of code then - assume this is the case and force an unexpected device disconnect. - This is apparently sufficient to beat the drivers into submission. - */ + /* + * Some windows drivers make the device spin waiting for a memory location + * to change. If we have executed more than LSI_MAX_INSN instructions then + * assume this is the case and force an unexpected device disconnect. This + * is apparently sufficient to beat the drivers into submission. + * + * Another issue (CVE-2023-0330) can occur if the script is programmed to + * trigger itself again and again. Avoid this problem by stopping after + * being called multiple times in a reentrant way (8 is an arbitrary value + * which should be enough for all valid use cases). + */ + if (++insn_processed > LSI_MAX_INSN || reentrancy_level > 8) { if (!(s->sien0 & LSI_SIST0_UDC)) { qemu_log_mask(LOG_GUEST_ERROR, "lsi_scsi: inf. loop with UDC masked"); @@ -1596,6 +1605,8 @@ again: } } trace_lsi_execute_script_stop(); + + reentrancy_level--; } static uint8_t lsi_reg_readb(LSIState *s, int offset) diff --git a/tests/qtest/fuzz-lsi53c895a-test.c b/tests/qtest/fuzz-lsi53c895a-test.c index 2012bd54b7..1b55928b9f 100644 --- a/tests/qtest/fuzz-lsi53c895a-test.c +++ b/tests/qtest/fuzz-lsi53c895a-test.c @@ -8,6 +8,36 @@ #include "qemu/osdep.h" #include "libqtest.h" +/* + * This used to trigger a DMA reentrancy issue + * leading to memory corruption bugs like stack + * overflow or use-after-free + * https://gitlab.com/qemu-project/qemu/-/issues/1563 + */ +static void test_lsi_dma_reentrancy(void) +{ + QTestState *s; + + s = qtest_init("-M q35 -m 512M -nodefaults " + "-blockdev driver=null-co,node-name=null0 " + "-device lsi53c810 -device scsi-cd,drive=null0"); + + qtest_outl(s, 0xcf8, 0x80000804); /* PCI Command Register */ + qtest_outw(s, 0xcfc, 0x7); /* Enables accesses */ + qtest_outl(s, 0xcf8, 0x80000814); /* Memory Bar 1 */ + qtest_outl(s, 0xcfc, 0xff100000); /* Set MMIO Address*/ + qtest_outl(s, 0xcf8, 0x80000818); /* Memory Bar 2 */ + qtest_outl(s, 0xcfc, 0xff000000); /* Set RAM Address*/ + qtest_writel(s, 0xff000000, 0xc0000024); + qtest_writel(s, 0xff000114, 0x00000080); + qtest_writel(s, 0xff00012c, 0xff000000); + qtest_writel(s, 0xff000004, 0xff000114); + qtest_writel(s, 0xff000008, 0xff100014); + qtest_writel(s, 0xff10002f, 0x000000ff); + + qtest_quit(s); +} + /* * This used to trigger a UAF in lsi_do_msgout() * https://gitlab.com/qemu-project/qemu/-/issues/972 @@ -124,5 +154,8 @@ int main(int argc, char **argv) qtest_add_func("fuzz/lsi53c895a/lsi_do_msgout_cancel_req", test_lsi_do_msgout_cancel_req); + qtest_add_func("fuzz/lsi53c895a/lsi_dma_reentrancy", + test_lsi_dma_reentrancy); + return g_test_run(); } From 134253a4fe9a1307308f2908424c354f989ee52f Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 22 May 2023 15:17:17 +0200 Subject: [PATCH 72/76] machine: do not crash if default RAM backend name has been stolen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QEMU aborts when default RAM backend should be used (i.e. no explicit '-machine memory-backend=' specified) but user has created an object which 'id' equals to default RAM backend name used by board. $QEMU -machine pc \ -object memory-backend-ram,id=pc.ram,size=4294967296 Actual results: QEMU 7.2.0 monitor - type 'help' for more information (qemu) Unexpected error in object_property_try_add() at ../qom/object.c:1239: qemu-kvm: attempt to add duplicate property 'pc.ram' to object (type 'container') Aborted (core dumped) Instead of abort, check for the conflicting 'id' and exit with an error, suggesting how to remedy the issue. Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2207886 Signed-off-by: Igor Mammedov Message-Id: <20230522131717.3780533-1-imammedo@redhat.com> Tested-by: Thomas Huth Reviewed-by: Thomas Huth Reviewed-by: Shaoqin Huang Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Thomas Huth (cherry picked from commit a37531f2381c4e294e48b1417089474128388b44) Signed-off-by: Michael Tokarev --- hw/core/machine.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hw/core/machine.c b/hw/core/machine.c index 5060119952..2f6ccf5623 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -1333,6 +1333,14 @@ void machine_run_board_init(MachineState *machine, const char *mem_path, Error * } } else if (machine_class->default_ram_id && machine->ram_size && numa_uses_legacy_mem()) { + if (object_property_find(object_get_objects_root(), + machine_class->default_ram_id)) { + error_setg(errp, "object name '%s' is reserved for the default" + " RAM backend, it can't be used for any other purposes." + " Change the object's 'id' to something else", + machine_class->default_ram_id); + return; + } if (!create_default_memdev(current_machine, mem_path, errp)) { return; } From ff692a15bbc13b6a12ce9d5e6802bdfc209c1ea3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 15 May 2023 15:26:04 +0200 Subject: [PATCH 73/76] virtio: qmp: fix memory leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VirtioInfoList is already allocated by QAPI_LIST_PREPEND and need not be allocated by the caller. Fixes Coverity CID 1508724. Reviewed-by: Daniel P. Berrangé Signed-off-by: Paolo Bonzini (cherry picked from commit 0bfd14149b248e8097ea4da1f9d53beb5c5b0cca) Signed-off-by: Michael Tokarev --- hw/virtio/virtio-qmp.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/hw/virtio/virtio-qmp.c b/hw/virtio/virtio-qmp.c index b70148aba9..3d7ce2ea2f 100644 --- a/hw/virtio/virtio-qmp.c +++ b/hw/virtio/virtio-qmp.c @@ -666,7 +666,7 @@ VirtioDeviceFeatures *qmp_decode_features(uint16_t device_id, uint64_t bitmap) VirtioInfoList *qmp_x_query_virtio(Error **errp) { VirtioInfoList *list = NULL; - VirtioInfoList *node; + VirtioInfo *node; VirtIODevice *vdev; QTAILQ_FOREACH(vdev, &virtio_list, next) { @@ -680,11 +680,10 @@ VirtioInfoList *qmp_x_query_virtio(Error **errp) if (!strncmp(is_realized->str, "false", 4)) { QTAILQ_REMOVE(&virtio_list, vdev, next); } else { - node = g_new0(VirtioInfoList, 1); - node->value = g_new(VirtioInfo, 1); - node->value->path = g_strdup(dev->canonical_path); - node->value->name = g_strdup(vdev->name); - QAPI_LIST_PREPEND(list, node->value); + node = g_new(VirtioInfo, 1); + node->path = g_strdup(dev->canonical_path); + node->name = g_strdup(vdev->name); + QAPI_LIST_PREPEND(list, node); } g_string_free(is_realized, true); } From dabb4183d17dc11c44d7deb840151b79a53673af Mon Sep 17 00:00:00 2001 From: Michael Tokarev Date: Mon, 29 May 2023 18:19:22 +0300 Subject: [PATCH 74/76] Update version for 8.0.1 release Signed-off-by: Michael Tokarev --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index ae9a76b924..cd1d2e94f3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -8.0.0 +8.0.1 From bea933e430249fb81e1c020ddf50511d0481d56e Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 10 May 2023 22:35:55 +0200 Subject: [PATCH 75/76] block/export: Fix null pointer dereference in error path There are some error paths in blk_exp_add() that jump to 'fail:' before 'exp' is even created. So we can't just unconditionally access exp->blk. Add a NULL check, and switch from exp->blk to blk, which is available earlier, just to be extra sure that we really cover all cases where BlockDevOps could have been set for it (in practice, this only happens in drv->create() today, so this part of the change isn't strictly necessary). Fixes: Coverity CID 1509238 Fixes: de79b52604e43fdeba6cee4f5af600b62169f2d2 Signed-off-by: Kevin Wolf Message-Id: <20230510203601.418015-3-kwolf@redhat.com> Reviewed-by: Eric Blake Tested-by: Eric Blake Signed-off-by: Kevin Wolf (cherry picked from commit a184563778f2b8970eb93291f08108e66432a575) Signed-off-by: Michael Tokarev --- block/export/export.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/export/export.c b/block/export/export.c index cdef298902..33f8f8ffa4 100644 --- a/block/export/export.c +++ b/block/export/export.c @@ -192,8 +192,10 @@ BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp) return exp; fail: - blk_set_dev_ops(exp->blk, NULL, NULL); - blk_unref(blk); + if (blk) { + blk_set_dev_ops(blk, NULL, NULL); + blk_unref(blk); + } aio_context_release(ctx); if (exp) { g_free(exp->id); From f7f686b61cf7ee142c9264d2e04ac2c6a96d37f8 Mon Sep 17 00:00:00 2001 From: Michael Tokarev Date: Tue, 30 May 2023 14:59:47 +0300 Subject: [PATCH 76/76] Update version for 8.0.2 release Signed-off-by: Michael Tokarev --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index cd1d2e94f3..8b22a322d0 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -8.0.1 +8.0.2