
[Bug 2111521] Re: nvme no longer detected on boot after upgrade to 6.8.0-60

@Marks,

Could you help test a new kernel? It has the patch below applied, as requested by the community. The patch adds pci_info() tracing to pci_dev_wait() and the link-wait paths, starts the post-reset polling delay at 10 ms instead of 1 ms (and no longer doubles it on each retry), and enables and clears Unsupported Request reporting on the root port before polling. Please also boot with dyndbg="file drivers/pci/* +p" and collect the resulting dmesg. Thanks.

https://people.canonical.com/~hwang4/lp2111521/test13/
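
For reference, here is one way to pass that option at boot (a minimal sketch assuming GRUB; "quiet splash" below is just a placeholder for whatever is already on your command line). Edit /etc/default/grub:

    # /etc/default/grub -- the outer single quotes preserve the inner double quotes
    GRUB_CMDLINE_LINUX_DEFAULT='quiet splash dyndbg="file drivers/pci/* +p"'

then run "sudo update-grub" and reboot into the test kernel.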

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 9e42090fb108..86012399597e 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1264,10 +1264,13 @@ void pci_resume_bus(struct pci_bus *bus)

static int pci_dev_wait(struct pci_dev *dev, char *reset_type, int timeout)
{
- int delay = 1;
+ int delay = 10;
bool retrain = false;
struct pci_dev *root, *bridge;
+ u16 devctl, devsta;

+ pci_info(dev, "%s: VF%c %s timeout %d\n", __func__,
+ dev->is_virtfn ? '+' : '-', reset_type, timeout);
root = pcie_find_root_port(dev);

if (pci_is_pcie(dev)) {
@@ -1276,6 +1279,19 @@ static int pci_dev_wait(struct pci_dev *dev, char *reset_type, int timeout)
retrain = true;
}

+ if (root) {
+ pcie_capability_read_word(root, PCI_EXP_DEVCTL, &devctl);
+ if (!(devctl & PCI_EXP_DEVCTL_URRE))
+ pcie_capability_write_word(root, PCI_EXP_DEVCTL,
+ devctl | PCI_EXP_DEVCTL_URRE);
+ pcie_capability_read_word(root, PCI_EXP_DEVSTA, &devsta);
+ if (devsta & PCI_EXP_DEVSTA_URD)
+ pcie_capability_write_word(root, PCI_EXP_DEVSTA,
+ PCI_EXP_DEVSTA_URD);
+ pci_info(root, "%s: DEVCTL %#06x DEVSTA %#06x\n", __func__,
+ devctl, devsta);
+ }
+
/*
* The caller has already waited long enough after a reset that the
* device should respond to config requests, but it may respond
@@ -1305,14 +1321,33 @@ static int pci_dev_wait(struct pci_dev *dev, char *reset_type, int timeout)

if (root && root->config_rrs_sv) {
pci_read_config_dword(dev, PCI_VENDOR_ID, &id);
- if (!pci_bus_rrs_vendor_id(id))
- break;
+
+ if (pci_bus_rrs_vendor_id(id)) {
+ pci_info(dev, "%s: read %#06x (RRS)\n",
+ __func__, id);
+ goto retry;
+ }
+
+ if (PCI_POSSIBLE_ERROR(id)) {
+ pcie_capability_read_word(root, PCI_EXP_DEVSTA,
+ &devsta);
+ if (devsta & PCI_EXP_DEVSTA_URD)
+ pcie_capability_write_word(root,
+ PCI_EXP_DEVSTA,
+ PCI_EXP_DEVSTA_URD);
+ pci_info(root, "%s: read %#06x DEVSTA %#06x\n",
+ __func__, id, devsta);
+ goto retry;
+ }
+
+ break;
} else {
pci_read_config_dword(dev, PCI_COMMAND, &id);
if (!PCI_POSSIBLE_ERROR(id))
break;
}

+retry:
if (delay > timeout) {
pci_warn(dev, "not ready %dms after %s; giving up\n",
delay - 1, reset_type);
@@ -1332,7 +1367,6 @@ static int pci_dev_wait(struct pci_dev *dev, char *reset_type, int timeout)
}

msleep(delay);
- delay *= 2;
}

if (delay > PCI_RESET_WAIT)
@@ -4671,8 +4705,10 @@ static int pcie_wait_for_link_status(struct pci_dev *pdev,
end_jiffies = jiffies + msecs_to_jiffies(PCIE_LINK_RETRAIN_TIMEOUT_MS);
do {
pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnksta);
- if ((lnksta & lnksta_mask) == lnksta_match)
+ if ((lnksta & lnksta_mask) == lnksta_match) {
+ pci_info(pdev, "%s: LNKSTA %#06x\n", __func__, lnksta);
return 0;
+ }
msleep(1);
} while (time_before(jiffies, end_jiffies));

@@ -4761,6 +4797,8 @@ static bool pcie_wait_for_link_delay(struct pci_dev *pdev, bool active,
* Some controllers might not implement link active reporting. In this
* case, we wait for 1000 ms + any delay requested by the caller.
*/
+ pci_info(pdev, "%s: active %d delay %d link_active_reporting %d\n",
+ __func__, active, delay, pdev->link_active_reporting);
if (!pdev->link_active_reporting) {
msleep(PCIE_LINK_RETRAIN_TIMEOUT_MS + delay);
return true;
@@ -4785,6 +4823,7 @@ static bool pcie_wait_for_link_delay(struct pci_dev *pdev, bool active,
return false;

msleep(delay);
+ pci_info(pdev, "%s: waited %dms\n", __func__, delay);
return true;
}

@@ -4961,6 +5000,7 @@ void pci_reset_secondary_bus(struct pci_dev *dev)

ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl);
+ pci_info(dev, "%s: PCI_BRIDGE_CTL_BUS_RESET deasserted\n", __func__);
}

void __weak pcibios_reset_secondary_bus(struct pci_dev *dev)
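
Once booted into the test kernel, the new messages should be easy to spot, since every added pci_info() prints __func__, for example:

    dmesg | grep -E 'pci_dev_wait|pcie_wait_for_link|pci_reset_secondary_bus'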

--
You received this bug notification because you are subscribed to linux
in Ubuntu.
Matching subscriptions: Bgg, Bmail, Nb
https://bugs.launchpad.net/bugs/2111521

Title:
nvme no longer detected on boot after upgrade to 6.8.0-60

Status in linux package in Ubuntu:
Triaged

Bug description:
Short version: booting 6.8.0-59-generic or any earlier version from
the grub menu works; 6.8.0-60-generic dumps me at the initramfs prompt
with no disks.

We have some servers running Ubuntu 24.04.2 LTS. They have NVMe
solid-state disks which (in a working kernel) are detected as follows:

[ 3.537968] nvme nvme0: pci function 10000:01:00.0
[ 3.539285] nvme 10000:01:00.0: PCI INT A: no GSI
[ 5.897819] nvme nvme0: 32/0/0 default/read/poll queues
[ 5.905451] nvme nvme0: Ignoring bogus Namespace Identifiers
[ 5.909057] nvme0n1: p1 p2 p3

On the PCI bus they look like this:
10000:01:00.0 Non-Volatile memory controller [0108]: Intel Corporation NVMe Datacenter SSD [3DNAND, Beta Rock Controller] [8086:0a54]
$ ls -l /sys/class/nvme/nvme0
lrwxrwxrwx 1 root root 0 May 22 16:56 /sys/class/nvme/nvme0 -> ../../devices/pci0000:d7/0000:d7:05.5/pci10000:00/10000:00:02.0/10000:01:00.0/nvme/nvme0

Four identical servers updated their kernel this morning to:
ii linux-image-6.8.0-60-generic 6.8.0-60.63 amd64 Signed kernel image generic

...and rebooted. All four failed to come up and ended up at the
(initramfs) prompt. Rebooting and selecting 6.8.0-59-generic from the
grub menu allowed them to boot as normal.

There is no sign that the initramfs generation went wrong (on all four
servers), and the -60 initramfs contains the same nvme modules as the
-59 one. I am at a loss to explain this, and the initramfs
environment is a bit limited for debugging.

To manage notifications about this bug go to:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/2111521/+subscriptions
