Discussion:
[PATCH 0/2] crypto: qat - Fix for invalid dma mapping and numa
Tadeusz Struk
2014-10-08 17:38:40 UTC
Hi,
These two patches fix an invalid (zero-length) DMA mapping and
enforce a valid NUMA configuration for maximum performance.

Signed-off-by: Tadeusz Struk <***@intel.com>
---

Tadeusz Struk (2):
crypto: qat - Prevent dma mapping zero length assoc data.
crypto: qat - Enforce valid numa configuration.


drivers/crypto/qat/qat_common/adf_accel_devices.h | 3 +--
drivers/crypto/qat/qat_common/adf_transport.c | 12 +++++++-----
drivers/crypto/qat/qat_common/qat_algs.c | 7 +++++--
drivers/crypto/qat/qat_common/qat_crypto.c | 8 +++++---
drivers/crypto/qat/qat_dh895xcc/adf_admin.c | 2 +-
drivers/crypto/qat/qat_dh895xcc/adf_drv.c | 9 ++++++++-
drivers/crypto/qat/qat_dh895xcc/adf_isr.c | 2 +-
7 files changed, 28 insertions(+), 15 deletions(-)

--
Tadeusz Struk
2014-10-08 17:38:53 UTC
In a system with a NUMA configuration we want to enforce that the accelerator is
connected to a node with memory, to avoid cross-QPI memory transactions.
Otherwise there is no point in using the accelerator, as encryption in
software will be faster.

Signed-off-by: Tadeusz Struk <***@intel.com>
---
drivers/crypto/qat/qat_common/adf_accel_devices.h | 3 +--
drivers/crypto/qat/qat_common/adf_transport.c | 12 +++++++-----
drivers/crypto/qat/qat_common/qat_algs.c | 5 +++--
drivers/crypto/qat/qat_common/qat_crypto.c | 8 +++++---
drivers/crypto/qat/qat_dh895xcc/adf_admin.c | 2 +-
drivers/crypto/qat/qat_dh895xcc/adf_drv.c | 9 ++++++++-
drivers/crypto/qat/qat_dh895xcc/adf_isr.c | 2 +-
7 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/drivers/crypto/qat/qat_common/adf_accel_devices.h b/drivers/crypto/qat/qat_common/adf_accel_devices.h
index 3cfe195..96e0b06 100644
--- a/drivers/crypto/qat/qat_common/adf_accel_devices.h
+++ b/drivers/crypto/qat/qat_common/adf_accel_devices.h
@@ -203,8 +203,7 @@ struct adf_accel_dev {
struct dentry *debugfs_dir;
struct list_head list;
struct module *owner;
- uint8_t accel_id;
- uint8_t numa_node;
struct adf_accel_pci accel_pci_dev;
+ uint8_t accel_id;
} __packed;
#endif
diff --git a/drivers/crypto/qat/qat_common/adf_transport.c b/drivers/crypto/qat/qat_common/adf_transport.c
index 5f3fa45..9dd2cb7 100644
--- a/drivers/crypto/qat/qat_common/adf_transport.c
+++ b/drivers/crypto/qat/qat_common/adf_transport.c
@@ -419,9 +419,10 @@ static int adf_init_bank(struct adf_accel_dev *accel_dev,
WRITE_CSR_RING_BASE(csr_addr, bank_num, i, 0);
ring = &bank->rings[i];
if (hw_data->tx_rings_mask & (1 << i)) {
- ring->inflights = kzalloc_node(sizeof(atomic_t),
- GFP_KERNEL,
- accel_dev->numa_node);
+ ring->inflights =
+ kzalloc_node(sizeof(atomic_t),
+ GFP_KERNEL,
+ dev_to_node(&GET_DEV(accel_dev)));
if (!ring->inflights)
goto err;
} else {
@@ -469,13 +470,14 @@ int adf_init_etr_data(struct adf_accel_dev *accel_dev)
int i, ret;

etr_data = kzalloc_node(sizeof(*etr_data), GFP_KERNEL,
- accel_dev->numa_node);
+ dev_to_node(&GET_DEV(accel_dev)));
if (!etr_data)
return -ENOMEM;

num_banks = GET_MAX_BANKS(accel_dev);
size = num_banks * sizeof(struct adf_etr_bank_data);
- etr_data->banks = kzalloc_node(size, GFP_KERNEL, accel_dev->numa_node);
+ etr_data->banks = kzalloc_node(size, GFP_KERNEL,
+ dev_to_node(&GET_DEV(accel_dev)));
if (!etr_data->banks) {
ret = -ENOMEM;
goto err_bank;
diff --git a/drivers/crypto/qat/qat_common/qat_algs.c b/drivers/crypto/qat/qat_common/qat_algs.c
index bffa8bf..0897a1c 100644
--- a/drivers/crypto/qat/qat_common/qat_algs.c
+++ b/drivers/crypto/qat/qat_common/qat_algs.c
@@ -598,7 +598,8 @@ static int qat_alg_sgl_to_bufl(struct qat_crypto_instance *inst,
if (unlikely(!n))
return -EINVAL;

- bufl = kmalloc_node(sz, GFP_ATOMIC, inst->accel_dev->numa_node);
+ bufl = kmalloc_node(sz, GFP_ATOMIC,
+ dev_to_node(&GET_DEV(inst->accel_dev)));
if (unlikely(!bufl))
return -ENOMEM;

@@ -644,7 +645,7 @@ static int qat_alg_sgl_to_bufl(struct qat_crypto_instance *inst,
struct qat_alg_buf *bufers;

buflout = kmalloc_node(sz, GFP_ATOMIC,
- inst->accel_dev->numa_node);
+ dev_to_node(&GET_DEV(inst->accel_dev)));
if (unlikely(!buflout))
goto err;
bloutp = dma_map_single(dev, buflout, sz, DMA_TO_DEVICE);
diff --git a/drivers/crypto/qat/qat_common/qat_crypto.c b/drivers/crypto/qat/qat_common/qat_crypto.c
index 060dc0a..c1eefc4 100644
--- a/drivers/crypto/qat/qat_common/qat_crypto.c
+++ b/drivers/crypto/qat/qat_common/qat_crypto.c
@@ -109,12 +109,14 @@ struct qat_crypto_instance *qat_crypto_get_instance_node(int node)

list_for_each(itr, adf_devmgr_get_head()) {
accel_dev = list_entry(itr, struct adf_accel_dev, list);
- if (accel_dev->numa_node == node && adf_dev_started(accel_dev))
+ if ((node == dev_to_node(&GET_DEV(accel_dev)) ||
+ dev_to_node(&GET_DEV(accel_dev)) < 0)
+ && adf_dev_started(accel_dev))
break;
accel_dev = NULL;
}
if (!accel_dev) {
- pr_err("QAT: Could not find a device on the given node\n");
+ pr_err("QAT: Could not find a device on node %d\n", node);
accel_dev = adf_devmgr_get_first();
}
if (!accel_dev || !adf_dev_started(accel_dev))
@@ -164,7 +166,7 @@ static int qat_crypto_create_instances(struct adf_accel_dev *accel_dev)

for (i = 0; i < num_inst; i++) {
inst = kzalloc_node(sizeof(*inst), GFP_KERNEL,
- accel_dev->numa_node);
+ dev_to_node(&GET_DEV(accel_dev)));
if (!inst)
goto err;

diff --git a/drivers/crypto/qat/qat_dh895xcc/adf_admin.c b/drivers/crypto/qat/qat_dh895xcc/adf_admin.c
index 978d6c5..53c491b 100644
--- a/drivers/crypto/qat/qat_dh895xcc/adf_admin.c
+++ b/drivers/crypto/qat/qat_dh895xcc/adf_admin.c
@@ -108,7 +108,7 @@ int adf_init_admin_comms(struct adf_accel_dev *accel_dev)
uint64_t reg_val;

admin = kzalloc_node(sizeof(*accel_dev->admin), GFP_KERNEL,
- accel_dev->numa_node);
+ dev_to_node(&GET_DEV(accel_dev)));
if (!admin)
return -ENOMEM;
admin->virt_addr = dma_zalloc_coherent(&GET_DEV(accel_dev), PAGE_SIZE,
diff --git a/drivers/crypto/qat/qat_dh895xcc/adf_drv.c b/drivers/crypto/qat/qat_dh895xcc/adf_drv.c
index 84edf17..40202cc 100644
--- a/drivers/crypto/qat/qat_dh895xcc/adf_drv.c
+++ b/drivers/crypto/qat/qat_dh895xcc/adf_drv.c
@@ -244,11 +244,18 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
}

node = adf_get_dev_node_id(pdev);
+ if (node != dev_to_node(&pdev->dev) && dev_to_node(&pdev->dev) > 0) {
+ /* If the accelerator is connected to a node with no memory
+ * there is no point in using the accelerator since the remote
+ * memory transaction will be very slow. */
+ dev_err(&pdev->dev, "Invalid NUMA configuration.\n");
+ return -EINVAL;
+ }
+
accel_dev = kzalloc_node(sizeof(*accel_dev), GFP_KERNEL, node);
if (!accel_dev)
return -ENOMEM;

- accel_dev->numa_node = node;
INIT_LIST_HEAD(&accel_dev->crypto_list);

/* Add accel device to accel table.
diff --git a/drivers/crypto/qat/qat_dh895xcc/adf_isr.c b/drivers/crypto/qat/qat_dh895xcc/adf_isr.c
index c9212b9..fe8f896 100644
--- a/drivers/crypto/qat/qat_dh895xcc/adf_isr.c
+++ b/drivers/crypto/qat/qat_dh895xcc/adf_isr.c
@@ -168,7 +168,7 @@ static int adf_isr_alloc_msix_entry_table(struct adf_accel_dev *accel_dev)
uint32_t msix_num_entries = hw_data->num_banks + 1;

entries = kzalloc_node(msix_num_entries * sizeof(*entries),
- GFP_KERNEL, accel_dev->numa_node);
+ GFP_KERNEL, dev_to_node(&GET_DEV(accel_dev)));
if (!entries)
return -ENOMEM;
Prarit Bhargava
2014-10-08 17:57:22 UTC
Post by Tadeusz Struk
In a system with NUMA configuration we want to enforce that the accelerator is
connected to a node with memory to avoid cross QPI memory transaction.
Otherwise there is no point in using the accelerator as the encryption in
software will be faster.
---
drivers/crypto/qat/qat_common/adf_accel_devices.h | 3 +--
drivers/crypto/qat/qat_common/adf_transport.c | 12 +++++++-----
drivers/crypto/qat/qat_common/qat_algs.c | 5 +++--
drivers/crypto/qat/qat_common/qat_crypto.c | 8 +++++---
drivers/crypto/qat/qat_dh895xcc/adf_admin.c | 2 +-
drivers/crypto/qat/qat_dh895xcc/adf_drv.c | 9 ++++++++-
drivers/crypto/qat/qat_dh895xcc/adf_isr.c | 2 +-
7 files changed, 26 insertions(+), 15 deletions(-)
diff --git a/drivers/crypto/qat/qat_common/adf_accel_devices.h b/drivers/crypto/qat/qat_common/adf_accel_devices.h
index 3cfe195..96e0b06 100644
--- a/drivers/crypto/qat/qat_common/adf_accel_devices.h
+++ b/drivers/crypto/qat/qat_common/adf_accel_devices.h
@@ -203,8 +203,7 @@ struct adf_accel_dev {
struct dentry *debugfs_dir;
struct list_head list;
struct module *owner;
- uint8_t accel_id;
- uint8_t numa_node;
struct adf_accel_pci accel_pci_dev;
+ uint8_t accel_id;
} __packed;
#endif
diff --git a/drivers/crypto/qat/qat_common/adf_transport.c b/drivers/crypto/qat/qat_common/adf_transport.c
index 5f3fa45..9dd2cb7 100644
--- a/drivers/crypto/qat/qat_common/adf_transport.c
+++ b/drivers/crypto/qat/qat_common/adf_transport.c
@@ -419,9 +419,10 @@ static int adf_init_bank(struct adf_accel_dev *accel_dev,
WRITE_CSR_RING_BASE(csr_addr, bank_num, i, 0);
ring = &bank->rings[i];
if (hw_data->tx_rings_mask & (1 << i)) {
- ring->inflights = kzalloc_node(sizeof(atomic_t),
- GFP_KERNEL,
- accel_dev->numa_node);
+ ring->inflights =
+ kzalloc_node(sizeof(atomic_t),
+ GFP_KERNEL,
+ dev_to_node(&GET_DEV(accel_dev)));
if (!ring->inflights)
goto err;
} else {
@@ -469,13 +470,14 @@ int adf_init_etr_data(struct adf_accel_dev *accel_dev)
int i, ret;
etr_data = kzalloc_node(sizeof(*etr_data), GFP_KERNEL,
- accel_dev->numa_node);
+ dev_to_node(&GET_DEV(accel_dev)));
if (!etr_data)
return -ENOMEM;
num_banks = GET_MAX_BANKS(accel_dev);
size = num_banks * sizeof(struct adf_etr_bank_data);
- etr_data->banks = kzalloc_node(size, GFP_KERNEL, accel_dev->numa_node);
+ etr_data->banks = kzalloc_node(size, GFP_KERNEL,
+ dev_to_node(&GET_DEV(accel_dev)));
if (!etr_data->banks) {
ret = -ENOMEM;
goto err_bank;
diff --git a/drivers/crypto/qat/qat_common/qat_algs.c b/drivers/crypto/qat/qat_common/qat_algs.c
index bffa8bf..0897a1c 100644
--- a/drivers/crypto/qat/qat_common/qat_algs.c
+++ b/drivers/crypto/qat/qat_common/qat_algs.c
@@ -598,7 +598,8 @@ static int qat_alg_sgl_to_bufl(struct qat_crypto_instance *inst,
if (unlikely(!n))
return -EINVAL;
- bufl = kmalloc_node(sz, GFP_ATOMIC, inst->accel_dev->numa_node);
+ bufl = kmalloc_node(sz, GFP_ATOMIC,
+ dev_to_node(&GET_DEV(inst->accel_dev)));
if (unlikely(!bufl))
return -ENOMEM;
@@ -644,7 +645,7 @@ static int qat_alg_sgl_to_bufl(struct qat_crypto_instance *inst,
struct qat_alg_buf *bufers;
buflout = kmalloc_node(sz, GFP_ATOMIC,
- inst->accel_dev->numa_node);
+ dev_to_node(&GET_DEV(inst->accel_dev)));
if (unlikely(!buflout))
goto err;
bloutp = dma_map_single(dev, buflout, sz, DMA_TO_DEVICE);
diff --git a/drivers/crypto/qat/qat_common/qat_crypto.c b/drivers/crypto/qat/qat_common/qat_crypto.c
index 060dc0a..c1eefc4 100644
--- a/drivers/crypto/qat/qat_common/qat_crypto.c
+++ b/drivers/crypto/qat/qat_common/qat_crypto.c
@@ -109,12 +109,14 @@ struct qat_crypto_instance *qat_crypto_get_instance_node(int node)
list_for_each(itr, adf_devmgr_get_head()) {
accel_dev = list_entry(itr, struct adf_accel_dev, list);
- if (accel_dev->numa_node == node && adf_dev_started(accel_dev))
+ if ((node == dev_to_node(&GET_DEV(accel_dev)) ||
+ dev_to_node(&GET_DEV(accel_dev)) < 0)
+ && adf_dev_started(accel_dev))
break;
accel_dev = NULL;
}
if (!accel_dev) {
- pr_err("QAT: Could not find a device on the given node\n");
+ pr_err("QAT: Could not find a device on node %d\n", node);
accel_dev = adf_devmgr_get_first();
}
if (!accel_dev || !adf_dev_started(accel_dev))
@@ -164,7 +166,7 @@ static int qat_crypto_create_instances(struct adf_accel_dev *accel_dev)
for (i = 0; i < num_inst; i++) {
inst = kzalloc_node(sizeof(*inst), GFP_KERNEL,
- accel_dev->numa_node);
+ dev_to_node(&GET_DEV(accel_dev)));
if (!inst)
goto err;
diff --git a/drivers/crypto/qat/qat_dh895xcc/adf_admin.c b/drivers/crypto/qat/qat_dh895xcc/adf_admin.c
index 978d6c5..53c491b 100644
--- a/drivers/crypto/qat/qat_dh895xcc/adf_admin.c
+++ b/drivers/crypto/qat/qat_dh895xcc/adf_admin.c
@@ -108,7 +108,7 @@ int adf_init_admin_comms(struct adf_accel_dev *accel_dev)
uint64_t reg_val;
admin = kzalloc_node(sizeof(*accel_dev->admin), GFP_KERNEL,
- accel_dev->numa_node);
+ dev_to_node(&GET_DEV(accel_dev)));
if (!admin)
return -ENOMEM;
admin->virt_addr = dma_zalloc_coherent(&GET_DEV(accel_dev), PAGE_SIZE,
diff --git a/drivers/crypto/qat/qat_dh895xcc/adf_drv.c b/drivers/crypto/qat/qat_dh895xcc/adf_drv.c
index 84edf17..40202cc 100644
--- a/drivers/crypto/qat/qat_dh895xcc/adf_drv.c
+++ b/drivers/crypto/qat/qat_dh895xcc/adf_drv.c
@@ -244,11 +244,18 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
}
node = adf_get_dev_node_id(pdev);
^^^ I don't think you should ever make this call. IMO it is wrong to do it that
way. Just stick with

node = dev_to_node(&pdev->dev)

as the line below forces a default to that anyway.
Post by Tadeusz Struk
+ if (node != dev_to_node(&pdev->dev) && dev_to_node(&pdev->dev) > 0) {
+ /* If the accelerator is connected to a node with no memory
+ * there is no point in using the accelerator since the remote
+ * memory transaction will be very slow. */
+ dev_err(&pdev->dev, "Invalid NUMA configuration.\n");
+ return -EINVAL;
Hmm ... I wonder if it would be safe to do

/* force allocations to node 0 */
node = 0;
dev_err(&pdev->dev,
	"Invalid NUMA configuration detected, node id = %d. Defaulting node to 0.\n",
	node);

and then continue?

And maybe even a FW_WARN of some sort here might be appropriate to indicate that
something is wrong with the mapping? In any case a better error message is
always a good idea IMO.

P.
Post by Tadeusz Struk
+ }
+
accel_dev = kzalloc_node(sizeof(*accel_dev), GFP_KERNEL, node);
if (!accel_dev)
return -ENOMEM;
- accel_dev->numa_node = node;
INIT_LIST_HEAD(&accel_dev->crypto_list);
/* Add accel device to accel table.
diff --git a/drivers/crypto/qat/qat_dh895xcc/adf_isr.c b/drivers/crypto/qat/qat_dh895xcc/adf_isr.c
index c9212b9..fe8f896 100644
--- a/drivers/crypto/qat/qat_dh895xcc/adf_isr.c
+++ b/drivers/crypto/qat/qat_dh895xcc/adf_isr.c
@@ -168,7 +168,7 @@ static int adf_isr_alloc_msix_entry_table(struct adf_accel_dev *accel_dev)
uint32_t msix_num_entries = hw_data->num_banks + 1;
entries = kzalloc_node(msix_num_entries * sizeof(*entries),
- GFP_KERNEL, accel_dev->numa_node);
+ GFP_KERNEL, dev_to_node(&GET_DEV(accel_dev)));
if (!entries)
return -ENOMEM;
Tadeusz Struk
2014-10-08 18:11:39 UTC
Post by Prarit Bhargava
Post by Tadeusz Struk
node = adf_get_dev_node_id(pdev);
^^^ I don't think you should ever make this call. IMO it is wrong to do it that
way. Just stick with
node = dev_to_node(&pdev->dev)
as the line below forces a default to that anyway.
But then how do I know which node I'm physically connected to?
Post by Prarit Bhargava
Post by Tadeusz Struk
Post by Tadeusz Struk
+ if (node != dev_to_node(&pdev->dev) && dev_to_node(&pdev->dev) > 0) {
+ /* If the accelerator is connected to a node with no memory
+ * there is no point in using the accelerator since the remote
+ * memory transaction will be very slow. */
+ dev_err(&pdev->dev, "Invalid NUMA configuration.\n");
+ return -EINVAL;
Hmm ... I wonder if it would be safe to do
/* force allocations to node 0 */
node = 0;
dev_err(&pdev->dev, "Invalid NUMA configuration detected, node id = %d .
Defaulting node to 0. \n",
node);
and then continue?
As the comment says, there is no point in continuing if the configuration is
wrong. Defaulting to 0 will cause the same panic you pointed out in your
first patch if node 0 has no memory.
Post by Prarit Bhargava
And maybe even a FW_WARN of some sort here might be appropriate to indicate that
something is wrong with the mapping? In any case a better error message is a
always good idea IMO.
Prarit Bhargava
2014-10-08 18:35:44 UTC
Post by Tadeusz Struk
Post by Prarit Bhargava
Post by Tadeusz Struk
node = adf_get_dev_node_id(pdev);
^^^ I don't think you should ever make this call. IMO it is wrong to do it that
way. Just stick with
node = dev_to_node(&pdev->dev)
as the line below forces a default to that anyway.
But then how do I know which node I'm physically connected to?
The pci_dev maps to the bus, which maps to a NUMA node. The pci_dev's NUMA value
is copied directly from the bus (or buses, depending on how deep it is).

I'd argue (strongly) that the pci_dev's NUMA ID had better be correct, o/w that is a
FW bug (and a bad one at that these days).

dev_to_node() should return the correct value.
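
For illustration, a minimal sketch of the approach being suggested here,
i.e. taking the node straight from the pci_dev (the helper below is made
up for the example, it is not the driver's actual code):

#include <linux/pci.h>
#include <linux/slab.h>

/* Hypothetical probe-time helper: allocate per-device state on the node
 * that firmware reports for the PCI device itself. */
static void *example_alloc_dev_state(struct pci_dev *pdev, size_t size)
{
	int node = dev_to_node(&pdev->dev);	/* may be NUMA_NO_NODE (-1) */

	/* kzalloc_node() with NUMA_NO_NODE behaves like a plain kzalloc(),
	 * so the "no proximity info" case needs no special handling. */
	return kzalloc_node(size, GFP_KERNEL, node);
}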
Post by Tadeusz Struk
Post by Prarit Bhargava
Post by Tadeusz Struk
Post by Tadeusz Struk
+ if (node != dev_to_node(&pdev->dev) && dev_to_node(&pdev->dev) > 0) {
+ /* If the accelerator is connected to a node with no memory
+ * there is no point in using the accelerator since the remote
+ * memory transaction will be very slow. */
+ dev_err(&pdev->dev, "Invalid NUMA configuration.\n");
+ return -EINVAL;
Hmm ... I wonder if it would be safe to do
/* force allocations to node 0 */
node = 0;
dev_err(&pdev->dev, "Invalid NUMA configuration detected, node id = %d .
Defaulting node to 0. \n",
node);
and then continue?
As the comment say there is no point continuing if the configuration is
wrong. Defaulting to 0 will cause the same panic you pointed out in your
first patch if node 0 has no memory.
Okay, but at least fix up the warning message to output the node_id. That's
sort of the important piece here.

P.
Post by Tadeusz Struk
Post by Prarit Bhargava
And maybe even a FW_WARN of some sort here might be appropriate to indicate that
something is wrong with the mapping? In any case a better error message is a
always good idea IMO.
Tadeusz Struk
2014-10-08 18:57:28 UTC
Post by Prarit Bhargava
Post by Tadeusz Struk
But then how do I know which node I'm physically connected to?
The pci_dev maps to the bus which maps to a numa node. The pci_dev's numa value
is copied directly from the bus (or busses depending on how deep it is).
I'd argue (strongly) that the pci_dev's numa ID better be correct o/w that is a
FW bug (and a bad one at that these days).
dev_to_node() should return the correct value.
I'm not saying that dev_to_node() returns an incorrect value. It will
always return the closest NUMA node for the given device.
What we want to enforce is that the closest NUMA node is the node that
the device is physically connected to. If the closest NUMA node
is a remote node we don't want to use this accelerator.
Post by Prarit Bhargava
Post by Tadeusz Struk
Post by Tadeusz Struk
Post by Prarit Bhargava
Post by Tadeusz Struk
+ if (node != dev_to_node(&pdev->dev) && dev_to_node(&pdev->dev) > 0) {
+ /* If the accelerator is connected to a node with no memory
+ * there is no point in using the accelerator since the remote
+ * memory transaction will be very slow. */
+ dev_err(&pdev->dev, "Invalid NUMA configuration.\n");
+ return -EINVAL;
Hmm ... I wonder if it would be safe to do
/* force allocations to node 0 */
node = 0;
dev_err(&pdev->dev, "Invalid NUMA configuration detected, node id = %d .
Defaulting node to 0. \n",
node);
and then continue?
As the comment say there is no point continuing if the configuration is
wrong. Defaulting to 0 will cause the same panic you pointed out in your
first patch if node 0 has no memory.
Okay, but at least fix up the warning message to output the node_id. That's
sort of the important piece here.
Sure, I can update the message add print both the numa node and node id
the device is connected to.
Prarit Bhargava
2014-10-08 19:01:26 UTC
Post by Tadeusz Struk
Post by Prarit Bhargava
Post by Tadeusz Struk
But then how do I know which node I'm physically connected to?
The pci_dev maps to the bus which maps to a numa node. The pci_dev's numa value
is copied directly from the bus (or busses depending on how deep it is).
I'd argue (strongly) that the pci_dev's numa ID better be correct o/w that is a
FW bug (and a bad one at that these days).
dev_to_node() should return the correct value.
I'm not saying that the dev_to_node() returns incorrect value. It will
always return the closest numa node for the given device.
No, that isn't correct. dev_to_node() will return the node the device is
connected to.
Post by Tadeusz Struk
What we want to enforce is that the closest numa node is the node that
the device is physically connected to. In case if the closest numa node
is the remote node we don't want to use this accelerator.
P.
Tadeusz Struk
2014-10-08 19:25:58 UTC
Post by Prarit Bhargava
No that isn't correct. dev_to_node() will return the node the device is
connected to.
include/linux/device.h:

static inline int dev_to_node(struct device *dev)
{
	return dev->numa_node;
}

struct device {
	...
	int numa_node;	/* NUMA node this device is close to */
	...
};

In a case where there are two nodes and only node 0 has memory,
dev->numa_node will be 0 even though the device is connected to the
PCI root port of node 1.
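
For what it's worth, a minimal sketch of how one could check at probe time
whether the node a device reports actually has local memory (the helper
name is hypothetical; only the standard nodemask helpers are assumed):

#include <linux/device.h>
#include <linux/nodemask.h>

/* Hypothetical check: does dev_to_node() point at a node that has any
 * memory of its own to allocate from? */
static bool example_dev_node_has_memory(struct device *dev)
{
	int node = dev_to_node(dev);

	if (node == NUMA_NO_NODE)
		return false;	/* no proximity information at all */

	return node_state(node, N_MEMORY);
}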
Prarit Bhargava
2014-10-09 11:23:02 UTC
[sorry ... accidentally hit reply instead of reply all ... resending to everyone]
Post by Tadeusz Struk
Post by Prarit Bhargava
No that isn't correct. dev_to_node() will return the node the device is
connected to.
static inline int dev_to_node(struct device *dev)
{
return dev->numa_node;
}
struct device {
.....
int numa_node; /* NUMA node this device is close to */
...
That's just bad English. The NUMA node value (for PCI devices) is
read from the ACPI tables on the system and represents the node that
the pci_dev is connected to.
Post by Tadeusz Struk
};
In case when there are two nodes and only node 0 has memory,
dev->numa_node will be 0 even though the device will be connected to the
pci root port of node 1.
Your calculation completely falls apart and returns incorrect values when
cpu hotplug is used or if there are multi-socket nodes (as was the case
on the system that panicked), or if one uses the new cluster-on-die mode.

P.
Tadeusz Struk
2014-10-09 16:14:17 UTC
Post by Prarit Bhargava
Post by Tadeusz Struk
int numa_node; /* NUMA node this device is close to */
Post by Tadeusz Struk
...
That's just bad english. The numa node value (for pci devices) is
read from the ACPI tables on the system and represents the node that
the pci_dev is connected to.
Post by Tadeusz Struk
Post by Tadeusz Struk
};
In case when there are two nodes and only node 0 has memory,
dev->numa_node will be 0 even though the device will be connected to the
pci root port of node 1.
Your calculation completely falls apart and returns incorrect values when
cpu hotplug is used or if there are multi-socket nodes (as was the case
on the system that panicked), or if one uses the new cluster-on-die mode.
This calculation is solely for multi-socket configurations. This is why it
was introduced and what it was tested for.
There is no point discussing NUMA for a single-socket configuration.
Single-socket configurations are not NUMA. In this case dev->numa_node
is usually equal to NUMA_NO_NODE (-1) and adf_get_dev_node_id(pdev) will
always return 0.
Please confirm this, but I think the system it panicked on was a
two-socket system with only node 0 populated with memory and the
accelerator plugged into node 1 (phys_proc_id == 1).
In this case adf_get_dev_node_id(pdev) returned 1, which was passed to
kzalloc_node(size, GFP_KERNEL, 1), and because there was no memory on
node 1, kzalloc_node() panicked.
This patch will make sure that this does not happen and that the
configuration is optimal.
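
As an aside, a minimal defensive sketch (the wrapper is made up, not
something the driver currently does) that would avoid handing
kzalloc_node() a node without memory in the first place:

#include <linux/slab.h>
#include <linux/nodemask.h>

/* Hypothetical wrapper: fall back to "no node preference" when the
 * requested node is offline or has no memory attached. */
static void *example_kzalloc_on_node(size_t size, int node)
{
	if (node != NUMA_NO_NODE &&
	    (!node_online(node) || !node_state(node, N_MEMORY)))
		node = NUMA_NO_NODE;	/* let the allocator choose */

	return kzalloc_node(size, GFP_KERNEL, node);
}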
Prarit Bhargava
2014-10-09 17:32:42 UTC
Post by Tadeusz Struk
Post by Prarit Bhargava
Post by Tadeusz Struk
int numa_node; /* NUMA node this device is close to */
Post by Tadeusz Struk
...
That's just bad english. The numa node value (for pci devices) is
read from the ACPI tables on the system and represents the node that
the pci_dev is connected to.
Post by Tadeusz Struk
Post by Tadeusz Struk
};
In case when there are two nodes and only node 0 has memory,
dev->numa_node will be 0 even though the device will be connected to the
pci root port of node 1.
Your calculation completely falls apart and returns incorrect values when
cpu hotplug is used or if there are multi-socket nodes (as was the case
on the system that panicked), or if one uses the new cluster-on-die mode.
This calculation is sole for multi-socket configuration. This is why is
was introduced and what it was tested for.
There is no point discussing NUMA for single-socket configuration.
Single socket configurations are not NUMA. In this case dev->numa_node
is usually equal to NUMA_NO_NODE (-1) and adf_get_dev_node_id(pdev) will
always return 0;
The fact that you return an incorrect value here for any configuration is,
simply put, bad. You shouldn't do that.
Post by Tadeusz Struk
Please confirm that, but I think the system it panicked on was a two
sockets system with only node 0 populated with memory and accelerator
plugged it to node 1 (phys_proc_id == 1).
In this case adf_get_dev_node_id(pdev) returned 1 and this was passed to
kzalloc_node(size, GFP_KERNEL, 1) and because there was no memory on
node 1 kzalloc_node() panicked.
Yep; but my interpretation was that node 1 didn't exist at all and it panicked.
Post by Tadeusz Struk
This patch will make sure that this will not happen and that the
configuration will be optimal.
Yep, it will. But what about cpu hotplug?

P.
Tadeusz Struk
2014-10-09 19:55:04 UTC
Post by Prarit Bhargava
Post by Tadeusz Struk
This calculation is sole for multi-socket configuration. This is why is
Post by Tadeusz Struk
was introduced and what it was tested for.
There is no point discussing NUMA for single-socket configuration.
Single socket configurations are not NUMA. In this case dev->numa_node
is usually equal to NUMA_NO_NODE (-1) and adf_get_dev_node_id(pdev) will
always return 0;
The fact that you return an incorrect value here for any configuration is simply
put, bad. You shouldn't do that.
Well, I wouldn't say it is incorrect. adf_get_dev_node_id() returns the
physical socket (phys_proc_id) the device is connected to, so in a
single-socket configuration there is only socket 0.
Post by Prarit Bhargava
Post by Tadeusz Struk
Post by Tadeusz Struk
Please confirm that, but I think the system it panicked on was a two
sockets system with only node 0 populated with memory and accelerator
plugged it to node 1 (phys_proc_id == 1).
In this case adf_get_dev_node_id(pdev) returned 1 and this was passed to
kzalloc_node(size, GFP_KERNEL, 1) and because there was no memory on
node 1 kzalloc_node() panicked.
Yep; but my interpretation was that node 1 didn't exist at all and it panicked.
Why wouldn't it exist? The fact that there was no memory on node 1 doesn't
make it disappear. There are two sockets in the platform, 0 and 1, even
though only one (node 0) has memory. The only problem with that is that it
is far from optimal if a device connected to node 1 uses memory on node 0.
And this is what would happen if we used dev_to_node(dev) here.
Post by Prarit Bhargava
Post by Tadeusz Struk
Post by Tadeusz Struk
This patch will make sure that this will not happen and that the
configuration will be optimal.
Yep, it will. But what about cpu hotplug?
I don't think CPU hotplug matters here. This is a one-time (probe-time)
determination of whether the configuration is optimal and whether it makes
sense to use this accelerator.
Prarit Bhargava
2014-10-09 21:42:59 UTC
Post by Tadeusz Struk
Post by Prarit Bhargava
Post by Tadeusz Struk
This calculation is sole for multi-socket configuration. This is why is
Post by Tadeusz Struk
was introduced and what it was tested for.
There is no point discussing NUMA for single-socket configuration.
Single socket configurations are not NUMA. In this case dev->numa_node
is usually equal to NUMA_NO_NODE (-1) and adf_get_dev_node_id(pdev) will
always return 0;
The fact that you return an incorrect value here for any configuration is simply
put, bad. You shouldn't do that.
Well I wouldn't say it is incorrect. adf_get_dev_node_id() returns the
phys proc id the dev is connected to, so in single socket configuration
there is only one socket 0.
That's not entirely true -- see my previous comment about
Post by Tadeusz Struk
Post by Prarit Bhargava
Post by Tadeusz Struk
Post by Tadeusz Struk
Please confirm that, but I think the system it panicked on was a two
sockets system with only node 0 populated with memory and accelerator
plugged it to node 1 (phys_proc_id == 1).
In this case adf_get_dev_node_id(pdev) returned 1 and this was passed to
kzalloc_node(size, GFP_KERNEL, 1) and because there was no memory on
node 1 kzalloc_node() panicked.
Yep; but my interpretation was that node 1 didn't exist at all and it panicked.
Why didn't exist?
The reason the kernel panics is because there is only node 0. Try allocating,
for example, from node 100. You'll hit the same panic. That's different than
saying node 0 has no memory ... I think we agree on this FWIW ... we just have
slightly different interpretations of the panic :)

Post by Tadeusz Struk
The fact that there was no memory on node 1 doesn't
make it disappear. There are two sockets in the platform 0 & 1 even
though only one (node 0) has memory. The only problem with that is - it
is far from optimal if device connected to node 1 uses memory on node 0.
And this is what would happen if we would use dev_to_node(dev) here.
Post by Prarit Bhargava
Post by Tadeusz Struk
Post by Tadeusz Struk
This patch will make sure that this will not happen and that the
configuration will be optimal.
Yep, it will. But what about cpu hotplug?
I don't think cpu hotplug matters here. This is one (probe) time
determination if the configuration is optimal or not and if it makes
sense to use this accelerator or not.
It absolutely matters. num_online_cpus() *changes* depending on the # of cpus.

P.
Tadeusz Struk
2014-10-09 23:12:30 UTC
Post by Prarit Bhargava
Post by Tadeusz Struk
I don't think cpu hotplug matters here. This is one (probe) time
determination if the configuration is optimal or not and if it makes
sense to use this accelerator or not.
It absolutely matters. num_online_cpus() *changes* depending on the # of cpus.
Sure, but I still think that we are safe here.
Prarit Bhargava
2014-10-10 11:23:53 UTC
Post by Tadeusz Struk
Post by Prarit Bhargava
Post by Tadeusz Struk
I don't think cpu hotplug matters here. This is one (probe) time
determination if the configuration is optimal or not and if it makes
sense to use this accelerator or not.
It absolutely matters. num_online_cpus() *changes* depending on the # of cpus.
Sure, but I still think that we are safe here.
No, you're not. Dropping a single CPU changes num_online_cpus(), which results in

static uint8_t adf_get_dev_node_id(struct pci_dev *pdev)
{
	unsigned int bus_per_cpu = 0;
	struct cpuinfo_x86 *c = &cpu_data(num_online_cpus() - 1); <<< this being different

	if (!c->phys_proc_id)
		return 0;

	bus_per_cpu = 256 / (c->phys_proc_id + 1); <<< this being different

	if (bus_per_cpu != 0)
		return pdev->bus->number / bus_per_cpu; <<< and this being different
	return 0;
}
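
For concreteness, a hypothetical two-socket example (the numbers are
assumed for illustration, not taken from the machine that panicked): with,
say, 8 CPUs online, cpu_data(num_online_cpus() - 1) is cpu_data(7), which
on a typical enumeration sits on socket 1, so phys_proc_id = 1,
bus_per_cpu = 256 / 2 = 128, and a device on bus 0x85 (133) is reported as
133 / 128 = node 1. Offline a single CPU and the same expression becomes
cpu_data(6); if that CPU happens to have phys_proc_id = 0, the function now
returns 0 for the very same device.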

P.
Tadeusz Struk
2014-10-10 13:25:03 UTC
Post by Prarit Bhargava
Post by Tadeusz Struk
Sure, but I still think that we are safe here.
No, you're not. Dropping a single CPU changes num_online_cpus(), which results in
static uint8_t adf_get_dev_node_id(struct pci_dev *pdev)
{
unsigned int bus_per_cpu = 0;
struct cpuinfo_x86 *c = &cpu_data(num_online_cpus() - 1); <<< this
being different.
if (!c->phys_proc_id)
return 0;
bus_per_cpu = 256 / (c->phys_proc_id + 1); <<< this being different
if (bus_per_cpu != 0)
return pdev->bus->number / bus_per_cpu; <<< and this being different
return 0;
}
You forgot to explain how this is not safe.
T.
Prarit Bhargava
2014-10-10 22:15:36 UTC
Post by Tadeusz Struk
Post by Prarit Bhargava
Post by Tadeusz Struk
Sure, but I still think that we are safe here.
No, you're not. Dropping a single CPU changes num_online_cpus(), which results in
static uint8_t adf_get_dev_node_id(struct pci_dev *pdev)
{
unsigned int bus_per_cpu = 0;
struct cpuinfo_x86 *c = &cpu_data(num_online_cpus() - 1); <<< this
being different.
if (!c->phys_proc_id)
return 0;
bus_per_cpu = 256 / (c->phys_proc_id + 1); <<< this being different
if (bus_per_cpu != 0)
return pdev->bus->number / bus_per_cpu; <<< and this being different
return 0;
}
You forgot to explain how this is not safe.
Sorry, I thought I did explain it. My apologies.

So let's say you boot the system and load the driver. At this time,

num_online_cpus() at boot = 4. Crunch through the math above, and you reference the
cpuinfo_x86 struct for cpu 3 (the "fourth" cpu), and the calculation takes into
account c->phys_proc_id.

So let's say now you boot the system and disable a cpu. In this case, now
num_online_cpus() at module load = 3. Crunch through the math above and you're
referencing a different cpuinfo_x86 struct for cpu 2. That may or may not point
at the same c->phys_proc_id. That changes the calculation and gives an
incorrect value.

In addition to that, I haven't even talked about the possibility of hot-adding
and hot-removing CPUs in sockets, which changes the numbering scheme completely.

In short, that calculation is wrong. Don't use it; stick with the widely
accepted and used dev_to_node() of the pci_dev. It is used in other cases IIRC to
determine the NUMA location of the device. It shouldn't be any different for
this driver.

P.
Post by Tadeusz Struk
T.
Tadeusz Struk
2014-10-11 17:05:40 UTC
Post by Prarit Bhargava
In short, that calcuation is wrong. Don't use it; stick with the widely
accepted and used dev_to_node of the pci_dev. It is used in other cases IIRC to
determine the numa location of the device. It shouldn't be any different for
this driver.
As a matter of fact, this is what we are doing now: effectively
dev_to_node() is used for all NUMA allocations.
The only reason it was done this way was to ensure the best performance.
We are competing with very fast on-core encryption here, and the only way
we can beat that is to make sure the DMA transactions will not have to
go over the QPI link to the remote NUMA node. If they do, the
performance will drop significantly, and in that case we don't want to
bring the device up.
I agree that this is not the most elegant way to figure out the NUMA
locality of a device, but it has worked for us so far.
Anyway, I think I know how to get this done using ACPI. Will send v2 on
Monday.
T.

Tadeusz Struk
2014-10-08 17:38:47 UTC
Do not attempt to dma map associated data if it is zero length.

Signed-off-by: Tadeusz Struk <***@intel.com>
---
drivers/crypto/qat/qat_common/qat_algs.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/drivers/crypto/qat/qat_common/qat_algs.c b/drivers/crypto/qat/qat_common/qat_algs.c
index e03afe9..bffa8bf 100644
--- a/drivers/crypto/qat/qat_common/qat_algs.c
+++ b/drivers/crypto/qat/qat_common/qat_algs.c
@@ -607,6 +607,8 @@ static int qat_alg_sgl_to_bufl(struct qat_crypto_instance *inst,
goto err;

for_each_sg(assoc, sg, assoc_n, i) {
+ if (!sg->length)
+ continue;
bufl->bufers[bufs].addr = dma_map_single(dev,
sg_virt(sg),
sg->length,
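
For context, a minimal standalone sketch of the pattern this fix applies
(the helper is made up, not the driver's actual mapping loop): skip
zero-length scatterlist entries before calling dma_map_single(), and check
every mapping with dma_mapping_error().

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>
#include <linux/errno.h>

/* Hypothetical helper: map each non-empty sg entry for DMA and return the
 * number of entries mapped (unwinding on error omitted for brevity). */
static int example_map_nonempty(struct device *dev, struct scatterlist *sgl,
				int nents, dma_addr_t *addrs)
{
	struct scatterlist *sg;
	int i, mapped = 0;

	for_each_sg(sgl, sg, nents, i) {
		if (!sg->length)
			continue;	/* zero length: nothing to map */
		addrs[mapped] = dma_map_single(dev, sg_virt(sg), sg->length,
					       DMA_BIDIRECTIONAL);
		if (dma_mapping_error(dev, addrs[mapped]))
			return -ENOMEM;
		mapped++;
	}
	return mapped;
}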