From 4f51ba2add5553583f618e463ca92e6aff148680 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 11 Dec 2025 06:02:01 -0500 Subject: [PATCH 1/9] RDMA/rxe: Fix mr->map double free jira KERNEL-325 cve CVE-2022-50543 Rebuild_History Non-Buildable kernel-4.18.0-553.89.1.el8_10 commit-author Li Zhijian commit 7d984dac8f6bf4ebd3398af82b357e1d181ecaac rxe_mr_cleanup() which tries to free mr->map again will be called when rxe_mr_init_user() fails: CPU: 0 PID: 4917 Comm: rdma_flush_serv Kdump: loaded Not tainted 6.1.0-rc1-roce-flush+ #25 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x45/0x5d panic+0x19e/0x349 end_report.part.0+0x54/0x7c kasan_report.cold+0xa/0xf rxe_mr_cleanup+0x9d/0xf0 [rdma_rxe] __rxe_cleanup+0x10a/0x1e0 [rdma_rxe] rxe_reg_user_mr+0xb7/0xd0 [rdma_rxe] ib_uverbs_reg_mr+0x26a/0x480 [ib_uverbs] ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0x1a2/0x250 [ib_uverbs] ib_uverbs_cmd_verbs+0x1397/0x15a0 [ib_uverbs] This issue was firstly exposed since commit b18c7da63fcb ("RDMA/rxe: Fix memory leak in error path code") and then we fixed it in commit 8ff5f5d9d8cf ("RDMA/rxe: Prevent double freeing rxe_map_set()") but this fix was reverted together at last by commit 1e75550648da (Revert "RDMA/rxe: Create duplicate mapping tables for FMRs") Simply let rxe_mr_cleanup() always handle freeing the mr->map once it is successfully allocated. Fixes: 1e75550648da ("Revert "RDMA/rxe: Create duplicate mapping tables for FMRs"") Link: https://lore.kernel.org/r/1667099073-2-1-git-send-email-lizhijian@fujitsu.com Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe (cherry picked from commit 7d984dac8f6bf4ebd3398af82b357e1d181ecaac) Signed-off-by: Jonathan Maple --- drivers/infiniband/sw/rxe/rxe_mr.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 11a4776bb3857..928d031d96cd1 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -99,6 +99,7 @@ static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf) kfree(mr->map[i]); kfree(mr->map); + mr->map = NULL; err1: return -ENOMEM; } @@ -123,7 +124,6 @@ int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova, int num_buf; void *vaddr; int err; - int i; umem = ib_umem_get(udata, start, length, access); if (IS_ERR(umem)) { @@ -164,9 +164,8 @@ int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova, pr_warn("%s: Unable to get virtual address\n", __func__); err = -ENOMEM; - goto err_cleanup_map; + goto err_release_umem; } - buf->addr = (uintptr_t)vaddr; buf->size = PAGE_SIZE; num_buf++; @@ -186,10 +185,6 @@ int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova, return 0; -err_cleanup_map: - for (i = 0; i < mr->num_map; i++) - kfree(mr->map[i]); - kfree(mr->map); err_release_umem: ib_umem_release(umem); err_out: From 0b8b871f8acb73f59377a48eaee26ed39746a962 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 11 Dec 2025 06:02:02 -0500 Subject: [PATCH 2/9] RDMA/rxe: Fix incomplete state save in rxe_requester jira KERNEL-325 cve CVE-2023-53539 Rebuild_History Non-Buildable kernel-4.18.0-553.89.1.el8_10 commit-author Bob Pearson commit 5d122db2ff80cd2aed4dcd630befb56b51ddf947 Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. 
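(Illustration for PATCH 1/9 above.) The mr->map fix reduces to a general error-path rule: once an unwind path frees a pointer that a later, unconditional cleanup handler will also free, the pointer must be reset to NULL so the cleanup becomes a no-op. A minimal userspace sketch of the idiom, with hypothetical names rather than the rxe types:

#include <stdlib.h>

struct obj {
    int **map;      /* array of allocations, also freed by obj_cleanup() */
    int num_map;
};

static int obj_alloc(struct obj *o, int n)
{
    o->map = calloc(n, sizeof(*o->map));
    if (!o->map)
        return -1;
    for (o->num_map = 0; o->num_map < n; o->num_map++) {
        o->map[o->num_map] = malloc(sizeof(int));
        if (!o->map[o->num_map])
            goto err;
    }
    return 0;
err:
    while (o->num_map--)
        free(o->map[o->num_map]);
    free(o->map);
    o->map = NULL;      /* the crux of the fix: mark the map as gone */
    o->num_map = 0;
    return -1;
}

/* Always-called cleanup, analogous to rxe_mr_cleanup(): safe either way. */
static void obj_cleanup(struct obj *o)
{
    if (!o->map)
        return;
    for (int i = 0; i < o->num_map; i++)
        free(o->map[i]);
    free(o->map);
    o->map = NULL;
}

With the pointer NULLed in the failure path, rxe_mr_cleanup() can keep a single unconditional free of mr->map without the KASAN-reported double free.
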
Ref for failed cherry-pick at: ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/5d122db2.failed If a send packet is dropped by the IP layer in rxe_requester() the call to rxe_xmit_packet() can fail with err == -EAGAIN. To recover, the state of the wqe is restored to the state before the packet was sent so it can be resent. However, the routines that save and restore the state miss a significnt part of the variable state in the wqe, the dma struct which is used to process through the sge table. And, the state is not saved before the packet is built which modifies the dma struct. Under heavy stress testing with many QPs on a fast node sending large messages to a slow node dropped packets are observed and the resent packets are corrupted because the dma struct was not restored. This patch fixes this behavior and allows the test cases to succeed. Fixes: 3050b9985024 ("IB/rxe: Fix race condition between requester and completer") Link: https://lore.kernel.org/r/20230721200748.4604-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe (cherry picked from commit 5d122db2ff80cd2aed4dcd630befb56b51ddf947) Signed-off-by: Jonathan Maple # Conflicts: # drivers/infiniband/sw/rxe/rxe_req.c --- .../5d122db2.failed | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/5d122db2.failed diff --git a/ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/5d122db2.failed b/ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/5d122db2.failed new file mode 100644 index 0000000000000..501f011d3f1ec --- /dev/null +++ b/ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/5d122db2.failed @@ -0,0 +1,90 @@ +RDMA/rxe: Fix incomplete state save in rxe_requester + +jira KERNEL-325 +cve CVE-2023-53539 +Rebuild_History Non-Buildable kernel-4.18.0-553.89.1.el8_10 +commit-author Bob Pearson +commit 5d122db2ff80cd2aed4dcd630befb56b51ddf947 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/5d122db2.failed + +If a send packet is dropped by the IP layer in rxe_requester() +the call to rxe_xmit_packet() can fail with err == -EAGAIN. +To recover, the state of the wqe is restored to the state before +the packet was sent so it can be resent. However, the routines +that save and restore the state miss a significnt part of the +variable state in the wqe, the dma struct which is used to process +through the sge table. And, the state is not saved before the packet +is built which modifies the dma struct. + +Under heavy stress testing with many QPs on a fast node sending +large messages to a slow node dropped packets are observed and +the resent packets are corrupted because the dma struct was not +restored. This patch fixes this behavior and allows the test cases +to succeed. 
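
The save/rollback described above is a snapshot transaction: capture every wqe field the send path may mutate, including the dma state, before the packet is built, and restore the snapshot when rxe_xmit_packet() returns -EAGAIN so the resend starts from unmodified state. A minimal sketch of that shape, with hypothetical stand-in types for the wqe:

struct dma_info { unsigned int cur_sge, sge_offset, resid; };

struct send_wqe {
    int state;
    unsigned int first_psn, last_psn;
    struct dma_info dma;    /* the member the original save/restore missed */
};

struct snapshot {
    struct send_wqe wqe;
    unsigned int psn;
};

/* Take the snapshot BEFORE the packet is built, since building advances dma. */
static void save_state(const struct send_wqe *wqe, unsigned int psn,
                       struct snapshot *s)
{
    s->wqe = *wqe;          /* struct copy captures the dma state too */
    s->psn = psn;
}

static void rollback_state(struct send_wqe *wqe, unsigned int *psn,
                           const struct snapshot *s)
{
    *wqe = s->wqe;
    *psn = s->psn;
}

The ordering is the point of the fix: because building the packet advances wqe->dma, a snapshot taken only at transmit time is already too late.
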
+ +Fixes: 3050b9985024 ("IB/rxe: Fix race condition between requester and completer") +Link: https://lore.kernel.org/r/20230721200748.4604-1-rpearsonhpe@gmail.com + Signed-off-by: Bob Pearson + Signed-off-by: Jason Gunthorpe +(cherry picked from commit 5d122db2ff80cd2aed4dcd630befb56b51ddf947) + Signed-off-by: Jonathan Maple + +# Conflicts: +# drivers/infiniband/sw/rxe/rxe_req.c +diff --cc drivers/infiniband/sw/rxe/rxe_req.c +index f63771207970,d8c41fd626a9..000000000000 +--- a/drivers/infiniband/sw/rxe/rxe_req.c ++++ b/drivers/infiniband/sw/rxe/rxe_req.c +@@@ -746,9 -799,12 +748,12 @@@ int rxe_requester(void *arg + pkt.mask = rxe_opcode[opcode].mask; + pkt.wqe = wqe; + ++ /* save wqe state before we build and send packet */ ++ save_state(wqe, qp, &rollback_wqe, &rollback_psn); ++ + av = rxe_get_av(&pkt, &ah); + if (unlikely(!av)) { + - rxe_dbg_qp(qp, "Failed no address vector\n"); + + pr_err("qp#%d Failed no address vector\n", qp_num(qp)); + wqe->status = IB_WC_LOC_QP_OP_ERR; + goto err; + } +@@@ -790,17 -840,23 +789,33 @@@ + + err = rxe_xmit_packet(qp, &pkt, skb); + if (err) { +++<<<<<<< HEAD + + qp->need_req_skb = 1; + + + + rollback_state(wqe, qp, &rollback_wqe, rollback_psn); + + + + if (err == -EAGAIN) { + + rxe_run_task(&qp->req.task, 1); + + goto exit; +++======= ++ if (err != -EAGAIN) { ++ wqe->status = IB_WC_LOC_QP_OP_ERR; ++ goto err; +++>>>>>>> 5d122db2ff80 (RDMA/rxe: Fix incomplete state save in rxe_requester) + } + +- wqe->status = IB_WC_LOC_QP_OP_ERR; +- goto err; ++ /* the packet was dropped so reset wqe to the state ++ * before we sent it so we can try to resend ++ */ ++ rollback_state(wqe, qp, &rollback_wqe, rollback_psn); ++ ++ /* force a delay until the dropped packet is freed and ++ * the send queue is drained below the low water mark ++ */ ++ qp->need_req_skb = 1; ++ ++ rxe_sched_task(&qp->req.task); ++ goto exit; + } + + update_state(qp, &pkt); +* Unmerged path drivers/infiniband/sw/rxe/rxe_req.c From f1edd62a78df32c12772b0e4bfe5618253fe49c5 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 11 Dec 2025 06:02:02 -0500 Subject: [PATCH 3/9] scsi: s390: zfcp: Ensure synchronous unit_add jira KERNEL-325 Rebuild_History Non-Buildable kernel-4.18.0-553.89.1.el8_10 commit-author Peter Oberparleiter commit 9697ca0d53e3db357be26d2414276143c4a2cd49 Improve the usability of the unit_add sysfs attribute by ensuring that the associated FCP LUN scan processing is completed synchronously. This enables configuration tooling to consistently determine the end of the scan process to allow for serialization of follow-on actions. While the scan process associated with unit_add typically completes synchronously, it is deferred to an asynchronous background process if unit_add is used before initial remote port scanning has completed. This occurs when unit_add is used immediately after setting the associated FCP device online. To ensure synchronous unit_add processing, wait for remote port scanning to complete before initiating the FCP LUN scan. Cc: stable@vger.kernel.org Reviewed-by: M Nikhil Reviewed-by: Nihar Panda Signed-off-by: Peter Oberparleiter Signed-off-by: Nihar Panda Link: https://lore.kernel.org/r/20250603182252.2287285-2-niharp@linux.ibm.com Signed-off-by: Martin K. 
Petersen (cherry picked from commit 9697ca0d53e3db357be26d2414276143c4a2cd49) Signed-off-by: Jonathan Maple --- drivers/s390/scsi/zfcp_sysfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/s390/scsi/zfcp_sysfs.c b/drivers/s390/scsi/zfcp_sysfs.c index b8cd75a872eeb..8ca46e4aa8d63 100644 --- a/drivers/s390/scsi/zfcp_sysfs.c +++ b/drivers/s390/scsi/zfcp_sysfs.c @@ -450,6 +450,8 @@ static ssize_t zfcp_sysfs_unit_add_store(struct device *dev, if (kstrtoull(buf, 0, (unsigned long long *) &fcp_lun)) return -EINVAL; + flush_work(&port->rport_work); + retval = zfcp_unit_add(port, fcp_lun); if (retval) return retval; From d0b9331e49e61ef33ab7c746d122940f89eecdb4 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 11 Dec 2025 06:02:02 -0500 Subject: [PATCH 4/9] gfs2: do_xmote cleanup jira KERNEL-325 Rebuild_History Non-Buildable kernel-4.18.0-553.89.1.el8_10 commit-author Andreas Gruenbacher commit 2309a01351e56446f43c89e200d643647d47e739 Check for asynchronous completion and clear the GLF_PENDING_REPLY flag earlier in do_xmote(). This will make future changes more readable. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price (cherry picked from commit 2309a01351e56446f43c89e200d643647d47e739) Signed-off-by: Jonathan Maple --- fs/gfs2/glock.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index aa0255232985a..de182757764bc 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -821,6 +821,12 @@ __acquires(&gl->gl_lockref.lock) ret = ls->ls_ops->lm_lock(gl, target, lck_flags); spin_lock(&gl->gl_lockref.lock); + if (!ret) { + /* The operation will be completed asynchronously. */ + return; + } + clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); + if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED && target == LM_ST_UNLOCKED && test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) { @@ -828,14 +834,10 @@ __acquires(&gl->gl_lockref.lock) * The lockspace has been released and the lock has * been unlocked implicitly. */ - } else if (ret) { + } else { fs_err(sdp, "lm_lock ret %d\n", ret); target = gl->gl_state | LM_OUT_ERROR; - } else { - /* The operation will be completed asynchronously. */ - return; } - clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); } /* Complete the operation now. */ From 95c46c3a25b65eaf80e0e926d7a3d884d95ad7fd Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 11 Dec 2025 06:02:03 -0500 Subject: [PATCH 5/9] gfs2: Add proper lockspace locking jira KERNEL-325 Rebuild_History Non-Buildable kernel-4.18.0-553.89.1.el8_10 commit-author Andreas Gruenbacher commit 6ab26555c9ffef96c56ca16356e55ac5ab61ec93 Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. Ref for failed cherry-pick at: ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/6ab26555.failed GFS2 has been calling functions like dlm_lock() even after the lockspace that these functions operate on has been released with dlm_release_lockspace(). It has always assumed that those functions would return -EINVAL in that case, but that was never guaranteed, and it certainly is no longer the case since commit 4db41bf4f04f ("dlm: remove ls_local_handle from struct dlm_ls"). To fix that, add proper lockspace locking. 
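
A way to read the locking this patch adds: ls_sem is a reader/writer guard on the lifetime of ls->ls_dlm. Every dlm_lock()/dlm_unlock() caller takes it shared and re-checks the pointer under the lock, falling back to -ENODEV, while the unmount path takes it exclusive around dlm_release_lockspace() and clears the pointer. A userspace pthread model of the same shape (hypothetical names):

#include <pthread.h>
#include <errno.h>
#include <stddef.h>

struct lockspace { int dummy; };

static struct lockspace the_ls;
static pthread_rwlock_t ls_sem = PTHREAD_RWLOCK_INITIALIZER;
static struct lockspace *ls_dlm = &the_ls;  /* NULL once released */

static int do_dlm_op(struct lockspace *ls) { (void)ls; return 0; }

/* Caller-side pattern: shared lock, NULL re-check, -ENODEV if released. */
static int guarded_lock_op(void)
{
    int ret = -ENODEV;

    pthread_rwlock_rdlock(&ls_sem);
    if (ls_dlm != NULL)
        ret = do_dlm_op(ls_dlm);
    pthread_rwlock_unlock(&ls_sem);
    return ret;
}

/* Teardown-side pattern: exclusive lock so no caller holds a stale pointer. */
static void release_lockspace(void)
{
    pthread_rwlock_wrlock(&ls_sem);
    ls_dlm = NULL;          /* after this, callers see -ENODEV */
    pthread_rwlock_unlock(&ls_sem);
}

This is also why the glock.c hunk in this patch switches the "lockspace released, lock implicitly unlocked" check from -EINVAL to -ENODEV: the guard now reports the released state explicitly instead of relying on dlm_unlock() failing a stale handle.
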
Fixes: 3e11e5304150 ("GFS2: ignore unlock failures after withdraw") Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price (cherry picked from commit 6ab26555c9ffef96c56ca16356e55ac5ab61ec93) Signed-off-by: Jonathan Maple # Conflicts: # fs/gfs2/file.c # fs/gfs2/lock_dlm.c --- .../6ab26555.failed | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/6ab26555.failed diff --git a/ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/6ab26555.failed b/ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/6ab26555.failed new file mode 100644 index 0000000000000..bd20c597dd752 --- /dev/null +++ b/ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/6ab26555.failed @@ -0,0 +1,155 @@ +gfs2: Add proper lockspace locking + +jira KERNEL-325 +Rebuild_History Non-Buildable kernel-4.18.0-553.89.1.el8_10 +commit-author Andreas Gruenbacher +commit 6ab26555c9ffef96c56ca16356e55ac5ab61ec93 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/6ab26555.failed + +GFS2 has been calling functions like dlm_lock() even after the lockspace +that these functions operate on has been released with +dlm_release_lockspace(). It has always assumed that those functions +would return -EINVAL in that case, but that was never guaranteed, and it +certainly is no longer the case since commit 4db41bf4f04f ("dlm: remove +ls_local_handle from struct dlm_ls"). + +To fix that, add proper lockspace locking. + +Fixes: 3e11e5304150 ("GFS2: ignore unlock failures after withdraw") + Signed-off-by: Andreas Gruenbacher + Reviewed-by: Andrew Price +(cherry picked from commit 6ab26555c9ffef96c56ca16356e55ac5ab61ec93) + Signed-off-by: Jonathan Maple + +# Conflicts: +# fs/gfs2/file.c +# fs/gfs2/lock_dlm.c +diff --cc fs/gfs2/file.c +index dd07e0f60888,bc67fa058c84..000000000000 +--- a/fs/gfs2/file.c ++++ b/fs/gfs2/file.c +@@@ -1414,28 -1442,44 +1414,46 @@@ static int gfs2_lock(struct file *file + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); + struct lm_lockstruct *ls = &sdp->sd_lockstruct; ++ int ret; + + - if (!(fl->c.flc_flags & FL_POSIX)) + + if (!(fl->fl_flags & FL_POSIX)) + + return -ENOLCK; + + if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK) + return -ENOLCK; + - if (gfs2_withdrawing_or_withdrawn(sdp)) { + - if (lock_is_unlock(fl)) + + + + if (cmd == F_CANCELLK) { + + /* Hack: */ + + cmd = F_SETLK; + + fl->fl_type = F_UNLCK; + + } + + if (unlikely(gfs2_withdrawn(sdp))) { + + if (fl->fl_type == F_UNLCK) + locks_lock_file_wait(file, fl); + return -EIO; + } +++<<<<<<< HEAD + + if (IS_GETLK(cmd)) + + return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); + + else if (fl->fl_type == F_UNLCK) + + return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); + + else + + return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); +++======= ++ down_read(&ls->ls_sem); ++ ret = -ENODEV; ++ if (likely(ls->ls_dlm != NULL)) { ++ if (cmd == F_CANCELLK) ++ ret = dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl); ++ else if (IS_GETLK(cmd)) ++ ret = dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); ++ else if (lock_is_unlock(fl)) ++ ret = dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); ++ else ++ ret = dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); ++ } ++ up_read(&ls->ls_sem); ++ return ret; + -} + - + -static void __flock_holder_uninit(struct file 
*file, struct gfs2_holder *fl_gh) + -{ + - struct gfs2_glock *gl = gfs2_glock_hold(fl_gh->gh_gl); + - + - /* + - * Make sure gfs2_glock_put() won't sleep under the file->f_lock + - * spinlock. + - */ + - + - spin_lock(&file->f_lock); + - gfs2_holder_uninit(fl_gh); + - spin_unlock(&file->f_lock); + - gfs2_glock_put(gl); +++>>>>>>> 6ab26555c9ff (gfs2: Add proper lockspace locking) + } + + static int do_flock(struct file *file, int cmd, struct file_lock *fl) +diff --cc fs/gfs2/lock_dlm.c +index 0579fdbc9c63,3c7db20ce564..000000000000 +--- a/fs/gfs2/lock_dlm.c ++++ b/fs/gfs2/lock_dlm.c +@@@ -334,9 -363,17 +339,19 @@@ static void gdlm_put_lock(struct gfs2_g + return; + } + + - if (gl->gl_lksb.sb_lvbptr) + - flags |= DLM_LKF_VALBLK; + - + again: +++<<<<<<< HEAD + + error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, + + NULL, gl); +++======= ++ down_read(&ls->ls_sem); ++ error = -ENODEV; ++ if (likely(ls->ls_dlm != NULL)) { ++ error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, flags, ++ NULL, gl); ++ } ++ up_read(&ls->ls_sem); +++>>>>>>> 6ab26555c9ff (gfs2: Add proper lockspace locking) + if (error == -EBUSY) { + msleep(20); + goto again; +* Unmerged path fs/gfs2/file.c +diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c +index de182757764b..8581006a1023 100644 +--- a/fs/gfs2/glock.c ++++ b/fs/gfs2/glock.c +@@ -827,9 +827,8 @@ __acquires(&gl->gl_lockref.lock) + } + clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); + +- if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED && +- target == LM_ST_UNLOCKED && +- test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) { ++ if (ret == -ENODEV && gl->gl_target == LM_ST_UNLOCKED && ++ target == LM_ST_UNLOCKED) { + /* + * The lockspace has been released and the lock has + * been unlocked implicitly. +diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h +index 6eca783f0691..36018c48b8f2 100644 +--- a/fs/gfs2/incore.h ++++ b/fs/gfs2/incore.h +@@ -665,6 +665,8 @@ struct lm_lockstruct { + struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ + char *ls_lvb_bits; + ++ struct rw_semaphore ls_sem; ++ + spinlock_t ls_recover_spin; /* protects following fields */ + unsigned long ls_recover_flags; /* DFL_ */ + uint32_t ls_recover_mount; /* gen in first recover_done cb */ +* Unmerged path fs/gfs2/lock_dlm.c From 6c07643bbe3d3c48bd1962ad334d7dfd61f7f4a9 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 11 Dec 2025 06:02:03 -0500 Subject: [PATCH 6/9] mm/memcg: revert ("mm/memcg: optimize user context object stock access") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira KERNEL-325 cve CVE-2023-53401 Rebuild_History Non-Buildable kernel-4.18.0-553.89.1.el8_10 commit-author Michal Hocko commit fead2b869764f89d524b79dc8862e61d5191be55 Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. Ref for failed cherry-pick at: ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/fead2b86.failed Patch series "mm/memcg: Address PREEMPT_RT problems instead of disabling it", v5. This series aims to address the memcg related problem on PREEMPT_RT. I tested them on CONFIG_PREEMPT and CONFIG_PREEMPT_RT with the tools/testing/selftests/cgroup/* tests and I haven't observed any regressions (other than the lockdep report that is already there). This patch (of 6): The optimisation is based on a micro benchmark where local_irq_save() is more expensive than a preempt_disable(). 
There is no evidence that it is visible in a real-world workload and there are CPUs where the opposite is true (local_irq_save() is cheaper than preempt_disable()). Based on micro benchmarks, the optimisation makes sense on PREEMPT_NONE where preempt_disable() is optimized away. There is no improvement with PREEMPT_DYNAMIC since the preemption counter is always available. The optimization makes also the PREEMPT_RT integration more complicated since most of the assumption are not true on PREEMPT_RT. Revert the optimisation since it complicates the PREEMPT_RT integration and the improvement is hardly visible. [bigeasy@linutronix.de: patch body around Michal's diff] Link: https://lkml.kernel.org/r/20220226204144.1008339-1-bigeasy@linutronix.de Link: https://lore.kernel.org/all/YgOGkXXCrD%2F1k+p4@dhcp22.suse.cz Link: https://lkml.kernel.org/r/YdX+INO9gQje6d0S@linutronix.de Link: https://lkml.kernel.org/r/20220226204144.1008339-2-bigeasy@linutronix.de Signed-off-by: Michal Hocko Signed-off-by: Sebastian Andrzej Siewior Acked-by: Roman Gushchin Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Waiman Long Cc: kernel test robot Cc: Michal Hocko Cc: Michal Koutný Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit fead2b869764f89d524b79dc8862e61d5191be55) Signed-off-by: Jonathan Maple # Conflicts: # mm/memcontrol.c --- .../fead2b86.failed | 167 ++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/fead2b86.failed diff --git a/ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/fead2b86.failed b/ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/fead2b86.failed new file mode 100644 index 0000000000000..9bc5e719fb4a7 --- /dev/null +++ b/ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/fead2b86.failed @@ -0,0 +1,167 @@ +mm/memcg: revert ("mm/memcg: optimize user context object stock access") + +jira KERNEL-325 +cve CVE-2023-53401 +Rebuild_History Non-Buildable kernel-4.18.0-553.89.1.el8_10 +commit-author Michal Hocko +commit fead2b869764f89d524b79dc8862e61d5191be55 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/fead2b86.failed + +Patch series "mm/memcg: Address PREEMPT_RT problems instead of disabling it", v5. + +This series aims to address the memcg related problem on PREEMPT_RT. + +I tested them on CONFIG_PREEMPT and CONFIG_PREEMPT_RT with the +tools/testing/selftests/cgroup/* tests and I haven't observed any +regressions (other than the lockdep report that is already there). + +This patch (of 6): + +The optimisation is based on a micro benchmark where local_irq_save() is +more expensive than a preempt_disable(). There is no evidence that it +is visible in a real-world workload and there are CPUs where the +opposite is true (local_irq_save() is cheaper than preempt_disable()). + +Based on micro benchmarks, the optimisation makes sense on PREEMPT_NONE +where preempt_disable() is optimized away. There is no improvement with +PREEMPT_DYNAMIC since the preemption counter is always available. + +The optimization makes also the PREEMPT_RT integration more complicated +since most of the assumption are not true on PREEMPT_RT. + +Revert the optimisation since it complicates the PREEMPT_RT integration +and the improvement is hardly visible. 
+ +[bigeasy@linutronix.de: patch body around Michal's diff] + +Link: https://lkml.kernel.org/r/20220226204144.1008339-1-bigeasy@linutronix.de +Link: https://lore.kernel.org/all/YgOGkXXCrD%2F1k+p4@dhcp22.suse.cz +Link: https://lkml.kernel.org/r/YdX+INO9gQje6d0S@linutronix.de +Link: https://lkml.kernel.org/r/20220226204144.1008339-2-bigeasy@linutronix.de + Signed-off-by: Michal Hocko + Signed-off-by: Sebastian Andrzej Siewior + Acked-by: Roman Gushchin + Acked-by: Johannes Weiner + Reviewed-by: Shakeel Butt + Acked-by: Michal Hocko + Cc: Johannes Weiner + Cc: Peter Zijlstra + Cc: Thomas Gleixner + Cc: Vladimir Davydov + Cc: Waiman Long + Cc: kernel test robot + Cc: Michal Hocko + Cc: Michal Koutný + Signed-off-by: Andrew Morton + Signed-off-by: Linus Torvalds +(cherry picked from commit fead2b869764f89d524b79dc8862e61d5191be55) + Signed-off-by: Jonathan Maple + +# Conflicts: +# mm/memcontrol.c +diff --cc mm/memcontrol.c +index 6e2a077af4c1,7bf204b2b053..000000000000 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@@ -2232,18 -2061,27 +2232,21 @@@ static void __unlock_page_memcg(struct + } + + /** + - * folio_memcg_unlock - Release the binding between a folio and its memcg. + - * @folio: The folio. + - * + - * This releases the binding created by folio_memcg_lock(). This does + - * not change the accounting of this folio to its memcg, but it does + - * permit others to change it. + + * unlock_page_memcg - unlock a page and memcg binding + + * @page: the page + */ + -void folio_memcg_unlock(struct folio *folio) + -{ + - __folio_memcg_unlock(folio_memcg(folio)); + -} + - + void unlock_page_memcg(struct page *page) + { + - folio_memcg_unlock(page_folio(page)); + + struct page *head = compound_head(page); + + + + __unlock_page_memcg(page_memcg(head)); + } + +EXPORT_SYMBOL(unlock_page_memcg); + +- struct obj_stock { ++ struct memcg_stock_pcp { ++ struct mem_cgroup *cached; /* this never be root cgroup */ ++ unsigned int nr_pages; ++ + #ifdef CONFIG_MEMCG_KMEM + struct obj_cgroup *cached_objcg; + struct pglist_data *cached_pgdat; +@@@ -2269,12 -2098,13 +2263,12 @@@ static DEFINE_PER_CPU(struct memcg_stoc + static DEFINE_MUTEX(percpu_charge_mutex); + + #ifdef CONFIG_MEMCG_KMEM +- static void drain_obj_stock(struct obj_stock *stock); ++ static void drain_obj_stock(struct memcg_stock_pcp *stock); + static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + struct mem_cgroup *root_memcg); + -static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages); + + #else +- static inline void drain_obj_stock(struct obj_stock *stock) ++ static inline void drain_obj_stock(struct memcg_stock_pcp *stock) + { + } + static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, +@@@ -7219,22 -6782,21 +7180,30 @@@ static void uncharge_batch(const struc + css_put(&ug->memcg->css); + } + + -static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) + +static void uncharge_page(struct page *page, struct uncharge_gather *ug) + { + - long nr_pages; + + unsigned long nr_pages; + struct mem_cgroup *memcg; + struct obj_cgroup *objcg; +++<<<<<<< HEAD + + bool use_objcg = PageMemcgKmem(page); +++======= +++>>>>>>> fead2b869764 (mm/memcg: revert ("mm/memcg: optimize user context object stock access")) + + - VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + + VM_BUG_ON_PAGE(PageLRU(page), page); + + /* + * Nobody should be changing or seriously looking at + - * folio memcg or objcg at this point, we have fully + - * exclusive access to the folio. 
+ + * page memcg or objcg at this point, we have fully + + * exclusive access to the page. + */ +++<<<<<<< HEAD + + if (use_objcg) { + + objcg = __page_objcg(page); +++======= ++ if (folio_memcg_kmem(folio)) { ++ objcg = __folio_objcg(folio); +++>>>>>>> fead2b869764 (mm/memcg: revert ("mm/memcg: optimize user context object stock access")) + /* + * This get matches the put at the end of the function and + * kmem pages do not hold memcg references anymore. +@@@ -7259,9 -6821,9 +7228,9 @@@ + css_get(&memcg->css); + } + + - nr_pages = folio_nr_pages(folio); + + nr_pages = compound_nr(page); + +- if (use_objcg) { ++ if (folio_memcg_kmem(folio)) { + ug->nr_memory += nr_pages; + ug->nr_kmem += nr_pages; + +* Unmerged path mm/memcontrol.c From 86dfd781e956005792fd1e0297c6693879bcf779 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 11 Dec 2025 06:02:03 -0500 Subject: [PATCH 7/9] mm: kmem: fix a NULL pointer dereference in obj_stock_flush_required() jira KERNEL-325 cve CVE-2023-53401 Rebuild_History Non-Buildable kernel-4.18.0-553.89.1.el8_10 commit-author Roman Gushchin commit 3b8abb3239530c423c0b97e42af7f7e856e1ee96 Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. Ref for failed cherry-pick at: ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/3b8abb32.failed KCSAN found an issue in obj_stock_flush_required(): stock->cached_objcg can be reset between the check and dereference: ================================================================== BUG: KCSAN: data-race in drain_all_stock / drain_obj_stock write to 0xffff888237c2a2f8 of 8 bytes by task 19625 on cpu 0: drain_obj_stock+0x408/0x4e0 mm/memcontrol.c:3306 refill_obj_stock+0x9c/0x1e0 mm/memcontrol.c:3340 obj_cgroup_uncharge+0xe/0x10 mm/memcontrol.c:3408 memcg_slab_free_hook mm/slab.h:587 [inline] __cache_free mm/slab.c:3373 [inline] __do_kmem_cache_free mm/slab.c:3577 [inline] kmem_cache_free+0x105/0x280 mm/slab.c:3602 __d_free fs/dcache.c:298 [inline] dentry_free fs/dcache.c:375 [inline] __dentry_kill+0x422/0x4a0 fs/dcache.c:621 dentry_kill+0x8d/0x1e0 dput+0x118/0x1f0 fs/dcache.c:913 __fput+0x3bf/0x570 fs/file_table.c:329 ____fput+0x15/0x20 fs/file_table.c:349 task_work_run+0x123/0x160 kernel/task_work.c:179 resume_user_mode_work include/linux/resume_user_mode.h:49 [inline] exit_to_user_mode_loop+0xcf/0xe0 kernel/entry/common.c:171 exit_to_user_mode_prepare+0x6a/0xa0 kernel/entry/common.c:203 __syscall_exit_to_user_mode_work kernel/entry/common.c:285 [inline] syscall_exit_to_user_mode+0x26/0x140 kernel/entry/common.c:296 do_syscall_64+0x4d/0xc0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffff888237c2a2f8 of 8 bytes by task 19632 on cpu 1: obj_stock_flush_required mm/memcontrol.c:3319 [inline] drain_all_stock+0x174/0x2a0 mm/memcontrol.c:2361 try_charge_memcg+0x6d0/0xd10 mm/memcontrol.c:2703 try_charge mm/memcontrol.c:2837 [inline] mem_cgroup_charge_skmem+0x51/0x140 mm/memcontrol.c:7290 sock_reserve_memory+0xb1/0x390 net/core/sock.c:1025 sk_setsockopt+0x800/0x1e70 net/core/sock.c:1525 udp_lib_setsockopt+0x99/0x6c0 net/ipv4/udp.c:2692 udp_setsockopt+0x73/0xa0 net/ipv4/udp.c:2817 sock_common_setsockopt+0x61/0x70 net/core/sock.c:3668 __sys_setsockopt+0x1c3/0x230 net/socket.c:2271 __do_sys_setsockopt net/socket.c:2282 [inline] __se_sys_setsockopt net/socket.c:2279 [inline] __x64_sys_setsockopt+0x66/0x80 net/socket.c:2279 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 
entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0xffff8881382d52c0 -> 0xffff888138893740 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 19632 Comm: syz-executor.0 Not tainted 6.3.0-rc2-syzkaller-00387-g534293368afa #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/02/2023 Fix it by using READ_ONCE()/WRITE_ONCE() for all accesses to stock->cached_objcg. Link: https://lkml.kernel.org/r/20230502160839.361544-1-roman.gushchin@linux.dev Fixes: bf4f059954dc ("mm: memcg/slab: obj_cgroup API") Signed-off-by: Roman Gushchin Reported-by: syzbot+774c29891415ab0fd29d@syzkaller.appspotmail.com Reported-by: Dmitry Vyukov Link: https://lore.kernel.org/linux-mm/CACT4Y+ZfucZhM60YPphWiCLJr6+SGFhT+jjm8k1P-a_8Kkxsjg@mail.gmail.com/T/#t Reviewed-by: Yosry Ahmed Acked-by: Shakeel Butt Reviewed-by: Dmitry Vyukov Signed-off-by: Andrew Morton (cherry picked from commit 3b8abb3239530c423c0b97e42af7f7e856e1ee96) Signed-off-by: Jonathan Maple # Conflicts: # mm/memcontrol.c --- .../3b8abb32.failed | 203 ++++++++++++++++++ 1 file changed, 203 insertions(+) create mode 100644 ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/3b8abb32.failed diff --git a/ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/3b8abb32.failed b/ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/3b8abb32.failed new file mode 100644 index 0000000000000..f88bd657e0d24 --- /dev/null +++ b/ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/3b8abb32.failed @@ -0,0 +1,203 @@ +mm: kmem: fix a NULL pointer dereference in obj_stock_flush_required() + +jira KERNEL-325 +cve CVE-2023-53401 +Rebuild_History Non-Buildable kernel-4.18.0-553.89.1.el8_10 +commit-author Roman Gushchin +commit 3b8abb3239530c423c0b97e42af7f7e856e1ee96 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. 
Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/3b8abb32.failed + +KCSAN found an issue in obj_stock_flush_required(): +stock->cached_objcg can be reset between the check and dereference: + +================================================================== +BUG: KCSAN: data-race in drain_all_stock / drain_obj_stock + +write to 0xffff888237c2a2f8 of 8 bytes by task 19625 on cpu 0: + drain_obj_stock+0x408/0x4e0 mm/memcontrol.c:3306 + refill_obj_stock+0x9c/0x1e0 mm/memcontrol.c:3340 + obj_cgroup_uncharge+0xe/0x10 mm/memcontrol.c:3408 + memcg_slab_free_hook mm/slab.h:587 [inline] + __cache_free mm/slab.c:3373 [inline] + __do_kmem_cache_free mm/slab.c:3577 [inline] + kmem_cache_free+0x105/0x280 mm/slab.c:3602 + __d_free fs/dcache.c:298 [inline] + dentry_free fs/dcache.c:375 [inline] + __dentry_kill+0x422/0x4a0 fs/dcache.c:621 + dentry_kill+0x8d/0x1e0 + dput+0x118/0x1f0 fs/dcache.c:913 + __fput+0x3bf/0x570 fs/file_table.c:329 + ____fput+0x15/0x20 fs/file_table.c:349 + task_work_run+0x123/0x160 kernel/task_work.c:179 + resume_user_mode_work include/linux/resume_user_mode.h:49 [inline] + exit_to_user_mode_loop+0xcf/0xe0 kernel/entry/common.c:171 + exit_to_user_mode_prepare+0x6a/0xa0 kernel/entry/common.c:203 + __syscall_exit_to_user_mode_work kernel/entry/common.c:285 [inline] + syscall_exit_to_user_mode+0x26/0x140 kernel/entry/common.c:296 + do_syscall_64+0x4d/0xc0 arch/x86/entry/common.c:86 + entry_SYSCALL_64_after_hwframe+0x63/0xcd + +read to 0xffff888237c2a2f8 of 8 bytes by task 19632 on cpu 1: + obj_stock_flush_required mm/memcontrol.c:3319 [inline] + drain_all_stock+0x174/0x2a0 mm/memcontrol.c:2361 + try_charge_memcg+0x6d0/0xd10 mm/memcontrol.c:2703 + try_charge mm/memcontrol.c:2837 [inline] + mem_cgroup_charge_skmem+0x51/0x140 mm/memcontrol.c:7290 + sock_reserve_memory+0xb1/0x390 net/core/sock.c:1025 + sk_setsockopt+0x800/0x1e70 net/core/sock.c:1525 + udp_lib_setsockopt+0x99/0x6c0 net/ipv4/udp.c:2692 + udp_setsockopt+0x73/0xa0 net/ipv4/udp.c:2817 + sock_common_setsockopt+0x61/0x70 net/core/sock.c:3668 + __sys_setsockopt+0x1c3/0x230 net/socket.c:2271 + __do_sys_setsockopt net/socket.c:2282 [inline] + __se_sys_setsockopt net/socket.c:2279 [inline] + __x64_sys_setsockopt+0x66/0x80 net/socket.c:2279 + do_syscall_x64 arch/x86/entry/common.c:50 [inline] + do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 + entry_SYSCALL_64_after_hwframe+0x63/0xcd + +value changed: 0xffff8881382d52c0 -> 0xffff888138893740 + +Reported by Kernel Concurrency Sanitizer on: +CPU: 1 PID: 19632 Comm: syz-executor.0 Not tainted 6.3.0-rc2-syzkaller-00387-g534293368afa #0 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/02/2023 + +Fix it by using READ_ONCE()/WRITE_ONCE() for all accesses to +stock->cached_objcg. 
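
The bug class here is a lockless check-then-use: the flush path loads stock->cached_objcg once for the NULL check and again for the dereference, and drain_obj_stock() on another CPU can rewrite the field in between. Loading the pointer a single time with READ_ONCE() makes both steps operate on the same value. A distilled sketch (objcg_is_descendant() is a hypothetical stand-in for the obj_cgroup_memcg()/mem_cgroup_is_descendant() pair):

/* Buggy shape: two racy loads of the shared pointer. */
static bool flush_required_racy(struct memcg_stock_pcp *stock,
				struct mem_cgroup *root)
{
	if (stock->cached_objcg)				/* load #1 */
		return objcg_is_descendant(stock->cached_objcg, root); /* load #2 */
	return false;
}

/* Fixed shape: one READ_ONCE() load, then only the local is used. */
static bool flush_required(struct memcg_stock_pcp *stock,
			   struct mem_cgroup *root)
{
	struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);

	return objcg && objcg_is_descendant(objcg, root);
}
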
+ +Link: https://lkml.kernel.org/r/20230502160839.361544-1-roman.gushchin@linux.dev +Fixes: bf4f059954dc ("mm: memcg/slab: obj_cgroup API") + Signed-off-by: Roman Gushchin + Reported-by: syzbot+774c29891415ab0fd29d@syzkaller.appspotmail.com + Reported-by: Dmitry Vyukov + Link: https://lore.kernel.org/linux-mm/CACT4Y+ZfucZhM60YPphWiCLJr6+SGFhT+jjm8k1P-a_8Kkxsjg@mail.gmail.com/T/#t + Reviewed-by: Yosry Ahmed + Acked-by: Shakeel Butt + Reviewed-by: Dmitry Vyukov + Signed-off-by: Andrew Morton +(cherry picked from commit 3b8abb3239530c423c0b97e42af7f7e856e1ee96) + Signed-off-by: Jonathan Maple + +# Conflicts: +# mm/memcontrol.c +diff --cc mm/memcontrol.c +index 6e2a077af4c1,c823c35c2ed4..000000000000 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@@ -3272,8 -3208,8 +3272,13 @@@ void mod_objcg_state(struct obj_cgroup + * accumulating over a page of vmstat data or when pgdat or idx + * changes. + */ +++<<<<<<< HEAD + + if (stock->cached_objcg != objcg) { + + drain_obj_stock(stock); +++======= ++ if (READ_ONCE(stock->cached_objcg) != objcg) { ++ old = drain_obj_stock(stock); +++>>>>>>> 3b8abb323953 (mm: kmem: fix a NULL pointer dereference in obj_stock_flush_required()) + obj_cgroup_get(objcg); + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) + ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; +@@@ -3322,11 -3260,14 +3327,18 @@@ + + static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) + { + - struct memcg_stock_pcp *stock; + unsigned long flags; + + struct obj_stock *stock = get_obj_stock(&flags); + bool ret = false; + +++<<<<<<< HEAD + + if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { +++======= ++ local_lock_irqsave(&memcg_stock.stock_lock, flags); ++ ++ stock = this_cpu_ptr(&memcg_stock); ++ if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { +++>>>>>>> 3b8abb323953 (mm: kmem: fix a NULL pointer dereference in obj_stock_flush_required()) + stock->nr_bytes -= nr_bytes; + ret = true; + } +@@@ -3336,12 -3277,12 +3348,12 @@@ + return ret; + } + + -static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) + +static void drain_obj_stock(struct obj_stock *stock) + { +- struct obj_cgroup *old = stock->cached_objcg; ++ struct obj_cgroup *old = READ_ONCE(stock->cached_objcg); + + if (!old) + - return NULL; + + return; + + if (stock->nr_bytes) { + unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; +@@@ -3383,22 -3332,22 +3395,37 @@@ + stock->cached_pgdat = NULL; + } + +++<<<<<<< HEAD + + obj_cgroup_put(old); + + stock->cached_objcg = NULL; +++======= ++ WRITE_ONCE(stock->cached_objcg, NULL); ++ /* ++ * The `old' objects needs to be released by the caller via ++ * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. 
++ */ ++ return old; +++>>>>>>> 3b8abb323953 (mm: kmem: fix a NULL pointer dereference in obj_stock_flush_required()) + } + + static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + struct mem_cgroup *root_memcg) + { ++ struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg); + struct mem_cgroup *memcg; + +++<<<<<<< HEAD + + if (in_task() && stock->task_obj.cached_objcg) { + + memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg); + + if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) + + return true; + + } + + if (stock->irq_obj.cached_objcg) { + + memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg); +++======= ++ if (objcg) { ++ memcg = obj_cgroup_memcg(objcg); +++>>>>>>> 3b8abb323953 (mm: kmem: fix a NULL pointer dereference in obj_stock_flush_required()) + if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) + return true; + } +@@@ -3409,14 -3358,18 +3436,22 @@@ + static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, + bool allow_uncharge) + { + - struct memcg_stock_pcp *stock; + - struct obj_cgroup *old = NULL; + unsigned long flags; + + struct obj_stock *stock = get_obj_stock(&flags); + unsigned int nr_pages = 0; + +++<<<<<<< HEAD + + if (stock->cached_objcg != objcg) { /* reset if necessary */ + + drain_obj_stock(stock); +++======= ++ local_lock_irqsave(&memcg_stock.stock_lock, flags); ++ ++ stock = this_cpu_ptr(&memcg_stock); ++ if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ ++ old = drain_obj_stock(stock); +++>>>>>>> 3b8abb323953 (mm: kmem: fix a NULL pointer dereference in obj_stock_flush_required()) + obj_cgroup_get(objcg); +- stock->cached_objcg = objcg; ++ WRITE_ONCE(stock->cached_objcg, objcg); + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) + ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; + allow_uncharge = true; /* Allow uncharge when objcg changes */ +* Unmerged path mm/memcontrol.c From 85b96797a08412b0f7d2aa916927f5a868b99044 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 11 Dec 2025 06:02:04 -0500 Subject: [PATCH 8/9] mm: memcg: use READ_ONCE()/WRITE_ONCE() to access stock->cached jira KERNEL-325 Rebuild_History Non-Buildable kernel-4.18.0-553.89.1.el8_10 commit-author Roman Gushchin commit f785a8f21a9cc46fced9f53c51a6f2dc647ed484 A memcg pointer in the percpu stock can be accessed by drain_all_stock() from another cpu in a lockless way. In theory it might lead to an issue, similar to the one which has been discovered with stock->cached_objcg, where the pointer was zeroed between the check for being NULL and dereferencing. In this case the issue is unlikely a real problem, but to make it bulletproof and similar to stock->cached_objcg, let's annotate all accesses to stock->cached with READ_ONCE()/WTRITE_ONCE(). 
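
The same single-load discipline, applied here to the plain mem_cgroup pointer: writers that run with IRQs disabled on the owning CPU pair their stores with WRITE_ONCE(), and the lockless cross-CPU reader in drain_all_stock() takes one annotated load into a local before using it. Abridged from the diff below (stock_matches() is a hypothetical distillation of the reader):

/* Writer side (owning CPU, IRQs off): pair every store with WRITE_ONCE(). */
static void drain_stock(struct memcg_stock_pcp *stock)
{
	struct mem_cgroup *old = READ_ONCE(stock->cached);

	if (!old)
		return;
	/* ... return stocked pages, drop the css reference ... */
	WRITE_ONCE(stock->cached, NULL);
}

/* Reader side (any CPU, lockless): one annotated load, then locals only. */
static bool stock_matches(struct memcg_stock_pcp *stock,
			  struct mem_cgroup *root_memcg)
{
	struct mem_cgroup *memcg = READ_ONCE(stock->cached);

	return memcg && mem_cgroup_is_descendant(memcg, root_memcg);
}
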
Link: https://lkml.kernel.org/r/20230502160839.361544-2-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Shakeel Butt Cc: Dmitry Vyukov Cc: Yosry Ahmed Signed-off-by: Andrew Morton (cherry picked from commit f785a8f21a9cc46fced9f53c51a6f2dc647ed484) Signed-off-by: Jonathan Maple --- mm/memcontrol.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6e2a077af4c16..f64d93284c340 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2307,7 +2307,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) local_irq_save(flags); stock = this_cpu_ptr(&memcg_stock); - if (memcg == stock->cached && stock->nr_pages >= nr_pages) { + if (memcg == READ_ONCE(stock->cached) && stock->nr_pages >= nr_pages) { stock->nr_pages -= nr_pages; ret = true; } @@ -2322,7 +2322,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) */ static void drain_stock(struct memcg_stock_pcp *stock) { - struct mem_cgroup *old = stock->cached; + struct mem_cgroup *old = READ_ONCE(stock->cached); if (!old) return; @@ -2335,7 +2335,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) } css_put(&old->css); - stock->cached = NULL; + WRITE_ONCE(stock->cached, NULL); } static void drain_local_stock(struct work_struct *dummy) @@ -2371,10 +2371,10 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) local_irq_save(flags); stock = this_cpu_ptr(&memcg_stock); - if (stock->cached != memcg) { /* reset if necessary */ + if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */ drain_stock(stock); css_get(&memcg->css); - stock->cached = memcg; + WRITE_ONCE(stock->cached, memcg); } stock->nr_pages += nr_pages; @@ -2408,7 +2408,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) bool flush = false; rcu_read_lock(); - memcg = stock->cached; + memcg = READ_ONCE(stock->cached); if (memcg && stock->nr_pages && mem_cgroup_is_descendant(memcg, root_memcg)) flush = true; From a8a2edf9cea8524bb4bb1e99d6021af5e75d6a41 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 11 Dec 2025 06:02:14 -0500 Subject: [PATCH 9/9] Rebuild rocky8_10 with kernel-4.18.0-553.89.1.el8_10 Rebuild_History BUILDABLE Rebuilding Kernel from rpm changelog with Fuzz Limit: 87.50% Number of commits in upstream range v4.18~1..kernel-mainline: 581414 Number of commits in rpm: 14 Number of commits matched with upstream: 8 (57.14%) Number of commits in upstream but not in rpm: 581406 Number of commits NOT found in upstream: 6 (42.86%) Rebuilding Kernel on Branch rocky8_10_rebuild_kernel-4.18.0-553.89.1.el8_10 for kernel-4.18.0-553.89.1.el8_10 Clean Cherry Picks: 4 (50.00%) Empty Cherry Picks: 4 (50.00%) _______________________________ Full Details Located here: ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/rebuild.details.txt Includes: * git commit header above * Empty Commits with upstream SHA * RPM ChangeLog Entries that could not be matched Individual Empty Commit failures contained in the same containing directory. The git message for empty commits will have the path for the failed commit. 
File names are the first 8 characters of the upstream SHA --- Makefile.rhelver | 2 +- .../rebuild.details.txt | 26 +++++ configs/kernel-4.18.0-aarch64-debug.config | 2 +- configs/kernel-4.18.0-aarch64.config | 2 +- configs/kernel-4.18.0-ppc64le-debug.config | 2 +- configs/kernel-4.18.0-ppc64le.config | 2 +- configs/kernel-4.18.0-s390x-debug.config | 2 +- configs/kernel-4.18.0-s390x-zfcpdump.config | 2 +- configs/kernel-4.18.0-s390x.config | 2 +- configs/kernel-4.18.0-x86_64-debug.config | 2 +- configs/kernel-4.18.0-x86_64.config | 2 +- drivers/infiniband/sw/rxe/rxe_req.c | 45 ++++---- fs/gfs2/file.c | 19 +++- fs/gfs2/glock.c | 5 +- fs/gfs2/incore.h | 2 + fs/gfs2/lock_dlm.c | 46 ++++++-- mm/memcontrol.c | 104 ++++++------------ 17 files changed, 148 insertions(+), 119 deletions(-) create mode 100644 ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/rebuild.details.txt diff --git a/Makefile.rhelver b/Makefile.rhelver index 47a6bb1e4f1b9..76dfd0e62db5d 100644 --- a/Makefile.rhelver +++ b/Makefile.rhelver @@ -12,7 +12,7 @@ RHEL_MINOR = 10 # # Use this spot to avoid future merge conflicts. # Do not trim this comment. -RHEL_RELEASE = 553.87.1 +RHEL_RELEASE = 553.89.1 # # ZSTREAM diff --git a/ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/rebuild.details.txt b/ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/rebuild.details.txt new file mode 100644 index 0000000000000..ac9cf26b13415 --- /dev/null +++ b/ciq/ciq_backports/kernel-4.18.0-553.89.1.el8_10/rebuild.details.txt @@ -0,0 +1,26 @@ +Rebuild_History BUILDABLE +Rebuilding Kernel from rpm changelog with Fuzz Limit: 87.50% +Number of commits in upstream range v4.18~1..kernel-mainline: 581414 +Number of commits in rpm: 14 +Number of commits matched with upstream: 8 (57.14%) +Number of commits in upstream but not in rpm: 581406 +Number of commits NOT found in upstream: 6 (42.86%) + +Rebuilding Kernel on Branch rocky8_10_rebuild_kernel-4.18.0-553.89.1.el8_10 for kernel-4.18.0-553.89.1.el8_10 +Clean Cherry Picks: 4 (50.00%) +Empty Cherry Picks: 4 (50.00%) +_______________________________ + +__EMPTY COMMITS__________________________ +5d122db2ff80cd2aed4dcd630befb56b51ddf947 RDMA/rxe: Fix incomplete state save in rxe_requester +6ab26555c9ffef96c56ca16356e55ac5ab61ec93 gfs2: Add proper lockspace locking +fead2b869764f89d524b79dc8862e61d5191be55 mm/memcg: revert ("mm/memcg: optimize user context object stock access") +3b8abb3239530c423c0b97e42af7f7e856e1ee96 mm: kmem: fix a NULL pointer dereference in obj_stock_flush_required() + +__CHANGES NOT IN UPSTREAM________________ +Adding prod certs and changed cert date to 20210620 +Adding Rocky secure boot certs +Fixing vmlinuz removal +Fixing UEFI CA path +Porting to 8.10, debranding and Rocky branding +Fixing pesign_key_name values diff --git a/configs/kernel-4.18.0-aarch64-debug.config b/configs/kernel-4.18.0-aarch64-debug.config index 255d2aa8334b0..4653989a09fbc 100644 --- a/configs/kernel-4.18.0-aarch64-debug.config +++ b/configs/kernel-4.18.0-aarch64-debug.config @@ -5,7 +5,7 @@ # # -# Compiler: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-5) +# Compiler: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-11) # CONFIG_ARM64=y CONFIG_64BIT=y diff --git a/configs/kernel-4.18.0-aarch64.config b/configs/kernel-4.18.0-aarch64.config index 43aa0941d9e53..3a92f34063d74 100644 --- a/configs/kernel-4.18.0-aarch64.config +++ b/configs/kernel-4.18.0-aarch64.config @@ -5,7 +5,7 @@ # # -# Compiler: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-5) +# Compiler: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-11) # CONFIG_ARM64=y 
CONFIG_64BIT=y diff --git a/configs/kernel-4.18.0-ppc64le-debug.config b/configs/kernel-4.18.0-ppc64le-debug.config index e972ad333334f..91ab00c30f4cc 100644 --- a/configs/kernel-4.18.0-ppc64le-debug.config +++ b/configs/kernel-4.18.0-ppc64le-debug.config @@ -5,7 +5,7 @@ # # -# Compiler: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-5) +# Compiler: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-11) # CONFIG_PPC64=y diff --git a/configs/kernel-4.18.0-ppc64le.config b/configs/kernel-4.18.0-ppc64le.config index 497d687931d2f..d44cfdbd007a4 100644 --- a/configs/kernel-4.18.0-ppc64le.config +++ b/configs/kernel-4.18.0-ppc64le.config @@ -5,7 +5,7 @@ # # -# Compiler: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-5) +# Compiler: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-11) # CONFIG_PPC64=y diff --git a/configs/kernel-4.18.0-s390x-debug.config b/configs/kernel-4.18.0-s390x-debug.config index b8133c4154bed..f930210e545d8 100644 --- a/configs/kernel-4.18.0-s390x-debug.config +++ b/configs/kernel-4.18.0-s390x-debug.config @@ -5,7 +5,7 @@ # # -# Compiler: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-5) +# Compiler: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-11) # CONFIG_MMU=y CONFIG_ZONE_DMA=y diff --git a/configs/kernel-4.18.0-s390x-zfcpdump.config b/configs/kernel-4.18.0-s390x-zfcpdump.config index d164adf552e14..c076043ca8466 100644 --- a/configs/kernel-4.18.0-s390x-zfcpdump.config +++ b/configs/kernel-4.18.0-s390x-zfcpdump.config @@ -5,7 +5,7 @@ # # -# Compiler: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-5) +# Compiler: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-11) # CONFIG_MMU=y CONFIG_ZONE_DMA=y diff --git a/configs/kernel-4.18.0-s390x.config b/configs/kernel-4.18.0-s390x.config index f0be5118f855b..9423959add43e 100644 --- a/configs/kernel-4.18.0-s390x.config +++ b/configs/kernel-4.18.0-s390x.config @@ -5,7 +5,7 @@ # # -# Compiler: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-5) +# Compiler: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-11) # CONFIG_MMU=y CONFIG_ZONE_DMA=y diff --git a/configs/kernel-4.18.0-x86_64-debug.config b/configs/kernel-4.18.0-x86_64-debug.config index 8c8a005c1e9ff..30086a3d7ef0a 100644 --- a/configs/kernel-4.18.0-x86_64-debug.config +++ b/configs/kernel-4.18.0-x86_64-debug.config @@ -5,7 +5,7 @@ # # -# Compiler: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-5) +# Compiler: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-11) # CONFIG_64BIT=y CONFIG_X86_64=y diff --git a/configs/kernel-4.18.0-x86_64.config b/configs/kernel-4.18.0-x86_64.config index a3441083602f2..4c9255b6f937c 100644 --- a/configs/kernel-4.18.0-x86_64.config +++ b/configs/kernel-4.18.0-x86_64.config @@ -5,7 +5,7 @@ # # -# Compiler: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-5) +# Compiler: gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-11) # CONFIG_64BIT=y CONFIG_X86_64=y diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index f637712079705..d66bebc80fcf6 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -529,10 +529,11 @@ static void save_state(struct rxe_send_wqe *wqe, struct rxe_send_wqe *rollback_wqe, u32 *rollback_psn) { - rollback_wqe->state = wqe->state; + rollback_wqe->state = wqe->state; rollback_wqe->first_psn = wqe->first_psn; - rollback_wqe->last_psn = wqe->last_psn; - *rollback_psn = qp->req.psn; + rollback_wqe->last_psn = wqe->last_psn; + rollback_wqe->dma = wqe->dma; + *rollback_psn = qp->req.psn; } static void rollback_state(struct rxe_send_wqe *wqe, @@ -540,10 +541,11 @@ static void rollback_state(struct rxe_send_wqe *wqe, struct rxe_send_wqe 
*rollback_wqe, u32 rollback_psn) { - wqe->state = rollback_wqe->state; + wqe->state = rollback_wqe->state; wqe->first_psn = rollback_wqe->first_psn; - wqe->last_psn = rollback_wqe->last_psn; - qp->req.psn = rollback_psn; + wqe->last_psn = rollback_wqe->last_psn; + wqe->dma = rollback_wqe->dma; + qp->req.psn = rollback_psn; } static void update_state(struct rxe_qp *qp, struct rxe_pkt_info *pkt) @@ -746,6 +748,9 @@ int rxe_requester(void *arg) pkt.mask = rxe_opcode[opcode].mask; pkt.wqe = wqe; + /* save wqe state before we build and send packet */ + save_state(wqe, qp, &rollback_wqe, &rollback_psn); + av = rxe_get_av(&pkt, &ah); if (unlikely(!av)) { pr_err("qp#%d Failed no address vector\n", qp_num(qp)); @@ -778,29 +783,29 @@ int rxe_requester(void *arg) if (ah) rxe_put(ah); - /* - * To prevent a race on wqe access between requester and completer, - * wqe members state and psn need to be set before calling - * rxe_xmit_packet(). - * Otherwise, completer might initiate an unjustified retry flow. - */ - save_state(wqe, qp, &rollback_wqe, &rollback_psn); + /* update wqe state as though we had sent it */ update_wqe_state(qp, wqe, &pkt); update_wqe_psn(qp, wqe, &pkt, payload); err = rxe_xmit_packet(qp, &pkt, skb); if (err) { - qp->need_req_skb = 1; + if (err != -EAGAIN) { + wqe->status = IB_WC_LOC_QP_OP_ERR; + goto err; + } + /* the packet was dropped so reset wqe to the state + * before we sent it so we can try to resend + */ rollback_state(wqe, qp, &rollback_wqe, rollback_psn); - if (err == -EAGAIN) { - rxe_run_task(&qp->req.task, 1); - goto exit; - } + /* force a delay until the dropped packet is freed and + * the send queue is drained below the low water mark + */ + qp->need_req_skb = 1; - wqe->status = IB_WC_LOC_QP_OP_ERR; - goto err; + rxe_run_task(&qp->req.task, 1); + goto exit; } update_state(qp, &pkt); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index dd07e0f60888a..6aabdb450ddd9 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1414,6 +1414,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int ret; if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; @@ -1430,12 +1431,18 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) locks_lock_file_wait(file, fl); return -EIO; } - if (IS_GETLK(cmd)) - return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); - else if (fl->fl_type == F_UNLCK) - return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); - else - return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); + down_read(&ls->ls_sem); + ret = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + if (IS_GETLK(cmd)) + ret = dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (fl->fl_type == F_UNLCK) + ret = dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); + else + ret = dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); + } + up_read(&ls->ls_sem); + return ret; } static int do_flock(struct file *file, int cmd, struct file_lock *fl) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index de182757764bc..8581006a10230 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -827,9 +827,8 @@ __acquires(&gl->gl_lockref.lock) } clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); - if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED && - target == LM_ST_UNLOCKED && - test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) { + if (ret == -ENODEV && gl->gl_target == LM_ST_UNLOCKED && + 
target == LM_ST_UNLOCKED) { /* * The lockspace has been released and the lock has * been unlocked implicitly. diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 6eca783f0691d..36018c48b8f22 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -665,6 +665,8 @@ struct lm_lockstruct { struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ char *ls_lvb_bits; + struct rw_semaphore ls_sem; + spinlock_t ls_recover_spin; /* protects following fields */ unsigned long ls_recover_flags; /* DFL_ */ uint32_t ls_recover_mount; /* gen in first recover_done cb */ diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 0579fdbc9c633..fabc797f5ed5c 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -294,8 +294,13 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, */ again: - error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, - GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, + GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); + } + up_read(&ls->ls_sem); if (error == -EBUSY) { msleep(20); goto again; @@ -335,8 +340,13 @@ static void gdlm_put_lock(struct gfs2_glock *gl) } again: - error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, - NULL, gl); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, + DLM_LKF_VALBLK, NULL, gl); + } + up_read(&ls->ls_sem); if (error == -EBUSY) { msleep(20); goto again; @@ -352,7 +362,12 @@ static void gdlm_put_lock(struct gfs2_glock *gl) static void gdlm_cancel(struct gfs2_glock *gl) { struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; - dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); + + down_read(&ls->ls_sem); + if (likely(ls->ls_dlm != NULL)) { + dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); + } + up_read(&ls->ls_sem); } /* @@ -533,7 +548,11 @@ static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name) struct lm_lockstruct *ls = &sdp->sd_lockstruct; int error; - error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) + error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls); + up_read(&ls->ls_sem); if (error) { fs_err(sdp, "%s lkid %x error %d\n", name, lksb->sb_lkid, error); @@ -560,9 +579,14 @@ static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags, memset(strname, 0, GDLM_STRNAME_BYTES); snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num); - error = dlm_lock(ls->ls_dlm, mode, lksb, flags, - strname, GDLM_STRNAME_BYTES - 1, - 0, sync_wait_cb, ls, NULL); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + error = dlm_lock(ls->ls_dlm, mode, lksb, flags, + strname, GDLM_STRNAME_BYTES - 1, + 0, sync_wait_cb, ls, NULL); + } + up_read(&ls->ls_sem); if (error) { fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n", name, lksb->sb_lkid, flags, mode, error); @@ -1288,6 +1312,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) */ INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func); + ls->ls_dlm = NULL; spin_lock_init(&ls->ls_recover_spin); ls->ls_recover_flags = 0; ls->ls_recover_mount = 0; @@ -1322,6 +1347,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) * create/join lockspace */ + init_rwsem(&ls->ls_sem); 
 	error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE,
 				  &gdlm_lockspace_ops, sdp, &ops_result,
 				  &ls->ls_dlm);
@@ -1401,10 +1427,12 @@ static void gdlm_unmount(struct gfs2_sbd *sdp)
 
 	/* mounted_lock and control_lock will be purged in dlm recovery */
 release:
+	down_write(&ls->ls_sem);
 	if (ls->ls_dlm) {
 		dlm_release_lockspace(ls->ls_dlm, 2);
 		ls->ls_dlm = NULL;
 	}
+	up_write(&ls->ls_sem);
 
 	free_recover_size(ls);
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f64d93284c340..6bcc5a1cfd39c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2243,23 +2243,17 @@ void unlock_page_memcg(struct page *page)
 }
 EXPORT_SYMBOL(unlock_page_memcg);
 
-struct obj_stock {
+struct memcg_stock_pcp {
+	struct mem_cgroup *cached; /* this never be root cgroup */
+	unsigned int nr_pages;
+
 #ifdef CONFIG_MEMCG_KMEM
 	struct obj_cgroup *cached_objcg;
 	struct pglist_data *cached_pgdat;
 	unsigned int nr_bytes;
 	int nr_slab_reclaimable_b;
 	int nr_slab_unreclaimable_b;
-#else
-	int dummy[0];
 #endif
-};
-
-struct memcg_stock_pcp {
-	struct mem_cgroup *cached; /* this never be root cgroup */
-	unsigned int nr_pages;
-
-	struct obj_stock task_obj;
-	struct obj_stock irq_obj;
+
 	struct work_struct work;
 	unsigned long flags;
@@ -2269,12 +2263,12 @@ static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
 static DEFINE_MUTEX(percpu_charge_mutex);
 
 #ifdef CONFIG_MEMCG_KMEM
-static void drain_obj_stock(struct obj_stock *stock);
+static void drain_obj_stock(struct memcg_stock_pcp *stock);
 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
 				     struct mem_cgroup *root_memcg);
 
 #else
-static inline void drain_obj_stock(struct obj_stock *stock)
+static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
 {
 }
 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
@@ -2350,9 +2344,7 @@ static void drain_local_stock(struct work_struct *dummy)
 	local_irq_save(flags);
 
 	stock = this_cpu_ptr(&memcg_stock);
-	drain_obj_stock(&stock->irq_obj);
-	if (in_task())
-		drain_obj_stock(&stock->task_obj);
+	drain_obj_stock(stock);
 	drain_stock(stock);
 	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 
@@ -2950,41 +2942,6 @@ static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
  */
 #define OBJCGS_CLEAR_MASK	(__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
 
-/*
- * Most kmem_cache_alloc() calls are from user context. The irq disable/enable
- * sequence used in this case to access content from object stock is slow.
- * To optimize for user context access, there are now two object stocks for
- * task context and interrupt context access respectively.
- *
- * The task context object stock can be accessed by disabling preemption only
- * which is cheap in non-preempt kernel. The interrupt context object stock
- * can only be accessed after disabling interrupt. User context code can
- * access interrupt object stock, but not vice versa.
- */
-static inline struct obj_stock *get_obj_stock(unsigned long *pflags)
-{
-	struct memcg_stock_pcp *stock;
-
-	if (likely(in_task())) {
-		*pflags = 0UL;
-		preempt_disable();
-		stock = this_cpu_ptr(&memcg_stock);
-		return &stock->task_obj;
-	}
-
-	local_irq_save(*pflags);
-	stock = this_cpu_ptr(&memcg_stock);
-	return &stock->irq_obj;
-}
-
-static inline void put_obj_stock(unsigned long flags)
-{
-	if (likely(in_task()))
-		preempt_enable();
-	else
-		local_irq_restore(flags);
-}
-
 /*
  * mod_objcg_mlstate() may be called with irq enabled, so
  * mod_memcg_lruvec_state() should be used.
@@ -3263,21 +3220,24 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
 void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
 		     enum node_stat_item idx, int nr)
 {
+	struct memcg_stock_pcp *stock;
 	unsigned long flags;
-	struct obj_stock *stock = get_obj_stock(&flags);
 	int *bytes;
 
+	local_irq_save(flags);
+	stock = this_cpu_ptr(&memcg_stock);
+
 	/*
 	 * Save vmstat data in stock and skip vmstat array update unless
 	 * accumulating over a page of vmstat data or when pgdat or idx
 	 * changes.
 	 */
-	if (stock->cached_objcg != objcg) {
+	if (READ_ONCE(stock->cached_objcg) != objcg) {
 		drain_obj_stock(stock);
 		obj_cgroup_get(objcg);
 		stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) ?
 				atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
-		stock->cached_objcg = objcg;
+		WRITE_ONCE(stock->cached_objcg, objcg);
 		stock->cached_pgdat = pgdat;
 	} else if (stock->cached_pgdat != pgdat) {
 		/* Flush the existing cached vmstat data */
@@ -3317,28 +3277,31 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
 	if (nr)
 		mod_objcg_mlstate(objcg, pgdat, idx, nr);
 
-	put_obj_stock(flags);
+	local_irq_restore(flags);
 }
 
 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
 {
+	struct memcg_stock_pcp *stock;
 	unsigned long flags;
-	struct obj_stock *stock = get_obj_stock(&flags);
 	bool ret = false;
 
-	if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
+	local_irq_save(flags);
+
+	stock = this_cpu_ptr(&memcg_stock);
+	if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
 		stock->nr_bytes -= nr_bytes;
 		ret = true;
 	}
 
-	put_obj_stock(flags);
+	local_irq_restore(flags);
 
 	return ret;
 }
 
-static void drain_obj_stock(struct obj_stock *stock)
+static void drain_obj_stock(struct memcg_stock_pcp *stock)
 {
-	struct obj_cgroup *old = stock->cached_objcg;
+	struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
 
 	if (!old)
 		return;
@@ -3384,21 +3347,17 @@
 	}
 
 	obj_cgroup_put(old);
-	stock->cached_objcg = NULL;
+	WRITE_ONCE(stock->cached_objcg, NULL);
 }
 
 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
 				     struct mem_cgroup *root_memcg)
 {
+	struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
 	struct mem_cgroup *memcg;
 
-	if (in_task() && stock->task_obj.cached_objcg) {
-		memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg);
-		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
-			return true;
-	}
-	if (stock->irq_obj.cached_objcg) {
-		memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg);
+	if (objcg) {
+		memcg = obj_cgroup_memcg(objcg);
 		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
 			return true;
 	}
@@ -3409,14 +3368,17 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
 			     bool allow_uncharge)
 {
+	struct memcg_stock_pcp *stock;
 	unsigned long flags;
-	struct obj_stock *stock = get_obj_stock(&flags);
 	unsigned int nr_pages = 0;
 
-	if (stock->cached_objcg != objcg) { /* reset if necessary */
+	local_irq_save(flags);
+
+	stock = this_cpu_ptr(&memcg_stock);
+	if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
 		drain_obj_stock(stock);
 		obj_cgroup_get(objcg);
-		stock->cached_objcg = objcg;
+		WRITE_ONCE(stock->cached_objcg, objcg);
 		stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) ?
 				atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
 		allow_uncharge = true;	/* Allow uncharge when objcg changes */
@@ -3428,7 +3390,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
 		stock->nr_bytes &= (PAGE_SIZE - 1);
 	}
 
-	put_obj_stock(flags);
+	local_irq_restore(flags);
 
 	if (nr_pages)
 		obj_cgroup_uncharge_pages(objcg, nr_pages);
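
Note on the lock_dlm.c hunks above: every dlm_lock()/dlm_unlock() call site now
takes ls->ls_sem for read and re-checks ls->ls_dlm under the semaphore, falling
back to -ENODEV once gdlm_unmount() has taken the semaphore for write, released
the lockspace, and cleared the pointer. A minimal userspace sketch of the same
guard pattern, with pthread_rwlock_t standing in for the kernel rw_semaphore
(all names below are illustrative, not from the patch):

/* Sketch: guarding a teardown-able handle with a rwlock, as the
 * lock_dlm.c hunks do with ls->ls_sem around ls->ls_dlm. */
#include <pthread.h>
#include <stdio.h>

struct lockspace {
	pthread_rwlock_t sem;	/* plays the role of ls->ls_sem */
	void *dlm;		/* plays the role of ls->ls_dlm */
};

/* Reader side: take the lock shared, re-check the handle, and fail
 * cleanly (here -1, standing in for -ENODEV) if it is already gone. */
static int ls_op(struct lockspace *ls)
{
	int error = -1;

	pthread_rwlock_rdlock(&ls->sem);
	if (ls->dlm != NULL)
		error = 0;	/* a dlm_lock()/dlm_unlock() call would go here */
	pthread_rwlock_unlock(&ls->sem);
	return error;
}

/* Writer side: unmount takes the lock exclusively, releases the
 * lockspace, and NULLs the pointer so late readers fail cleanly. */
static void ls_teardown(struct lockspace *ls)
{
	pthread_rwlock_wrlock(&ls->sem);
	if (ls->dlm != NULL) {
		/* dlm_release_lockspace(ls->dlm, 2) would go here */
		ls->dlm = NULL;
	}
	pthread_rwlock_unlock(&ls->sem);
}

int main(void)
{
	static int handle;
	struct lockspace ls = { .dlm = &handle };

	pthread_rwlock_init(&ls.sem, NULL);
	printf("before teardown: %d\n", ls_op(&ls));	/* 0  */
	ls_teardown(&ls);
	printf("after teardown:  %d\n", ls_op(&ls));	/* -1 */
	pthread_rwlock_destroy(&ls.sem);
	return 0;
}

Readers never block each other, so the hot locking paths stay concurrent; only
teardown is exclusive, which matches how rarely gdlm_unmount() runs.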
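Note on the memcontrol.c hunks: with the task_obj/irq_obj split removed, every
object-stock access goes back to local_irq_save() plus
this_cpu_ptr(&memcg_stock), and cached_objcg is annotated with
READ_ONCE()/WRITE_ONCE() because obj_stock_flush_required() reads it from
another context without the stock's irq protection. A rough userspace model of
the consume_obj_stock() fast path, using C11 relaxed atomics in place of
READ_ONCE()/WRITE_ONCE() and a single static stock in place of the per-CPU
variable (illustrative only, not kernel code):

/* Sketch: the patched consume_obj_stock() fast path in isolation. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct objcg;	/* identity only; never dereferenced */

struct stock_model {
	_Atomic(struct objcg *) cached_objcg;	/* READ_ONCE()/WRITE_ONCE() stand-in */
	unsigned int nr_bytes;
};

static struct stock_model stock;	/* kernel: one of these per CPU */

/* Consume nr_bytes from the stock iff it caches this objcg. */
static bool consume_obj_stock(struct objcg *objcg, unsigned int nr_bytes)
{
	bool ret = false;

	/* kernel: local_irq_save(flags); stock = this_cpu_ptr(&memcg_stock); */
	if (objcg == atomic_load_explicit(&stock.cached_objcg,
					  memory_order_relaxed) &&
	    stock.nr_bytes >= nr_bytes) {
		stock.nr_bytes -= nr_bytes;
		ret = true;
	}
	/* kernel: local_irq_restore(flags); */
	return ret;
}

int main(void)
{
	struct objcg *a = (struct objcg *)&stock;	/* any non-NULL identity */
	bool ok;

	atomic_store_explicit(&stock.cached_objcg, a, memory_order_relaxed);
	stock.nr_bytes = 100;

	ok = consume_obj_stock(a, 60);
	printf("consume 60: %s, %u bytes left\n", ok ? "hit" : "miss", stock.nr_bytes);
	ok = consume_obj_stock(a, 60);
	printf("consume 60: %s, %u bytes left\n", ok ? "hit" : "miss", stock.nr_bytes);
	return 0;
}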
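The byte-to-page rollover in refill_obj_stock() is unchanged by the hunks above
but easy to misread in diff form: once the cached nr_bytes exceeds a page,
whole pages are handed back via obj_cgroup_uncharge_pages() and only the
sub-page remainder stays cached. A standalone sketch of that arithmetic,
assuming 4 KiB pages (PAGE_SHIFT is hard-coded to 12 here and the
allow_uncharge gate is elided):

/* Sketch: the refill_obj_stock() byte accounting in isolation. */
#include <stdio.h>

#define PAGE_SHIFT 12			/* assuming 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

struct stock_model {
	unsigned long nr_bytes;		/* models stock->nr_bytes */
};

/* Returns the number of whole pages the caller would uncharge. */
static unsigned long refill(struct stock_model *stock, unsigned long nr_bytes)
{
	unsigned long nr_pages = 0;

	stock->nr_bytes += nr_bytes;
	if (stock->nr_bytes > PAGE_SIZE) {	/* same test as refill_obj_stock() */
		nr_pages = stock->nr_bytes >> PAGE_SHIFT;
		stock->nr_bytes &= (PAGE_SIZE - 1);
	}
	return nr_pages;
}

int main(void)
{
	struct stock_model stock = { 0 };
	unsigned long pages;

	pages = refill(&stock, 3000);	/* 3000 <= 4096: nothing uncharged  */
	printf("refill 3000: uncharge %lu page(s), keep %lu bytes\n",
	       pages, stock.nr_bytes);
	pages = refill(&stock, 3000);	/* 6000 -> 1 page + 1904 bytes kept */
	printf("refill 3000: uncharge %lu page(s), keep %lu bytes\n",
	       pages, stock.nr_bytes);
	return 0;
}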