Persistent buffer cleanups and simplifications for v6.15:

It was mistakenly believed that the physical memory returned from
 "reserve_mem" had to be vmap()'d to get to it from a virtual address. But
 reserve_mem already maps the memory to the virtual address of the kernel,
 so a simple phys_to_virt() can be used to get to the virtual address from
 the physical memory returned by "reserve_mem". With this newfound
 knowledge, the code can be cleaned up and simplified.
 
 - Enforce that the persistent memory is page aligned
 
   As the buffers using the persistent memory are all going to be
   mapped via pages, make sure that the memory given to the tracing
   infrastructure is page aligned. If it is not, it will print a warning
   and fail to map the buffer.
 
 - Use phys_to_virt() to get the virtual address from reserve_mem
 
   Instead of calling vmap() on the physical memory returned from
   "reserve_mem", use phys_to_virt() instead.
 
   The memory returned by "memmap", or by any other means where a physical
   address is given to the tracing infrastructure, still needs to be
   vmap()'d. Since this memory can never be returned back to the buddy
   allocator nor should it ever be memory mapped to user space, flag
   this buffer and up the ref count. The ref count will keep it from
   ever being freed, and the flag will prevent it from ever being memory
   mapped to user space.
 
 - Use vmap_page_range() for memmap virtual address mapping
 
   For the memmap buffer, instead of allocating an array of struct pages,
   assigning them to the contiguous physical memory and then passing that to
   vmap(), use vmap_page_range() instead.
 
 - Replace flush_dcache_folio() with flush_kernel_vmap_range()
 
   Instead of calling virt_to_folio() and passing that to
   flush_dcache_folio(), just call flush_kernel_vmap_range() directly.
   This also fixes a bug where if a subbuffer was bigger than PAGE_SIZE
   only the PAGE_SIZE portion would be flushed.
 -----BEGIN PGP SIGNATURE-----
 
 iIoEABYIADIWIQRRSw7ePDh/lE+zeZMp5XQQmuv6qgUCZ+6oZRQccm9zdGVkdEBn
 b29kbWlzLm9yZwAKCRAp5XQQmuv6qhq6AP481KHAgaowQCg7zrKPkMlbYBIigYoU
 7aqoAg2rSLBRSQEAl8fViHZgZ9Q+O7xdozQWiIR7/KQW8VIaTcP/V7cHkAU=
 =+5JB
 -----END PGP SIGNATURE-----

Merge tag 'trace-ringbuffer-v6.15-3' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace

Pull ring-buffer updates from Steven Rostedt:
 "Persistent buffer cleanups and simplifications.

  It was mistakenly believed that the physical memory returned from
  "reserve_mem" had to be vmap()'d to get to it from a virtual address.
  But reserve_mem already maps the memory to the virtual address of the
  kernel, so a simple phys_to_virt() can be used to get to the virtual
  address from the physical memory returned by "reserve_mem". With this
  newfound knowledge, the code can be cleaned up and simplified.

   - Enforce that the persistent memory is page aligned

     As the buffers using the persistent memory are all going to be
     mapped via pages, make sure that the memory given to the tracing
     infrastructure is page aligned. If it is not, it will print a
     warning and fail to map the buffer.

   - Use phys_to_virt() to get the virtual address from reserve_mem

     Instead of calling vmap() on the physical memory returned from
     "reserve_mem", use phys_to_virt() instead.

     The memory returned by "memmap", or by any other means where a
     physical address is given to the tracing infrastructure, still
     needs to be vmap()'d. Since this memory can never be returned back to
     the buddy allocator nor should it ever be memory mapped to user
     space, flag this buffer and up the ref count. The ref count will
     keep it from ever being freed, and the flag will prevent it from
     ever being memory mapped to user space.

   - Use vmap_page_range() for memmap virtual address mapping

     For the memmap buffer, instead of allocating an array of struct
     pages, assigning them to the contiguous physical memory and then
     passing that to vmap(), use vmap_page_range() instead.

   - Replace flush_dcache_folio() with flush_kernel_vmap_range()

     Instead of calling virt_to_folio() and passing that to
     flush_dcache_folio(), just call flush_kernel_vmap_range() directly.
     This also fixes a bug where if a subbuffer was bigger than
     PAGE_SIZE only the PAGE_SIZE portion would be flushed"

* tag 'trace-ringbuffer-v6.15-3' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
  ring-buffer: Use flush_kernel_vmap_range() over flush_dcache_folio()
  tracing: Use vmap_page_range() to map memmap ring buffer
  tracing: Have reserve_mem use phys_to_virt() and separate from memmap buffer
  tracing: Enforce the persistent ring buffer to be page aligned
This commit is contained in:
Linus Torvalds 2025-04-03 16:09:29 -07:00
commit 6cb0bd94c0
5 changed files with 51 additions and 27 deletions

View File

@ -7288,6 +7288,8 @@
This is just one of many ways that can clear memory. Make sure your system
keeps the content of memory across reboots before relying on this option.
NB: Both the mapped address and size must be page aligned for the architecture.
See also Documentation/trace/debugging.rst

View File

@ -136,6 +136,8 @@ kernel, so only the same kernel is guaranteed to work if the mapping is
preserved. Switching to a different kernel version may find a different
layout and mark the buffer as invalid.
NB: Both the mapped address and size must be page aligned for the architecture.
Using trace_printk() in the boot instance
-----------------------------------------
By default, the content of trace_printk() goes into the top level tracing

View File

@ -6016,7 +6016,7 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
meta->read = cpu_buffer->read;
/* Some archs do not have data cache coherency between kernel and user-space */
flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page));
flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE);
}
static void
@ -7319,7 +7319,8 @@ consume:
out:
/* Some archs do not have data cache coherency between kernel and user-space */
flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page));
flush_kernel_vmap_range(cpu_buffer->reader_page->page,
buffer->subbuf_size + BUF_PAGE_HDR_SIZE);
rb_update_meta_page(cpu_buffer);

View File

@ -50,6 +50,7 @@
#include <linux/irq_work.h>
#include <linux/workqueue.h>
#include <linux/sort.h>
#include <linux/io.h> /* vmap_page_range() */
#include <asm/setup.h> /* COMMAND_LINE_SIZE */
@ -8500,6 +8501,10 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
struct trace_iterator *iter = &info->iter;
int ret = 0;
/* A memmap'ed buffer is not supported for user space mmap */
if (iter->tr->flags & TRACE_ARRAY_FL_MEMMAP)
return -ENODEV;
/* Currently the boot mapped buffer is not supported for mmap */
if (iter->tr->flags & TRACE_ARRAY_FL_BOOT)
return -ENODEV;
@ -9609,9 +9614,6 @@ static void free_trace_buffers(struct trace_array *tr)
#ifdef CONFIG_TRACER_MAX_TRACE
free_trace_buffer(&tr->max_buffer);
#endif
if (tr->range_addr_start)
vunmap((void *)tr->range_addr_start);
}
static void init_trace_flags_index(struct trace_array *tr)
@ -9804,29 +9806,27 @@ static int instance_mkdir(const char *name)
return ret;
}
static u64 map_pages(u64 start, u64 size)
static u64 map_pages(unsigned long start, unsigned long size)
{
struct page **pages;
phys_addr_t page_start;
unsigned int page_count;
unsigned int i;
void *vaddr;
unsigned long vmap_start, vmap_end;
struct vm_struct *area;
int ret;
page_count = DIV_ROUND_UP(size, PAGE_SIZE);
page_start = start;
pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
if (!pages)
area = get_vm_area(size, VM_IOREMAP);
if (!area)
return 0;
for (i = 0; i < page_count; i++) {
phys_addr_t addr = page_start + i * PAGE_SIZE;
pages[i] = pfn_to_page(addr >> PAGE_SHIFT);
}
vaddr = vmap(pages, page_count, VM_MAP, PAGE_KERNEL);
kfree(pages);
vmap_start = (unsigned long) area->addr;
vmap_end = vmap_start + size;
return (u64)(unsigned long)vaddr;
ret = vmap_page_range(vmap_start, vmap_end,
start, pgprot_nx(PAGE_KERNEL));
if (ret < 0) {
free_vm_area(area);
return 0;
}
return (u64)vmap_start;
}
/**
@ -10705,6 +10705,7 @@ static inline void do_allocate_snapshot(const char *name) { }
__init static void enable_instances(void)
{
struct trace_array *tr;
bool memmap_area = false;
char *curr_str;
char *name;
char *str;
@ -10773,6 +10774,7 @@ __init static void enable_instances(void)
name);
continue;
}
memmap_area = true;
} else if (tok) {
if (!reserve_mem_find_by_name(tok, &start, &size)) {
start = 0;
@ -10783,7 +10785,20 @@ __init static void enable_instances(void)
}
if (start) {
addr = map_pages(start, size);
/* Start and size must be page aligned */
if (start & ~PAGE_MASK) {
pr_warn("Tracing: mapping start addr %pa is not page aligned\n", &start);
continue;
}
if (size & ~PAGE_MASK) {
pr_warn("Tracing: mapping size %pa is not page aligned\n", &size);
continue;
}
if (memmap_area)
addr = map_pages(start, size);
else
addr = (unsigned long)phys_to_virt(start);
if (addr) {
pr_info("Tracing: mapped boot instance %s at physical memory %pa of size 0x%lx\n",
name, &start, (unsigned long)size);
@ -10810,10 +10825,13 @@ __init static void enable_instances(void)
update_printk_trace(tr);
/*
* If start is set, then this is a mapped buffer, and
* cannot be deleted by user space, so keep the reference
* to it.
* memmap'd buffers can not be freed.
*/
if (memmap_area) {
tr->flags |= TRACE_ARRAY_FL_MEMMAP;
tr->ref++;
}
if (start) {
tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT;
tr->range_name = no_free_ptr(rname);

View File

@ -447,6 +447,7 @@ enum {
TRACE_ARRAY_FL_BOOT = BIT(1),
TRACE_ARRAY_FL_LAST_BOOT = BIT(2),
TRACE_ARRAY_FL_MOD_INIT = BIT(3),
TRACE_ARRAY_FL_MEMMAP = BIT(4),
};
#ifdef CONFIG_MODULES