author    Stephen Rothwell <sfr@canb.auug.org.au>    2020-12-16 18:03:12 +1100
committer Stephen Rothwell <sfr@canb.auug.org.au>    2020-12-16 18:03:13 +1100
commit    6c2d4a068412ac302cfe69845c3c8b2bad5d3aec (patch)
tree      2f6bde4198c3649b60d70b629926436bf3bd2797
parent    e913769e3d45ffa0f33d99197ba307c1ef15bd7d (diff)
parent    e336550f6bf3a539a1536dca36cadcc130d16644 (diff)
Merge branch 'akpm/master'
-rw-r--r--  Documentation/dev-tools/kasan.rst | 262
-rw-r--r--  arch/Kconfig | 8
-rw-r--r--  arch/alpha/kernel/syscalls/syscall.tbl | 1
-rw-r--r--  arch/arm/boot/compressed/string.c | 6
-rw-r--r--  arch/arm/tools/syscall.tbl | 1
-rw-r--r--  arch/arm64/Kconfig | 9
-rw-r--r--  arch/arm64/Makefile | 7
-rw-r--r--  arch/arm64/include/asm/Kbuild | 1
-rw-r--r--  arch/arm64/include/asm/assembler.h | 2
-rw-r--r--  arch/arm64/include/asm/cache.h | 3
-rw-r--r--  arch/arm64/include/asm/cacheflush.h | 6
-rw-r--r--  arch/arm64/include/asm/esr.h | 1
-rw-r--r--  arch/arm64/include/asm/kasan.h | 5
-rw-r--r--  arch/arm64/include/asm/kfence.h | 2
-rw-r--r--  arch/arm64/include/asm/memory.h | 15
-rw-r--r--  arch/arm64/include/asm/mte-def.h | 14
-rw-r--r--  arch/arm64/include/asm/mte-kasan.h | 67
-rw-r--r--  arch/arm64/include/asm/mte.h | 22
-rw-r--r--  arch/arm64/include/asm/processor.h | 2
-rw-r--r--  arch/arm64/include/asm/set_memory.h | 17
-rw-r--r--  arch/arm64/include/asm/string.h | 5
-rw-r--r--  arch/arm64/include/asm/uaccess.h | 23
-rw-r--r--  arch/arm64/include/asm/unistd.h | 2
-rw-r--r--  arch/arm64/include/asm/unistd32.h | 2
-rw-r--r--  arch/arm64/include/uapi/asm/unistd.h | 1
-rw-r--r--  arch/arm64/kernel/asm-offsets.c | 3
-rw-r--r--  arch/arm64/kernel/cpufeature.c | 3
-rw-r--r--  arch/arm64/kernel/entry.S | 41
-rw-r--r--  arch/arm64/kernel/head.S | 2
-rw-r--r--  arch/arm64/kernel/hibernate.c | 5
-rw-r--r--  arch/arm64/kernel/image-vars.h | 2
-rw-r--r--  arch/arm64/kernel/kaslr.c | 3
-rw-r--r--  arch/arm64/kernel/machine_kexec.c | 1
-rw-r--r--  arch/arm64/kernel/module.c | 6
-rw-r--r--  arch/arm64/kernel/mte.c | 118
-rw-r--r--  arch/arm64/kernel/setup.c | 2
-rw-r--r--  arch/arm64/kernel/sleep.S | 2
-rw-r--r--  arch/arm64/kernel/smp.c | 2
-rw-r--r--  arch/arm64/lib/mte.S | 16
-rw-r--r--  arch/arm64/mm/copypage.c | 9
-rw-r--r--  arch/arm64/mm/fault.c | 59
-rw-r--r--  arch/arm64/mm/kasan_init.c | 19
-rw-r--r--  arch/arm64/mm/mmu.c | 6
-rw-r--r--  arch/arm64/mm/mteswap.c | 9
-rw-r--r--  arch/arm64/mm/pageattr.c | 23
-rw-r--r--  arch/arm64/mm/proc.S | 23
-rw-r--r--  arch/arm64/mm/ptdump.c | 6
-rw-r--r--  arch/ia64/kernel/syscalls/syscall.tbl | 1
-rw-r--r--  arch/m68k/kernel/syscalls/syscall.tbl | 1
-rw-r--r--  arch/microblaze/kernel/syscalls/syscall.tbl | 1
-rw-r--r--  arch/mips/kernel/syscalls/syscall_n32.tbl | 1
-rw-r--r--  arch/mips/kernel/syscalls/syscall_n64.tbl | 1
-rw-r--r--  arch/mips/kernel/syscalls/syscall_o32.tbl | 1
-rw-r--r--  arch/parisc/kernel/syscalls/syscall.tbl | 1
-rw-r--r--  arch/powerpc/kernel/syscalls/syscall.tbl | 1
-rw-r--r--  arch/riscv/include/asm/set_memory.h | 4
-rw-r--r--  arch/riscv/include/asm/unistd.h | 1
-rw-r--r--  arch/riscv/mm/pageattr.c | 8
-rw-r--r--  arch/s390/boot/string.c | 1
-rw-r--r--  arch/s390/kernel/syscalls/syscall.tbl | 1
-rw-r--r--  arch/s390/pci/pci_mmio.c | 4
-rw-r--r--  arch/sh/kernel/syscalls/syscall.tbl | 1
-rw-r--r--  arch/sparc/kernel/syscalls/syscall.tbl | 1
-rw-r--r--  arch/x86/Kconfig | 2
-rw-r--r--  arch/x86/boot/compressed/misc.h | 1
-rw-r--r--  arch/x86/boot/compressed/string.c | 6
-rw-r--r--  arch/x86/entry/syscalls/syscall_32.tbl | 2
-rw-r--r--  arch/x86/entry/syscalls/syscall_64.tbl | 2
-rw-r--r--  arch/x86/include/asm/set_memory.h | 4
-rw-r--r--  arch/x86/include/asm/syscall_wrapper.h | 2
-rw-r--r--  arch/x86/kernel/acpi/wakeup_64.S | 2
-rw-r--r--  arch/x86/kvm/x86.c | 2
-rw-r--r--  arch/x86/mm/pat/set_memory.c | 8
-rw-r--r--  arch/xtensa/kernel/syscalls/syscall.tbl | 1
-rw-r--r--  drivers/firmware/efi/runtime-wrappers.c | 2
-rw-r--r--  fs/dax.c | 20
-rw-r--r--  fs/eventpoll.c | 287
-rw-r--r--  fs/exec.c | 8
-rw-r--r--  include/linux/compat.h | 6
-rw-r--r--  include/linux/compiler_attributes.h | 2
-rw-r--r--  include/linux/kasan-checks.h | 2
-rw-r--r--  include/linux/kasan.h | 359
-rw-r--r--  include/linux/memcontrol.h | 156
-rw-r--r--  include/linux/mm.h | 30
-rw-r--r--  include/linux/mm_types.h | 10
-rw-r--r--  include/linux/mmap_lock.h | 16
-rw-r--r--  include/linux/mmdebug.h | 13
-rw-r--r--  include/linux/moduleloader.h | 3
-rw-r--r--  include/linux/page-flags-layout.h | 2
-rw-r--r--  include/linux/pgtable.h | 3
-rw-r--r--  include/linux/sched.h | 2
-rw-r--r--  include/linux/secretmem.h | 30
-rw-r--r--  include/linux/set_memory.h | 16
-rw-r--r--  include/linux/string.h | 2
-rw-r--r--  include/linux/syscalls.h | 6
-rw-r--r--  include/uapi/asm-generic/unistd.h | 8
-rw-r--r--  include/uapi/linux/magic.h | 1
-rw-r--r--  init/init_task.c | 2
-rw-r--r--  kernel/fork.c | 7
-rw-r--r--  kernel/kcsan/core.c | 10
-rw-r--r--  kernel/power/hibernate.c | 5
-rw-r--r--  kernel/power/snapshot.c | 4
-rw-r--r--  kernel/sched/core.c | 8
-rw-r--r--  kernel/sys_ni.c | 4
-rw-r--r--  lib/Kconfig.kasan | 65
-rw-r--r--  lib/crc32.c | 4
-rw-r--r--  lib/crypto/aes.c | 4
-rw-r--r--  lib/test_kasan.c | 2
-rw-r--r--  lib/test_kasan_module.c | 2
-rw-r--r--  mm/Kconfig | 5
-rw-r--r--  mm/Makefile | 1
-rw-r--r--  mm/filemap.c | 5
-rw-r--r--  mm/gup.c | 12
-rw-r--r--  mm/huge_memory.c | 2
-rw-r--r--  mm/internal.h | 3
-rw-r--r--  mm/kasan/Makefile | 25
-rw-r--r--  mm/kasan/common.c | 834
-rw-r--r--  mm/kasan/generic.c | 80
-rw-r--r--  mm/kasan/generic_report.c | 165
-rw-r--r--  mm/kasan/hw_tags.c | 204
-rw-r--r--  mm/kasan/init.c | 17
-rw-r--r--  mm/kasan/kasan.h | 182
-rw-r--r--  mm/kasan/quarantine.c | 31
-rw-r--r--  mm/kasan/report.c | 317
-rw-r--r--  mm/kasan/report_generic.c | 327
-rw-r--r--  mm/kasan/report_hw_tags.c | 42
-rw-r--r--  mm/kasan/report_sw_tags.c (renamed from mm/kasan/tags_report.c) | 29
-rw-r--r--  mm/kasan/shadow.c | 517
-rw-r--r--  mm/kasan/sw_tags.c (renamed from mm/kasan/tags.c) | 39
-rw-r--r--  mm/khugepaged.c | 2
-rw-r--r--  mm/memblock.c | 2
-rw-r--r--  mm/memcontrol.c | 74
-rw-r--r--  mm/memory.c | 36
-rw-r--r--  mm/mempool.c | 4
-rw-r--r--  mm/migrate.c | 2
-rw-r--r--  mm/mmap.c | 5
-rw-r--r--  mm/page_alloc.c | 9
-rw-r--r--  mm/page_ext.c | 2
-rw-r--r--  mm/page_poison.c | 2
-rw-r--r--  mm/ptdump.c | 13
-rw-r--r--  mm/secretmem.c | 439
-rw-r--r--  mm/slab.c | 2
-rw-r--r--  mm/slab.h | 14
-rw-r--r--  mm/slab_common.c | 5
-rw-r--r--  mm/slub.c | 34
-rw-r--r--  mm/vmalloc.c | 5
-rw-r--r--  scripts/Makefile.lib | 2
-rwxr-xr-x  scripts/checksyscalls.sh | 4
-rw-r--r--  security/apparmor/apparmorfs.c | 3
-rw-r--r--  tools/testing/selftests/arm64/mte/Makefile | 2
-rw-r--r--  tools/testing/selftests/arm64/mte/check_gcr_el1_cswitch.c | 155
-rw-r--r--  tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c | 72
-rw-r--r--  tools/testing/selftests/vm/.gitignore | 1
-rw-r--r--  tools/testing/selftests/vm/Makefile | 3
-rw-r--r--  tools/testing/selftests/vm/memfd_secret.c | 296
-rwxr-xr-x  tools/testing/selftests/vm/run_vmtests | 17
-rw-r--r--  virt/kvm/coalesced_mmio.c | 2
-rw-r--r--  virt/kvm/kvm_main.c | 2
158 files changed, 4144 insertions, 1908 deletions
diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index 748149bca8d8..0fc3fb1860c4 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -4,13 +4,16 @@ The Kernel Address Sanitizer (KASAN)
Overview
--------
-KernelAddressSANitizer (KASAN) is a dynamic memory error detector designed to
-find out-of-bound and use-after-free bugs. KASAN has two modes: generic KASAN
-(similar to userspace ASan) and software tag-based KASAN (similar to userspace
-HWASan).
+KernelAddressSANitizer (KASAN) is a dynamic memory safety error detector
+designed to find out-of-bound and use-after-free bugs. KASAN has three modes:
-KASAN uses compile-time instrumentation to insert validity checks before every
-memory access, and therefore requires a compiler version that supports that.
+1. generic KASAN (similar to userspace ASan),
+2. software tag-based KASAN (similar to userspace HWASan),
+3. hardware tag-based KASAN (based on hardware memory tagging).
+
+Software KASAN modes (1 and 2) use compile-time instrumentation to insert
+validity checks before every memory access, and therefore require a compiler
+version that supports that.
Generic KASAN is supported in both GCC and Clang. With GCC it requires version
8.3.0 or later. Any supported Clang version is compatible, but detection of
@@ -19,7 +22,7 @@ out-of-bounds accesses for global variables is only supported since Clang 11.
Tag-based KASAN is only supported in Clang.
Currently generic KASAN is supported for the x86_64, arm, arm64, xtensa, s390
-and riscv architectures, and tag-based KASAN is supported only for arm64.
+and riscv architectures, and tag-based KASAN modes are supported only for arm64.
Usage
-----
@@ -28,30 +31,22 @@ To enable KASAN configure kernel with::
CONFIG_KASAN = y
-and choose between CONFIG_KASAN_GENERIC (to enable generic KASAN) and
-CONFIG_KASAN_SW_TAGS (to enable software tag-based KASAN).
+and choose between CONFIG_KASAN_GENERIC (to enable generic KASAN),
+CONFIG_KASAN_SW_TAGS (to enable software tag-based KASAN), and
+CONFIG_KASAN_HW_TAGS (to enable hardware tag-based KASAN).
+
+For software modes, you also need to choose between CONFIG_KASAN_OUTLINE and
+CONFIG_KASAN_INLINE. Outline and inline are compiler instrumentation types.
+The former produces a smaller binary while the latter is 1.1 - 2 times faster.
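As a rough illustration (not actual compiler output), the difference between the
two instrumentation types for a 4-byte load under generic KASAN can be sketched
in C as follows, assuming the generic mode's 8-byte granules, its shadow mapping
(shadow = (addr >> 3) + KASAN_SHADOW_OFFSET) and its __asan_* callbacks::

    /* Outline: the compiler emits a call; the check lives in mm/kasan/generic.c. */
    static inline u32 load32_outline(u32 *p)
    {
        __asan_load4((unsigned long)p);
        return *p;
    }

    /* Inline: the compiler emits the shadow check at the access site. */
    static inline u32 load32_inline(u32 *p)
    {
        s8 shadow = *(s8 *)(((unsigned long)p >> 3) + KASAN_SHADOW_OFFSET);

        /* Non-zero shadow: granule is partially accessible or poisoned. */
        if (unlikely(shadow) &&
            unlikely((s8)(((unsigned long)p & 7) + 3) >= shadow))
            __asan_report_load4_noabort((unsigned long)p);
        return *p;
    }

The outline variant keeps the image smaller because every access shares one
out-of-line helper; the inline variant avoids the call at the cost of code size.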
-You also need to choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE.
-Outline and inline are compiler instrumentation types. The former produces
-smaller binary while the latter is 1.1 - 2 times faster.
+Both software KASAN modes work with both SLUB and SLAB memory allocators,
+while hardware tag-based KASAN currently only supports SLUB.
-Both KASAN modes work with both SLUB and SLAB memory allocators.
-For better bug detection and nicer reporting, enable CONFIG_STACKTRACE.
+For better error reports that include stack traces, enable CONFIG_STACKTRACE.
To augment reports with last allocation and freeing stack of the physical page,
it is recommended to enable also CONFIG_PAGE_OWNER and boot with page_owner=on.
-To disable instrumentation for specific files or directories, add a line
-similar to the following to the respective kernel Makefile:
-
-- For a single file (e.g. main.o)::
-
- KASAN_SANITIZE_main.o := n
-
-- For all files in one directory::
-
- KASAN_SANITIZE := n
-
Error reports
~~~~~~~~~~~~~
@@ -136,22 +131,75 @@ freed (in case of a use-after-free bug report). Next comes a description of
the accessed slab object and information about the accessed memory page.
In the last section the report shows memory state around the accessed address.
-Reading this part requires some understanding of how KASAN works.
-
-The state of each 8 aligned bytes of memory is encoded in one shadow byte.
-Those 8 bytes can be accessible, partially accessible, freed or be a redzone.
-We use the following encoding for each shadow byte: 0 means that all 8 bytes
-of the corresponding memory region are accessible; number N (1 <= N <= 7) means
-that the first N bytes are accessible, and other (8 - N) bytes are not;
-any negative value indicates that the entire 8-byte word is inaccessible.
-We use different negative values to distinguish between different kinds of
-inaccessible memory like redzones or freed memory (see mm/kasan/kasan.h).
+Internally KASAN tracks memory state separately for each memory granule, which
+is either 8 or 16 aligned bytes depending on KASAN mode. Each number in the
+memory state section of the report shows the state of one of the memory
+granules that surround the accessed address.
+
+For generic KASAN the size of each memory granule is 8 bytes. The state of each
+granule is encoded in one shadow byte. Those 8 bytes can be accessible,
+partially accessible, freed or be a part of a redzone. KASAN uses the following
+encoding for each shadow byte: 0 means that all 8 bytes of the corresponding
+memory region are accessible; number N (1 <= N <= 7) means that the first N
+bytes are accessible, and other (8 - N) bytes are not; any negative value
+indicates that the entire 8-byte word is inaccessible. KASAN uses different
+negative values to distinguish between different kinds of inaccessible memory
+like redzones or freed memory (see mm/kasan/kasan.h).
In the report above the arrows point to the shadow byte 03, which means that
the accessed address is partially accessible.
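For reference, the encoding above boils down to a check like the following
minimal sketch (the kernel's real checks live in mm/kasan/generic.c; 8-byte
granules assumed)::

    static bool granule_access_ok(unsigned long addr, size_t size, s8 shadow)
    {
        if (shadow == 0)        /* all 8 bytes accessible */
            return true;
        if (shadow < 0)         /* redzone, freed memory, ... */
            return false;
        /* First 'shadow' bytes accessible: the access must end within them. */
        return (addr & 7) + size <= (size_t)shadow;
    }

In the report above, shadow byte 03 means only the first 3 bytes of that granule
are accessible, so an access reaching past them is reported.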
For tag-based KASAN this last report section shows the memory tags around the
-accessed address (see Implementation details section).
+accessed address (see `Implementation details`_ section).
+
+Boot parameters
+~~~~~~~~~~~~~~~
+
+Hardware tag-based KASAN mode (see the section about the different modes below) is
+intended for use in production as a security mitigation. Therefore it supports
+boot parameters that allow one to disable KASAN completely or otherwise control
+particular KASAN features.
+
+The things that can be controlled are:
+
+1. Whether KASAN is enabled at all.
+2. Whether KASAN collects and saves alloc/free stacks.
+3. Whether KASAN panics on a detected bug or not.
+
+The ``kasan.mode`` boot parameter allows choosing one of three main modes:
+
+- ``kasan.mode=off`` - KASAN is disabled, no tag checks are performed
+- ``kasan.mode=prod`` - only essential production features are enabled
+- ``kasan.mode=full`` - all KASAN features are enabled
+
+The chosen mode provides default control values for the features mentioned
+above. However it's also possible to override the default values by providing:
+
+- ``kasan.stacktrace=off`` or ``=on`` - enable alloc/free stack collection
+ (default: ``on`` for ``mode=full``,
+ otherwise ``off``)
+- ``kasan.fault=report`` or ``=panic`` - only print KASAN report or also panic
+ (default: ``report``)
+
+If the ``kasan.mode`` parameter is not provided, it defaults to ``full`` when
+``CONFIG_DEBUG_KERNEL`` is enabled, and to ``prod`` otherwise.
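As a rough sketch of how such a boot parameter is typically wired up (the real
parsing lives in mm/kasan/hw_tags.c; the names below are only illustrative)::

    enum kasan_arg_mode {
        KASAN_ARG_MODE_DEFAULT,
        KASAN_ARG_MODE_OFF,
        KASAN_ARG_MODE_PROD,
        KASAN_ARG_MODE_FULL,
    };

    static enum kasan_arg_mode kasan_arg_mode __ro_after_init;

    /* kasan.mode=off/prod/full */
    static int __init early_kasan_mode(char *arg)
    {
        if (!arg)
            return -EINVAL;

        if (!strcmp(arg, "off"))
            kasan_arg_mode = KASAN_ARG_MODE_OFF;
        else if (!strcmp(arg, "prod"))
            kasan_arg_mode = KASAN_ARG_MODE_PROD;
        else if (!strcmp(arg, "full"))
            kasan_arg_mode = KASAN_ARG_MODE_FULL;
        else
            return -EINVAL;

        return 0;
    }
    early_param("kasan.mode", early_kasan_mode);

For example, booting with ``kasan.mode=prod kasan.fault=panic`` enables tag
checking without alloc/free stack collection and panics on the first report.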
+
+For developers
+~~~~~~~~~~~~~~
+
+Software KASAN modes use compiler instrumentation to insert validity checks.
+Such instrumentation might be incompatible with some parts of the kernel, and
+therefore needs to be disabled. To disable instrumentation for specific files
+or directories, add a line similar to the following to the respective kernel
+Makefile:
+
+- For a single file (e.g. main.o)::
+
+ KASAN_SANITIZE_main.o := n
+
+- For all files in one directory::
+
+ KASAN_SANITIZE := n
Implementation details
@@ -160,10 +208,10 @@ Implementation details
Generic KASAN
~~~~~~~~~~~~~
-From a high level, our approach to memory error detection is similar to that
-of kmemcheck: use shadow memory to record whether each byte of memory is safe
-to access, and use compile-time instrumentation to insert checks of shadow
-memory on each memory access.
+From a high level perspective, KASAN's approach to memory error detection is
+similar to that of kmemcheck: use shadow memory to record whether each byte of
+memory is safe to access, and use compile-time instrumentation to insert checks
+of shadow memory on each memory access.
Generic KASAN dedicates 1/8th of kernel memory to its shadow memory (e.g. 16TB
to cover 128TB on x86_64) and uses direct mapping with a scale and offset to
@@ -194,20 +242,30 @@ Generic KASAN also reports the last 2 call stacks to creation of work that
potentially has access to an object. Call stacks for the following are shown:
call_rcu() and workqueue queuing.
+Generic KASAN is the only mode that delays the reuse of freed objects via
+quarantine (see mm/kasan/quarantine.c for implementation).
+
Software tag-based KASAN
~~~~~~~~~~~~~~~~~~~~~~~~
-Tag-based KASAN uses the Top Byte Ignore (TBI) feature of modern arm64 CPUs to
-store a pointer tag in the top byte of kernel pointers. Like generic KASAN it
-uses shadow memory to store memory tags associated with each 16-byte memory
+Software tag-based KASAN requires software memory tagging support in the form
+of HWASan-like compiler instrumentation (see HWASan documentation for details).
+
+Software tag-based KASAN is currently only implemented for the arm64 architecture.
+
+Software tag-based KASAN uses the Top Byte Ignore (TBI) feature of arm64 CPUs
+to store a pointer tag in the top byte of kernel pointers. Like generic KASAN
+it uses shadow memory to store memory tags associated with each 16-byte memory
cell (therefore it dedicates 1/16th of the kernel memory for shadow memory).
-On each memory allocation tag-based KASAN generates a random tag, tags the
-allocated memory with this tag, and embeds this tag into the returned pointer.
+On each memory allocation software tag-based KASAN generates a random tag, tags
+the allocated memory with this tag, and embeds this tag into the returned
+pointer.
+
Software tag-based KASAN uses compile-time instrumentation to insert checks
before each memory access. These checks make sure that tag of the memory that
is being accessed is equal to tag of the pointer that is used to access this
-memory. In case of a tag mismatch tag-based KASAN prints a bug report.
+memory. In case of a tag mismatch software tag-based KASAN prints a bug report.
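Conceptually, each inserted check boils down to something like the following
sketch (not the mm/kasan/sw_tags.c implementation)::

    static bool sw_tags_access_ok(const void *ptr)
    {
        u8 ptr_tag = (u64)ptr >> 56;                    /* tag from the top byte (TBI) */
        u8 mem_tag = *(u8 *)kasan_mem_to_shadow(ptr);   /* one tag per 16-byte cell */

        /* 0xFF is the match-all tag, see below. */
        return ptr_tag == 0xFF || ptr_tag == mem_tag;
    }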
Software tag-based KASAN also has two instrumentation modes (outline, that
emits callbacks to check memory accesses; and inline, that performs the shadow
@@ -216,9 +274,36 @@ simply printed from the function that performs the access check. With inline
instrumentation a brk instruction is emitted by the compiler, and a dedicated
brk handler is used to print bug reports.
-A potential expansion of this mode is a hardware tag-based mode, which would
-use hardware memory tagging support instead of compiler instrumentation and
-manual shadow memory manipulation.
+Software tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through
+pointers with 0xFF pointer tag aren't checked). The value 0xFE is currently
+reserved to tag freed memory regions.
+
+Software tag-based KASAN currently only supports tagging of
+kmem_cache_alloc/kmalloc and page_alloc memory.
+
+Hardware tag-based KASAN
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Hardware tag-based KASAN is similar to the software mode in concept, but uses
+hardware memory tagging support instead of compiler instrumentation and
+shadow memory.
+
+Hardware tag-based KASAN is currently only implemented for the arm64 architecture
+and is based on both the arm64 Memory Tagging Extension (MTE), introduced in the
+ARMv8.5 Instruction Set Architecture, and Top Byte Ignore (TBI).
+
+Special arm64 instructions are used to assign memory tags for each allocation.
+The same tags are assigned to pointers to those allocations. On every memory
+access, the hardware makes sure that the tag of the memory being accessed is
+equal to the tag of the pointer used to access this memory. In case of a
+tag mismatch a fault is generated and a report is printed.
+
+Hardware tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through
+pointers with 0xFF pointer tag aren't checked). The value 0xFE is currently
+reserved to tag freed memory regions.
+
+Hardware tag-based KASAN currently only supports tagging of
+kmem_cache_alloc/kmalloc and page_alloc memory.
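A simplified sketch of how an allocation ends up tagged in this mode, expressed
with the arch_* hooks added to arch/arm64/include/asm/memory.h later in this
patch (not the actual mm/kasan/hw_tags.c code)::

    static void *hw_tags_tag_alloc(void *obj, size_t size)
    {
        u8 tag = arch_get_random_tag();         /* IRG: pick a random tag */

        /* STG loop over 16-byte MTE granules; returns the tagged pointer. */
        return arch_set_mem_tag_range(obj, round_up(size, MTE_GRANULE_SIZE), tag);
    }

The returned pointer carries the same tag in its top byte, so a later access
through a stale or out-of-bounds pointer trips the hardware check described
above.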
What memory accesses are sanitised by KASAN?
--------------------------------------------
@@ -265,17 +350,17 @@ Most mappings in vmalloc space are small, requiring less than a full
page of shadow space. Allocating a full shadow page per mapping would
therefore be wasteful. Furthermore, to ensure that different mappings
use different shadow pages, mappings would have to be aligned to
-``KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE``.
+``KASAN_GRANULE_SIZE * PAGE_SIZE``.
-Instead, we share backing space across multiple mappings. We allocate
+Instead, KASAN shares backing space across multiple mappings. It allocates
a backing page when a mapping in vmalloc space uses a particular page
of the shadow region. This page can be shared by other vmalloc
mappings later on.
-We hook in to the vmap infrastructure to lazily clean up unused shadow
+KASAN hooks into the vmap infrastructure to lazily clean up unused shadow
memory.
-To avoid the difficulties around swapping mappings around, we expect
+To avoid the difficulties around swapping mappings around, KASAN expects
that the part of the shadow region that covers the vmalloc space will
not be covered by the early shadow page, but will be left
unmapped. This will require changes in arch-specific code.
@@ -286,24 +371,31 @@ architectures that do not have a fixed module region.
CONFIG_KASAN_KUNIT_TEST & CONFIG_TEST_KASAN_MODULE
--------------------------------------------------
-``CONFIG_KASAN_KUNIT_TEST`` utilizes the KUnit Test Framework for testing.
-This means each test focuses on a small unit of functionality and
-there are a few ways these tests can be run.
+KASAN tests consist of two parts:
+
+1. Tests that are integrated with the KUnit Test Framework. Enabled with
+``CONFIG_KASAN_KUNIT_TEST``. These tests can be run and partially verified
+automatically in a few different ways; see the instructions below.
-Each test will print the KASAN report if an error is detected and then
-print the number of the test and the status of the test:
+2. Tests that are currently incompatible with KUnit. Enabled with
+``CONFIG_TEST_KASAN_MODULE`` and can only be run as a module. These tests can
+only be verified manually, by loading the kernel module and inspecting the
+kernel log for KASAN reports.
-pass::
+Each KUnit-compatible KASAN test prints a KASAN report if an error is detected.
+Then the test prints its number and status.
+
+When a test passes::
ok 28 - kmalloc_double_kzfree
-or, if kmalloc failed::
+When a test fails due to a failed ``kmalloc``::
# kmalloc_large_oob_right: ASSERTION FAILED at lib/test_kasan.c:163
Expected ptr is not null, but is
not ok 4 - kmalloc_large_oob_right
-or, if a KASAN report was expected, but not found::
+When a test fails due to a missing KASAN report::
# kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:629
Expected kasan_data->report_expected == kasan_data->report_found, but
@@ -311,46 +403,38 @@ or, if a KASAN report was expected, but not found::
kasan_data->report_found == 0
not ok 28 - kmalloc_double_kzfree
-All test statuses are tracked as they run and an overall status will
-be printed at the end::
+At the end the cumulative status of all KASAN tests is printed. On success::
ok 1 - kasan
-or::
+Or, if one of the tests failed::
not ok 1 - kasan
-(1) Loadable Module
-~~~~~~~~~~~~~~~~~~~~
+
+There are a few ways to run KUnit-compatible KASAN tests.
+
+1. Loadable module
+~~~~~~~~~~~~~~~~~~
With ``CONFIG_KUNIT`` enabled, ``CONFIG_KASAN_KUNIT_TEST`` can be built as
-a loadable module and run on any architecture that supports KASAN
-using something like insmod or modprobe. The module is called ``test_kasan``.
+a loadable module and run on any architecture that supports KASAN by loading
+the module with insmod or modprobe. The module is called ``test_kasan``.
-(2) Built-In
-~~~~~~~~~~~~~
+2. Built-In
+~~~~~~~~~~~
With ``CONFIG_KUNIT`` built-in, ``CONFIG_KASAN_KUNIT_TEST`` can be built-in
-on any architecture that supports KASAN. These and any other KUnit
-tests enabled will run and print the results at boot as a late-init
-call.
+on any architecture that supports KASAN. These and any other KUnit tests enabled
+will run and print the results at boot as a late-init call.
-(3) Using kunit_tool
-~~~~~~~~~~~~~~~~~~~~~
+3. Using kunit_tool
+~~~~~~~~~~~~~~~~~~~
-With ``CONFIG_KUNIT`` and ``CONFIG_KASAN_KUNIT_TEST`` built-in, we can also
-use kunit_tool to see the results of these along with other KUnit
-tests in a more readable way. This will not print the KASAN reports
-of tests that passed. Use `KUnit documentation <https://www.kernel.org/doc/html/latest/dev-tools/kunit/index.html>`_ for more up-to-date
-information on kunit_tool.
+With ``CONFIG_KUNIT`` and ``CONFIG_KASAN_KUNIT_TEST`` built-in, it's also
+possible to use ``kunit_tool`` to see the results of these and other KUnit tests
+in a more readable way. This will not print the KASAN reports of the tests that
+passed. Use `KUnit documentation <https://www.kernel.org/doc/html/latest/dev-tools/kunit/index.html>`_
+for more up-to-date information on ``kunit_tool``.
.. _KUnit: https://www.kernel.org/doc/html/latest/dev-tools/kunit/index.html
-
-``CONFIG_TEST_KASAN_MODULE`` is a set of KASAN tests that could not be
-converted to KUnit. These tests can be run only as a module with
-``CONFIG_TEST_KASAN_MODULE`` built as a loadable module and
-``CONFIG_KASAN`` built-in. The type of error expected and the
-function being run is printed before the expression expected to give
-an error. Then the error is printed, if found, and that test
-should be interpreted to pass only if the error was the one expected
-by the test.
diff --git a/arch/Kconfig b/arch/Kconfig
index d4bdc19ed3ad..6d9fe6e4af1a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -960,16 +960,16 @@ config VMAP_STACK
default y
bool "Use a virtually-mapped stack"
depends on HAVE_ARCH_VMAP_STACK
- depends on !KASAN || KASAN_VMALLOC
+ depends on !KASAN || KASAN_HW_TAGS || KASAN_VMALLOC
help
Enable this if you want the use of virtually-mapped kernel stacks
with guard pages. This causes kernel stack overflows to be
caught immediately rather than causing difficult-to-diagnose
corruption.
- To use this with KASAN, the architecture must support backing
- virtual mappings with real shadow memory, and KASAN_VMALLOC must
- be enabled.
+ To use this with software KASAN modes, the architecture must support
+ backing virtual mappings with real shadow memory, and KASAN_VMALLOC
+ must be enabled.
config ARCH_OPTIONAL_KERNEL_RWX
def_bool n
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index c5cc5bfa2062..506e59a9ff87 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -481,3 +481,4 @@
549 common faccessat2 sys_faccessat2
550 common process_madvise sys_process_madvise
551 common watch_mount sys_watch_mount
+553 common epoll_pwait2 sys_epoll_pwait2
diff --git a/arch/arm/boot/compressed/string.c b/arch/arm/boot/compressed/string.c
index 8c0fa276d994..cc6198f8a348 100644
--- a/arch/arm/boot/compressed/string.c
+++ b/arch/arm/boot/compressed/string.c
@@ -21,9 +21,9 @@
#undef memcpy
#undef memmove
#undef memset
-void *__memcpy(void *__dest, __const void *__src, size_t __n) __alias(memcpy);
-void *__memmove(void *__dest, __const void *__src, size_t count) __alias(memmove);
-void *__memset(void *s, int c, size_t count) __alias(memset);
+void *__memcpy(void *__dest, __const void *__src, size_t __n) __alias("memcpy");
+void *__memmove(void *__dest, __const void *__src, size_t count) __alias("memmove");
+void *__memset(void *s, int c, size_t count) __alias("memset");
#endif
void *memcpy(void *__dest, __const void *__src, size_t __n)
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 47325b3b661a..dbde88a855b6 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -455,3 +455,4 @@
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
441 common watch_mount sys_watch_mount
+443 common epoll_pwait2 sys_epoll_pwait2
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index cc1ad4fda6a1..33446269f692 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -137,6 +137,7 @@ config ARM64
select HAVE_ARCH_JUMP_LABEL_RELATIVE
select HAVE_ARCH_KASAN if !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
select HAVE_ARCH_KASAN_SW_TAGS if HAVE_ARCH_KASAN
+ select HAVE_ARCH_KASAN_HW_TAGS if (HAVE_ARCH_KASAN && ARM64_MTE)
select HAVE_ARCH_KFENCE
select HAVE_ARCH_KGDB
select HAVE_ARCH_MMAP_RND_BITS
@@ -335,7 +336,7 @@ config BROKEN_GAS_INST
config KASAN_SHADOW_OFFSET
hex
- depends on KASAN
+ depends on KASAN_GENERIC || KASAN_SW_TAGS
default 0xdfff800000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && !KASAN_SW_TAGS
default 0xdfffc00000000000 if ARM64_VA_BITS_47 && !KASAN_SW_TAGS
default 0xdffffe0000000000 if ARM64_VA_BITS_42 && !KASAN_SW_TAGS
@@ -1572,6 +1573,9 @@ endmenu
menu "ARMv8.5 architectural features"
+config AS_HAS_ARMV8_5
+ def_bool $(cc-option,-Wa$(comma)-march=armv8.5-a)
+
config ARM64_BTI
bool "Branch Target Identification support"
default y
@@ -1646,6 +1650,9 @@ config ARM64_MTE
bool "Memory Tagging Extension support"
default y
depends on ARM64_AS_HAS_MTE && ARM64_TAGGED_ADDR_ABI
+ depends on AS_HAS_ARMV8_5
+ # Required for tag checking in the uaccess routines
+ depends on ARM64_PAN
select ARCH_USES_HIGH_VMA_FLAGS
help
Memory Tagging (part of the ARMv8.5 Extensions) provides
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 6a87d592bd00..6be9b3750250 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -96,6 +96,11 @@ ifeq ($(CONFIG_AS_HAS_ARMV8_4), y)
asm-arch := armv8.4-a
endif
+ifeq ($(CONFIG_AS_HAS_ARMV8_5), y)
+# make sure to pass the newest target architecture to -march.
+asm-arch := armv8.5-a
+endif
+
ifdef asm-arch
KBUILD_CFLAGS += -Wa,-march=$(asm-arch) \
-DARM64_ASM_ARCH='"$(asm-arch)"'
@@ -132,7 +137,7 @@ head-y := arch/arm64/kernel/head.o
ifeq ($(CONFIG_KASAN_SW_TAGS), y)
KASAN_SHADOW_SCALE_SHIFT := 4
-else
+else ifeq ($(CONFIG_KASAN_GENERIC), y)
KASAN_SHADOW_SCALE_SHIFT := 3
endif
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index ff9cbb631212..4306136ef329 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -4,5 +4,4 @@ generic-y += local64.h
generic-y += mcs_spinlock.h
generic-y += qrwlock.h
generic-y += qspinlock.h
-generic-y += set_memory.h
generic-y += user.h
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index ddbe6bf00e33..bf125c591116 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -473,7 +473,7 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU
#define NOKPROBE(x)
#endif
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
#define EXPORT_SYMBOL_NOKASAN(name)
#else
#define EXPORT_SYMBOL_NOKASAN(name) EXPORT_SYMBOL(name)
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index 63d43b5f82f6..77cbbe3625f2 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -6,6 +6,7 @@
#define __ASM_CACHE_H
#include <asm/cputype.h>
+#include <asm/mte-kasan.h>
#define CTR_L1IP_SHIFT 14
#define CTR_L1IP_MASK 3
@@ -51,6 +52,8 @@
#ifdef CONFIG_KASAN_SW_TAGS
#define ARCH_SLAB_MINALIGN (1ULL << KASAN_SHADOW_SCALE_SHIFT)
+#elif defined(CONFIG_KASAN_HW_TAGS)
+#define ARCH_SLAB_MINALIGN MTE_GRANULE_SIZE
#endif
#ifndef __ASSEMBLY__
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index 45217f21f1fe..b1bdf83a73db 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -136,12 +136,6 @@ static __always_inline void __flush_icache_all(void)
dsb(ish);
}
-int set_memory_valid(unsigned long addr, int numpages, int enable);
-
-int set_direct_map_invalid_noflush(struct page *page);
-int set_direct_map_default_noflush(struct page *page);
-bool kernel_page_present(struct page *page);
-
#include <asm-generic/cacheflush.h>
#endif /* __ASM_CACHEFLUSH_H */
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 85a3e49f92f4..29f97eb3dad4 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -106,6 +106,7 @@
#define ESR_ELx_FSC_TYPE (0x3C)
#define ESR_ELx_FSC_LEVEL (0x03)
#define ESR_ELx_FSC_EXTABT (0x10)
+#define ESR_ELx_FSC_MTE (0x11)
#define ESR_ELx_FSC_SERROR (0x11)
#define ESR_ELx_FSC_ACCESS (0x08)
#define ESR_ELx_FSC_FAULT (0x04)
diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h
index b0dc4abc3589..0aaf9044cd6a 100644
--- a/arch/arm64/include/asm/kasan.h
+++ b/arch/arm64/include/asm/kasan.h
@@ -12,7 +12,9 @@
#define arch_kasan_reset_tag(addr) __tag_reset(addr)
#define arch_kasan_get_tag(addr) __tag_get(addr)
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
+
+void kasan_init(void);
/*
* KASAN_SHADOW_START: beginning of the kernel virtual addresses.
@@ -33,7 +35,6 @@
#define _KASAN_SHADOW_START(va) (KASAN_SHADOW_END - (1UL << ((va) - KASAN_SHADOW_SCALE_SHIFT)))
#define KASAN_SHADOW_START _KASAN_SHADOW_START(vabits_actual)
-void kasan_init(void);
void kasan_copy_shadow(pgd_t *pgdir);
asmlinkage void kasan_early_init(void);
diff --git a/arch/arm64/include/asm/kfence.h b/arch/arm64/include/asm/kfence.h
index 6c0afeeab635..c44bb368a810 100644
--- a/arch/arm64/include/asm/kfence.h
+++ b/arch/arm64/include/asm/kfence.h
@@ -3,7 +3,7 @@
#ifndef __ASM_KFENCE_H
#define __ASM_KFENCE_H
-#include <asm/cacheflush.h>
+#include <asm/set_memory.h>
static inline bool arch_kfence_init_pool(void) { return true; }
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 556cb2d62b5b..18fce223b67b 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -72,7 +72,7 @@
* address space for the shadow region respectively. They can bloat the stack
* significantly, so double the (minimum) stack size when they are in use.
*/
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
#define KASAN_SHADOW_END ((UL(1) << (64 - KASAN_SHADOW_SCALE_SHIFT)) \
+ KASAN_SHADOW_OFFSET)
@@ -214,7 +214,7 @@ static inline unsigned long kaslr_offset(void)
(__force __typeof__(addr))__addr; \
})
-#ifdef CONFIG_KASAN_SW_TAGS
+#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
#define __tag_shifted(tag) ((u64)(tag) << 56)
#define __tag_reset(addr) __untagged_addr(addr)
#define __tag_get(addr) (__u8)((u64)(addr) >> 56)
@@ -222,7 +222,7 @@ static inline unsigned long kaslr_offset(void)
#define __tag_shifted(tag) 0UL
#define __tag_reset(addr) (addr)
#define __tag_get(addr) 0
-#endif /* CONFIG_KASAN_SW_TAGS */
+#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
static inline const void *__tag_set(const void *addr, u8 tag)
{
@@ -230,6 +230,15 @@ static inline const void *__tag_set(const void *addr, u8 tag)
return (const void *)(__addr | __tag_shifted(tag));
}
+#ifdef CONFIG_KASAN_HW_TAGS
+#define arch_enable_tagging() mte_enable_kernel()
+#define arch_init_tags(max_tag) mte_init_tags(max_tag)
+#define arch_get_random_tag() mte_get_random_tag()
+#define arch_get_mem_tag(addr) mte_get_mem_tag(addr)
+#define arch_set_mem_tag_range(addr, size, tag) \
+ mte_set_mem_tag_range((addr), (size), (tag))
+#endif /* CONFIG_KASAN_HW_TAGS */
+
/*
* Physical vs virtual RAM address space conversion. These are
* private definitions which should NOT be used outside memory.h
diff --git a/arch/arm64/include/asm/mte-def.h b/arch/arm64/include/asm/mte-def.h
new file mode 100644
index 000000000000..2d73a1612f09
--- /dev/null
+++ b/arch/arm64/include/asm/mte-def.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 ARM Ltd.
+ */
+#ifndef __ASM_MTE_DEF_H
+#define __ASM_MTE_DEF_H
+
+#define MTE_GRANULE_SIZE UL(16)
+#define MTE_GRANULE_MASK (~(MTE_GRANULE_SIZE - 1))
+#define MTE_TAG_SHIFT 56
+#define MTE_TAG_SIZE 4
+#define MTE_TAG_MASK GENMASK((MTE_TAG_SHIFT + (MTE_TAG_SIZE - 1)), MTE_TAG_SHIFT)
+
+#endif /* __ASM_MTE_DEF_H */
diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h
new file mode 100644
index 000000000000..26349a4b5e2e
--- /dev/null
+++ b/arch/arm64/include/asm/mte-kasan.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 ARM Ltd.
+ */
+#ifndef __ASM_MTE_KASAN_H
+#define __ASM_MTE_KASAN_H
+
+#include <asm/mte-def.h>
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+/*
+ * The functions below are meant to be used only for the
+ * KASAN_HW_TAGS interface defined in asm/memory.h.
+ */
+#ifdef CONFIG_ARM64_MTE
+
+static inline u8 mte_get_ptr_tag(void *ptr)
+{
+ /* Note: The format of KASAN tags is 0xF<x> */
+ u8 tag = 0xF0 | (u8)(((u64)(ptr)) >> MTE_TAG_SHIFT);
+
+ return tag;
+}
+
+u8 mte_get_mem_tag(void *addr);
+u8 mte_get_random_tag(void);
+void *mte_set_mem_tag_range(void *addr, size_t size, u8 tag);
+
+void mte_enable_kernel(void);
+void mte_init_tags(u64 max_tag);
+
+#else /* CONFIG_ARM64_MTE */
+
+static inline u8 mte_get_ptr_tag(void *ptr)
+{
+ return 0xFF;
+}
+
+static inline u8 mte_get_mem_tag(void *addr)
+{
+ return 0xFF;
+}
+static inline u8 mte_get_random_tag(void)
+{
+ return 0xFF;
+}
+static inline void *mte_set_mem_tag_range(void *addr, size_t size, u8 tag)
+{
+ return addr;
+}
+
+static inline void mte_enable_kernel(void)
+{
+}
+
+static inline void mte_init_tags(u64 max_tag)
+{
+}
+
+#endif /* CONFIG_ARM64_MTE */
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_MTE_KASAN_H */
diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index 1c99fcadb58c..d02aff9f493d 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -5,17 +5,21 @@
#ifndef __ASM_MTE_H
#define __ASM_MTE_H
-#define MTE_GRANULE_SIZE UL(16)
-#define MTE_GRANULE_MASK (~(MTE_GRANULE_SIZE - 1))
-#define MTE_TAG_SHIFT 56
-#define MTE_TAG_SIZE 4
+#include <asm/compiler.h>
+#include <asm/mte-def.h>
+
+#define __MTE_PREAMBLE ARM64_ASM_PREAMBLE ".arch_extension memtag\n"
#ifndef __ASSEMBLY__
+#include <linux/bitfield.h>
#include <linux/page-flags.h>
+#include <linux/types.h>
#include <asm/pgtable-types.h>
+extern u64 gcr_kernel_excl;
+
void mte_clear_page_tags(void *addr);
unsigned long mte_copy_tags_from_user(void *to, const void __user *from,
unsigned long n);
@@ -45,7 +49,9 @@ long get_mte_ctrl(struct task_struct *task);
int mte_ptrace_copy_tags(struct task_struct *child, long request,
unsigned long addr, unsigned long data);
-#else
+void mte_assign_mem_tag_range(void *addr, size_t size);
+
+#else /* CONFIG_ARM64_MTE */
/* unused if !CONFIG_ARM64_MTE, silence the compiler */
#define PG_mte_tagged 0
@@ -80,7 +86,11 @@ static inline int mte_ptrace_copy_tags(struct task_struct *child,
return -EIO;
}
-#endif
+static inline void mte_assign_mem_tag_range(void *addr, size_t size)
+{
+}
+
+#endif /* CONFIG_ARM64_MTE */
#endif /* __ASSEMBLY__ */
#endif /* __ASM_MTE_H */
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 724249f37af5..ca2cd75d3286 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -152,7 +152,7 @@ struct thread_struct {
#endif
#ifdef CONFIG_ARM64_MTE
u64 sctlr_tcf0;
- u64 gcr_user_incl;
+ u64 gcr_user_excl;
#endif
};
diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h
new file mode 100644
index 000000000000..ecb6b0f449ab
--- /dev/null
+++ b/arch/arm64/include/asm/set_memory.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _ASM_ARM64_SET_MEMORY_H
+#define _ASM_ARM64_SET_MEMORY_H
+
+#include <asm-generic/set_memory.h>
+
+bool can_set_direct_map(void);
+#define can_set_direct_map can_set_direct_map
+
+int set_memory_valid(unsigned long addr, int numpages, int enable);
+
+int set_direct_map_invalid_noflush(struct page *page, int numpages);
+int set_direct_map_default_noflush(struct page *page, int numpages);
+bool kernel_page_present(struct page *page);
+
+#endif /* _ASM_ARM64_SET_MEMORY_H */
diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
index b31e8e87a0db..3a3264ff47b9 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -5,7 +5,7 @@
#ifndef __ASM_STRING_H
#define __ASM_STRING_H
-#ifndef CONFIG_KASAN
+#if !(defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
#define __HAVE_ARCH_STRRCHR
extern char *strrchr(const char *, int c);
@@ -48,7 +48,8 @@ extern void *__memset(void *, int, __kernel_size_t);
void memcpy_flushcache(void *dst, const void *src, size_t cnt);
#endif
-#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
+#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \
+ !defined(__SANITIZE_ADDRESS__)
/*
* For files that are not instrumented (e.g. mm/slub.c) we
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index abb31aa1f8ca..6f986e09a781 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -159,8 +159,28 @@ static inline void __uaccess_enable_hw_pan(void)
CONFIG_ARM64_PAN));
}
+/*
+ * The Tag Check Flag (TCF) mode for MTE is per EL, hence TCF0
+ * affects EL0 and TCF affects EL1 irrespective of which TTBR is
+ * used.
+ * The kernel accesses TTBR0 usually with LDTR/STTR instructions
+ * when UAO is available, so these would act as EL0 accesses using
+ * TCF0.
+ * However futex.h code uses exclusives which would be executed as
+ * EL1; this can potentially cause a tag check fault even if the
+ * user disables TCF0.
+ *
+ * To address the problem we set the PSTATE.TCO bit in uaccess_enable()
+ * and reset it in uaccess_disable().
+ *
+ * The Tag Check Override (TCO) bit temporarily disables tag checking,
+ * preventing the issue.
+ */
static inline void uaccess_disable_privileged(void)
{
+ asm volatile(ALTERNATIVE("nop", SET_PSTATE_TCO(0),
+ ARM64_MTE, CONFIG_KASAN_HW_TAGS));
+
if (uaccess_ttbr0_disable())
return;
@@ -169,6 +189,9 @@ static inline void uaccess_disable_privileged(void)
static inline void uaccess_enable_privileged(void)
{
+ asm volatile(ALTERNATIVE("nop", SET_PSTATE_TCO(1),
+ ARM64_MTE, CONFIG_KASAN_HW_TAGS));
+
if (uaccess_ttbr0_enable())
return;
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 86a9d7b3eabe..949788f5ba40 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
-#define __NR_compat_syscalls 442
+#define __NR_compat_syscalls 443
#endif
#define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 6c1dcca067e0..091d91028982 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -891,6 +891,8 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2)
__SYSCALL(__NR_process_madvise, sys_process_madvise)
#define __NR_watch_mount 441
__SYSCALL(__NR_watch_mount, sys_watch_mount)
+#define __NR_epoll_pwait2 442
+__SYSCALL(__NR_epoll_pwait2, sys_epoll_pwait2)
/*
* Please add new compat syscalls above this comment and update
diff --git a/arch/arm64/include/uapi/asm/unistd.h b/arch/arm64/include/uapi/asm/unistd.h
index f83a70e07df8..ce2ee8f1e361 100644
--- a/arch/arm64/include/uapi/asm/unistd.h
+++ b/arch/arm64/include/uapi/asm/unistd.h
@@ -20,5 +20,6 @@
#define __ARCH_WANT_SET_GET_RLIMIT
#define __ARCH_WANT_TIME32_SYSCALLS
#define __ARCH_WANT_SYS_CLONE3
+#define __ARCH_WANT_MEMFD_SECRET
#include <asm-generic/unistd.h>
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 5e82488f1b82..f42fd9e33981 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -47,6 +47,9 @@ int main(void)
DEFINE(THREAD_KEYS_USER, offsetof(struct task_struct, thread.keys_user));
DEFINE(THREAD_KEYS_KERNEL, offsetof(struct task_struct, thread.keys_kernel));
#endif
+#ifdef CONFIG_ARM64_MTE
+ DEFINE(THREAD_GCR_EL1_USER, offsetof(struct task_struct, thread.gcr_user_excl));
+#endif
BLANK();
DEFINE(S_X0, offsetof(struct pt_regs, regs[0]));
DEFINE(S_X2, offsetof(struct pt_regs, regs[2]));
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index d87cfc6246e0..7ffb5f1d8b68 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -70,6 +70,7 @@
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/cpu.h>
+#include <linux/kasan.h>
#include <asm/cpu.h>
#include <asm/cpufeature.h>
#include <asm/cpu_ops.h>
@@ -1710,6 +1711,8 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap)
cleared_zero_page = true;
mte_clear_page_tags(lm_alias(empty_zero_page));
}
+
+ kasan_init_hw_tags_cpu();
}
#endif /* CONFIG_ARM64_MTE */
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 51c762156099..2a93fa5f4e49 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -173,6 +173,43 @@ alternative_else_nop_endif
#endif
.endm
+ .macro mte_set_gcr, tmp, tmp2
+#ifdef CONFIG_ARM64_MTE
+ /*
+ * Calculate and set the exclude mask preserving
+ * the RRND (bit[16]) setting.
+ */
+ mrs_s \tmp2, SYS_GCR_EL1
+ bfi \tmp2, \tmp, #0, #16
+ msr_s SYS_GCR_EL1, \tmp2
+ isb
+#endif
+ .endm
+
+ .macro mte_set_kernel_gcr, tmp, tmp2
+#ifdef CONFIG_KASAN_HW_TAGS
+alternative_if_not ARM64_MTE
+ b 1f
+alternative_else_nop_endif
+ ldr_l \tmp, gcr_kernel_excl
+
+ mte_set_gcr \tmp, \tmp2
+1:
+#endif
+ .endm
+
+ .macro mte_set_user_gcr, tsk, tmp, tmp2
+#ifdef CONFIG_ARM64_MTE
+alternative_if_not ARM64_MTE
+ b 1f
+alternative_else_nop_endif
+ ldr \tmp, [\tsk, #THREAD_GCR_EL1_USER]
+
+ mte_set_gcr \tmp, \tmp2
+1:
+#endif
+ .endm
+
.macro kernel_entry, el, regsize = 64
.if \regsize == 32
mov w0, w0 // zero upper 32 bits of x0
@@ -212,6 +249,8 @@ alternative_else_nop_endif
ptrauth_keys_install_kernel tsk, x20, x22, x23
+ mte_set_kernel_gcr x22, x23
+
scs_load tsk, x20
.else
add x21, sp, #S_FRAME_SIZE
@@ -315,6 +354,8 @@ alternative_else_nop_endif
/* No kernel C function calls after this as user keys are set. */
ptrauth_keys_install_user tsk, x0, x1, x2
+ mte_set_user_gcr tsk, x0, x1
+
apply_ssbd 0, x0, x1
.endif
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 42b23ce679dc..a0dc987724ed 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -433,7 +433,7 @@ SYM_FUNC_START_LOCAL(__primary_switched)
bl __pi_memset
dsb ishst // Make zero page visible to PTW
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
bl kasan_early_init
#endif
#ifdef CONFIG_RANDOMIZE_BASE
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index 42003774d261..9c9f47e9f7f4 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -371,6 +371,11 @@ static void swsusp_mte_restore_tags(void)
unsigned long pfn = xa_state.xa_index;
struct page *page = pfn_to_online_page(pfn);
+ /*
+ * It is not required to invoke page_kasan_tag_reset(page)
+ * at this point since the tags stored in page->flags are
+ * already restored.
+ */
mte_restore_page_tags(page_address(page), tags);
mte_free_tag_storage(tags);
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 39289d75118d..f676243abac6 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -37,7 +37,7 @@ __efistub_strncmp = __pi_strncmp;
__efistub_strrchr = __pi_strrchr;
__efistub___clean_dcache_area_poc = __pi___clean_dcache_area_poc;
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
__efistub___memcpy = __pi_memcpy;
__efistub___memmove = __pi_memmove;
__efistub___memset = __pi_memset;
diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c
index 0921aa1520b0..1c74c45b9494 100644
--- a/arch/arm64/kernel/kaslr.c
+++ b/arch/arm64/kernel/kaslr.c
@@ -161,7 +161,8 @@ u64 __init kaslr_early_init(u64 dt_phys)
/* use the top 16 bits to randomize the linear region */
memstart_offset_seed = seed >> 48;
- if (IS_ENABLED(CONFIG_KASAN))
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
+ IS_ENABLED(CONFIG_KASAN_SW_TAGS))
/*
* KASAN does not expect the module region to intersect the
* vmalloc region, since shadow memory is allocated for each
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index a0b144cfaea7..0cbc50c4fa5a 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -11,6 +11,7 @@
#include <linux/kernel.h>
#include <linux/kexec.h>
#include <linux/page-flags.h>
+#include <linux/set_memory.h>
#include <linux/smp.h>
#include <asm/cacheflush.h>
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index 2a1ad95d9b2c..fe21e0f06492 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -30,7 +30,8 @@ void *module_alloc(unsigned long size)
if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
gfp_mask |= __GFP_NOWARN;
- if (IS_ENABLED(CONFIG_KASAN))
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
+ IS_ENABLED(CONFIG_KASAN_SW_TAGS))
/* don't exceed the static module region - see below */
module_alloc_end = MODULES_END;
@@ -39,7 +40,8 @@ void *module_alloc(unsigned long size)
NUMA_NO_NODE, __builtin_return_address(0));
if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
- !IS_ENABLED(CONFIG_KASAN))
+ !IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ !IS_ENABLED(CONFIG_KASAN_SW_TAGS))
/*
* KASAN can only deal with module allocations being served
* from the reserved module region, since the remainder of
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index ef15c8a2a49d..dc9ada64feed 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -13,13 +13,18 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/thread_info.h>
+#include <linux/types.h>
#include <linux/uio.h>
+#include <asm/barrier.h>
#include <asm/cpufeature.h>
#include <asm/mte.h>
+#include <asm/mte-kasan.h>
#include <asm/ptrace.h>
#include <asm/sysreg.h>
+u64 gcr_kernel_excl __ro_after_init;
+
static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool check_swap)
{
pte_t old_pte = READ_ONCE(*ptep);
@@ -31,6 +36,15 @@ static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool check_swap)
return;
}
+ page_kasan_tag_reset(page);
+ /*
+ * We need smp_wmb() in between setting the flags and clearing the
+ * tags because if another thread reads page->flags and builds a
+ * tagged address out of it, there is an actual dependency to the
+ * memory access, but on the current thread we do not guarantee that
+ * the new page->flags are visible before the tags were updated.
+ */
+ smp_wmb();
mte_clear_page_tags(page_address(page));
}
@@ -72,6 +86,78 @@ int memcmp_pages(struct page *page1, struct page *page2)
return ret;
}
+u8 mte_get_mem_tag(void *addr)
+{
+ if (!system_supports_mte())
+ return 0xFF;
+
+ asm(__MTE_PREAMBLE "ldg %0, [%0]"
+ : "+r" (addr));
+
+ return mte_get_ptr_tag(addr);
+}
+
+u8 mte_get_random_tag(void)
+{
+ void *addr;
+
+ if (!system_supports_mte())
+ return 0xFF;
+
+ asm(__MTE_PREAMBLE "irg %0, %0"
+ : "+r" (addr));
+
+ return mte_get_ptr_tag(addr);
+}
+
+void *mte_set_mem_tag_range(void *addr, size_t size, u8 tag)
+{
+ void *ptr = addr;
+
+ if ((!system_supports_mte()) || (size == 0))
+ return addr;
+
+ /* Make sure that size is MTE granule aligned. */
+ WARN_ON(size & (MTE_GRANULE_SIZE - 1));
+
+ /* Make sure that the address is MTE granule aligned. */
+ WARN_ON((u64)addr & (MTE_GRANULE_SIZE - 1));
+
+ tag = 0xF0 | tag;
+ ptr = (void *)__tag_set(ptr, tag);
+
+ mte_assign_mem_tag_range(ptr, size);
+
+ return ptr;
+}
+
+void mte_init_tags(u64 max_tag)
+{
+ static bool gcr_kernel_excl_initialized;
+
+ if (!gcr_kernel_excl_initialized) {
+ /*
+ * The format of the tags in KASAN is 0xFF and in MTE is 0xF.
+ * This conversion extracts an MTE tag from a KASAN tag.
+ */
+ u64 incl = GENMASK(FIELD_GET(MTE_TAG_MASK >> MTE_TAG_SHIFT,
+ max_tag), 0);
+
+ gcr_kernel_excl = ~incl & SYS_GCR_EL1_EXCL_MASK;
+ gcr_kernel_excl_initialized = true;
+ }
+
+ /* Enable the kernel exclude mask for random tags generation. */
+ write_sysreg_s(SYS_GCR_EL1_RRND | gcr_kernel_excl, SYS_GCR_EL1);
+}
+
+void mte_enable_kernel(void)
+{
+ /* Enable MTE Sync Mode for EL1. */
+ sysreg_clear_set(sctlr_el1, SCTLR_ELx_TCF_MASK, SCTLR_ELx_TCF_SYNC);
+ isb();
+}
+
static void update_sctlr_el1_tcf0(u64 tcf0)
{
/* ISB required for the kernel uaccess routines */
@@ -92,23 +178,26 @@ static void set_sctlr_el1_tcf0(u64 tcf0)
preempt_enable();
}
-static void update_gcr_el1_excl(u64 incl)
+static void update_gcr_el1_excl(u64 excl)
{
- u64 excl = ~incl & SYS_GCR_EL1_EXCL_MASK;
/*
- * Note that 'incl' is an include mask (controlled by the user via
- * prctl()) while GCR_EL1 accepts an exclude mask.
+ * Note that the mask controlled by the user via prctl() is an
+ * include while GCR_EL1 accepts an exclude mask.
* No need for ISB since this only affects EL0 currently, implicit
* with ERET.
*/
sysreg_clear_set_s(SYS_GCR_EL1, SYS_GCR_EL1_EXCL_MASK, excl);
}
-static void set_gcr_el1_excl(u64 incl)
+static void set_gcr_el1_excl(u64 excl)
{
- current->thread.gcr_user_incl = incl;
- update_gcr_el1_excl(incl);
+ current->thread.gcr_user_excl = excl;
+
+ /*
+ * SYS_GCR_EL1 will be set to current->thread.gcr_user_excl value
+ * by mte_set_user_gcr() in kernel_exit.
+ */
}
void flush_mte_state(void)
@@ -123,7 +212,7 @@ void flush_mte_state(void)
/* disable tag checking */
set_sctlr_el1_tcf0(SCTLR_EL1_TCF0_NONE);
/* reset tag generation mask */
- set_gcr_el1_excl(0);
+ set_gcr_el1_excl(SYS_GCR_EL1_EXCL_MASK);
}
void mte_thread_switch(struct task_struct *next)
@@ -134,7 +223,6 @@ void mte_thread_switch(struct task_struct *next)
/* avoid expensive SCTLR_EL1 accesses if no change */
if (current->thread.sctlr_tcf0 != next->thread.sctlr_tcf0)
update_sctlr_el1_tcf0(next->thread.sctlr_tcf0);
- update_gcr_el1_excl(next->thread.gcr_user_incl);
}
void mte_suspend_exit(void)
@@ -142,13 +230,14 @@ void mte_suspend_exit(void)
if (!system_supports_mte())
return;
- update_gcr_el1_excl(current->thread.gcr_user_incl);
+ update_gcr_el1_excl(gcr_kernel_excl);
}
long set_mte_ctrl(struct task_struct *task, unsigned long arg)
{
u64 tcf0;
- u64 gcr_incl = (arg & PR_MTE_TAG_MASK) >> PR_MTE_TAG_SHIFT;
+ u64 gcr_excl = ~((arg & PR_MTE_TAG_MASK) >> PR_MTE_TAG_SHIFT) &
+ SYS_GCR_EL1_EXCL_MASK;
if (!system_supports_mte())
return 0;
@@ -169,10 +258,10 @@ long set_mte_ctrl(struct task_struct *task, unsigned long arg)
if (task != current) {
task->thread.sctlr_tcf0 = tcf0;
- task->thread.gcr_user_incl = gcr_incl;
+ task->thread.gcr_user_excl = gcr_excl;
} else {
set_sctlr_el1_tcf0(tcf0);
- set_gcr_el1_excl(gcr_incl);
+ set_gcr_el1_excl(gcr_excl);
}
return 0;
@@ -181,11 +270,12 @@ long set_mte_ctrl(struct task_struct *task, unsigned long arg)
long get_mte_ctrl(struct task_struct *task)
{
unsigned long ret;
+ u64 incl = ~task->thread.gcr_user_excl & SYS_GCR_EL1_EXCL_MASK;
if (!system_supports_mte())
return 0;
- ret = task->thread.gcr_user_incl << PR_MTE_TAG_SHIFT;
+ ret = incl << PR_MTE_TAG_SHIFT;
switch (task->thread.sctlr_tcf0) {
case SCTLR_EL1_TCF0_NONE:
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index c44eb4b80163..c18aacde8bb0 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -358,7 +358,7 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p)
smp_build_mpidr_hash();
/* Init percpu seeds for random tags after cpus are set up. */
- kasan_init_tags();
+ kasan_init_sw_tags();
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
/*
diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S
index 4be7f7eed875..6bdef7362c0e 100644
--- a/arch/arm64/kernel/sleep.S
+++ b/arch/arm64/kernel/sleep.S
@@ -133,7 +133,7 @@ SYM_FUNC_START(_cpu_resume)
*/
bl cpu_do_resume
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK
mov x0, sp
bl kasan_unpoison_task_stack_below
#endif
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 2499b895efea..19b1705ae5cb 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -462,6 +462,8 @@ void __init smp_prepare_boot_cpu(void)
/* Conditionally switch to GIC PMR for interrupt masking */
if (system_uses_irq_prio_masking())
init_gic_priority_masking();
+
+ kasan_init_hw_tags();
}
static u64 __init of_get_cpu_mpidr(struct device_node *dn)
diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S
index 351537c12f36..9e1a12e10053 100644
--- a/arch/arm64/lib/mte.S
+++ b/arch/arm64/lib/mte.S
@@ -149,3 +149,19 @@ SYM_FUNC_START(mte_restore_page_tags)
ret
SYM_FUNC_END(mte_restore_page_tags)
+
+/*
+ * Assign allocation tags for a region of memory based on the pointer tag
+ * x0 - source pointer
+ * x1 - size
+ *
+ * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
+ * size must be non-zero and MTE_GRANULE_SIZE aligned.
+ */
+SYM_FUNC_START(mte_assign_mem_tag_range)
+1: stg x0, [x0]
+ add x0, x0, #MTE_GRANULE_SIZE
+ subs x1, x1, #MTE_GRANULE_SIZE
+ b.gt 1b
+ ret
+SYM_FUNC_END(mte_assign_mem_tag_range)
diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c
index 70a71f38b6a9..b5447e53cd73 100644
--- a/arch/arm64/mm/copypage.c
+++ b/arch/arm64/mm/copypage.c
@@ -23,6 +23,15 @@ void copy_highpage(struct page *to, struct page *from)
if (system_supports_mte() && test_bit(PG_mte_tagged, &from->flags)) {
set_bit(PG_mte_tagged, &to->flags);
+ page_kasan_tag_reset(to);
+ /*
+ * We need smp_wmb() in between setting the flags and clearing the
+ * tags because if another thread reads page->flags and builds a
+ * tagged address out of it, there is an actual dependency to the
+ * memory access, but on the current thread we do not guarantee that
+ * the new page->flags are visible before the tags were updated.
+ */
+ smp_wmb();
mte_copy_page_tags(kto, kfrom);
}
}
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 677ce434d3be..7ab718603f4b 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -15,6 +15,7 @@
#include <linux/mm.h>
#include <linux/hardirq.h>
#include <linux/init.h>
+#include <linux/kasan.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/page-flags.h>
@@ -34,6 +35,7 @@
#include <asm/debug-monitors.h>
#include <asm/esr.h>
#include <asm/kprobes.h>
+#include <asm/mte.h>
#include <asm/processor.h>
#include <asm/sysreg.h>
#include <asm/system_misc.h>
@@ -297,6 +299,57 @@ static void die_kernel_fault(const char *msg, unsigned long addr,
do_exit(SIGKILL);
}
+#ifdef CONFIG_KASAN_HW_TAGS
+static void report_tag_fault(unsigned long addr, unsigned int esr,
+ struct pt_regs *regs)
+{
+ bool is_write = ((esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT) != 0;
+
+ /*
+ * SAS bits aren't set for all faults reported in EL1, so we can't
+ * find out access size.
+ */
+ kasan_report(addr, 0, is_write, regs->pc);
+}
+#else
+/* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */
+static inline void report_tag_fault(unsigned long addr, unsigned int esr,
+ struct pt_regs *regs) { }
+#endif
+
+static void do_tag_recovery(unsigned long addr, unsigned int esr,
+ struct pt_regs *regs)
+{
+ static bool reported;
+
+ if (!READ_ONCE(reported)) {
+ report_tag_fault(addr, esr, regs);
+ WRITE_ONCE(reported, true);
+ }
+
+ /*
+ * Disable MTE Tag Checking on the local CPU for the current EL.
+ * It will be done lazily on the other CPUs when they hit a
+ * tag fault.
+ */
+ sysreg_clear_set(sctlr_el1, SCTLR_ELx_TCF_MASK, SCTLR_ELx_TCF_NONE);
+ isb();
+}
+
+static bool is_el1_mte_sync_tag_check_fault(unsigned int esr)
+{
+ unsigned int ec = ESR_ELx_EC(esr);
+ unsigned int fsc = esr & ESR_ELx_FSC;
+
+ if (ec != ESR_ELx_EC_DABT_CUR)
+ return false;
+
+ if (fsc == ESR_ELx_FSC_MTE)
+ return true;
+
+ return false;
+}
+
static void __do_kernel_fault(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
{
@@ -313,6 +366,12 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
"Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
return;
+ if (is_el1_mte_sync_tag_check_fault(esr)) {
+ do_tag_recovery(addr, esr, regs);
+
+ return;
+ }
+
if (is_el1_permission_fault(addr, esr, regs)) {
if (esr & ESR_ELx_WNR)
msg = "write to read-only memory";
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index b24e43d20667..d8e66c78440e 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -21,6 +21,8 @@
#include <asm/sections.h>
#include <asm/tlbflush.h>
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
+
static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE);
/*
@@ -208,7 +210,7 @@ static void __init clear_pgds(unsigned long start,
set_pgd(pgd_offset_k(start), __pgd(0));
}
-void __init kasan_init(void)
+static void __init kasan_init_shadow(void)
{
u64 kimg_shadow_start, kimg_shadow_end;
u64 mod_shadow_start, mod_shadow_end;
@@ -269,8 +271,21 @@ void __init kasan_init(void)
memset(kasan_early_shadow_page, KASAN_SHADOW_INIT, PAGE_SIZE);
cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
+}
- /* At this point kasan is fully initialized. Enable error messages */
+static void __init kasan_init_depth(void)
+{
init_task.kasan_depth = 0;
+}
+
+void __init kasan_init(void)
+{
+ kasan_init_shadow();
+ kasan_init_depth();
+#if defined(CONFIG_KASAN_GENERIC)
+ /* CONFIG_KASAN_SW_TAGS also requires kasan_init_sw_tags(). */
pr_info("KernelAddressSanitizer initialized\n");
+#endif
}
+
+#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 30c6dd02e706..79604049fff5 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -22,6 +22,7 @@
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
+#include <linux/set_memory.h>
#include <asm/barrier.h>
#include <asm/cputype.h>
@@ -492,7 +493,7 @@ static void __init map_mem(pgd_t *pgdp)
int flags = 0;
u64 i;
- if (rodata_full || crash_mem_map || debug_pagealloc_enabled())
+ if (can_set_direct_map() || crash_mem_map)
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
/*
@@ -1468,8 +1469,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
* KFENCE requires linear map to be mapped at page granularity, so that
* it is possible to protect/unprotect single pages in the KFENCE pool.
*/
- if (rodata_full || debug_pagealloc_enabled() ||
- IS_ENABLED(CONFIG_KFENCE))
+ if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c
index c52c1847079c..7c4ef56265ee 100644
--- a/arch/arm64/mm/mteswap.c
+++ b/arch/arm64/mm/mteswap.c
@@ -53,6 +53,15 @@ bool mte_restore_tags(swp_entry_t entry, struct page *page)
if (!tags)
return false;
+ page_kasan_tag_reset(page);
+ /*
+ * We need smp_wmb() in between setting the flags and clearing the
+ * tags because if another thread reads page->flags and builds a
+ * tagged address out of it, there is an actual dependency on the
+ * memory access, but on the current thread we do not guarantee that
+ * the new page->flags are visible before the tags were updated.
+ */
+ smp_wmb();
mte_restore_page_tags(page_address(page), tags);
return true;
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 92eccaf595c8..d505172265b0 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -19,6 +19,11 @@ struct page_change_data {
bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED);
+bool can_set_direct_map(void)
+{
+ return rodata_full || debug_pagealloc_enabled();
+}
+
static int change_page_range(pte_t *ptep, unsigned long addr, void *data)
{
struct page_change_data *cdata = data;
@@ -148,40 +153,42 @@ int set_memory_valid(unsigned long addr, int numpages, int enable)
__pgprot(PTE_VALID));
}
-int set_direct_map_invalid_noflush(struct page *page)
+int set_direct_map_invalid_noflush(struct page *page, int numpages)
{
struct page_change_data data = {
.set_mask = __pgprot(0),
.clear_mask = __pgprot(PTE_VALID),
};
+ unsigned long size = PAGE_SIZE * numpages;
- if (!debug_pagealloc_enabled() && !rodata_full)
+ if (!can_set_direct_map())
return 0;
return apply_to_page_range(&init_mm,
(unsigned long)page_address(page),
- PAGE_SIZE, change_page_range, &data);
+ size, change_page_range, &data);
}
-int set_direct_map_default_noflush(struct page *page)
+int set_direct_map_default_noflush(struct page *page, int numpages)
{
struct page_change_data data = {
.set_mask = __pgprot(PTE_VALID | PTE_WRITE),
.clear_mask = __pgprot(PTE_RDONLY),
};
+ unsigned long size = PAGE_SIZE * numpages;
- if (!debug_pagealloc_enabled() && !rodata_full)
+ if (!can_set_direct_map())
return 0;
return apply_to_page_range(&init_mm,
(unsigned long)page_address(page),
- PAGE_SIZE, change_page_range, &data);
+ size, change_page_range, &data);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
- if (!debug_pagealloc_enabled() && !rodata_full)
+ if (!can_set_direct_map())
return;
set_memory_valid((unsigned long)page_address(page), numpages, enable);
@@ -206,7 +213,7 @@ bool kernel_page_present(struct page *page)
pte_t *ptep;
unsigned long addr = (unsigned long)page_address(page);
- if (!debug_pagealloc_enabled() && !rodata_full)
+ if (!can_set_direct_map())
return true;
pgdp = pgd_offset_k(addr);
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index a0831bf8a018..37a54b57178a 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -40,9 +40,15 @@
#define TCR_CACHE_FLAGS TCR_IRGN_WBWA | TCR_ORGN_WBWA
#ifdef CONFIG_KASAN_SW_TAGS
-#define TCR_KASAN_FLAGS TCR_TBI1 | TCR_TBID1
+#define TCR_KASAN_SW_FLAGS TCR_TBI1 | TCR_TBID1
#else
-#define TCR_KASAN_FLAGS 0
+#define TCR_KASAN_SW_FLAGS 0
+#endif
+
+#ifdef CONFIG_KASAN_HW_TAGS
+#define TCR_KASAN_HW_FLAGS SYS_TCR_EL1_TCMA1 | TCR_TBI1
+#else
+#define TCR_KASAN_HW_FLAGS 0
#endif
/*
@@ -427,6 +433,10 @@ SYM_FUNC_START(__cpu_setup)
*/
mov_q x5, MAIR_EL1_SET
#ifdef CONFIG_ARM64_MTE
+ mte_tcr .req x20
+
+ mov mte_tcr, #0
+
/*
* Update MAIR_EL1, GCR_EL1 and TFSR*_EL1 if MTE is supported
* (ID_AA64PFR1_EL1[11:8] > 1).
@@ -447,6 +457,9 @@ SYM_FUNC_START(__cpu_setup)
/* clear any pending tag check faults in TFSR*_EL1 */
msr_s SYS_TFSR_EL1, xzr
msr_s SYS_TFSRE0_EL1, xzr
+
+ /* set the TCR_EL1 bits */
+ mov_q mte_tcr, TCR_KASAN_HW_FLAGS
1:
#endif
msr mair_el1, x5
@@ -456,7 +469,11 @@ SYM_FUNC_START(__cpu_setup)
*/
mov_q x10, TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
- TCR_TBI0 | TCR_A1 | TCR_KASAN_FLAGS
+ TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS
+#ifdef CONFIG_ARM64_MTE
+ orr x10, x10, mte_tcr
+ .unreq mte_tcr
+#endif
tcr_clear_errata_bits x10, x9, x5
#ifdef CONFIG_ARM64_VA_BITS_52
diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index 807dc634bbd2..04137a8f3d2d 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -29,7 +29,7 @@
enum address_markers_idx {
PAGE_OFFSET_NR = 0,
PAGE_END_NR,
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
KASAN_START_NR,
#endif
};
@@ -37,7 +37,7 @@ enum address_markers_idx {
static struct addr_marker address_markers[] = {
{ PAGE_OFFSET, "Linear Mapping start" },
{ 0 /* PAGE_END */, "Linear Mapping end" },
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
{ 0 /* KASAN_SHADOW_START */, "Kasan shadow start" },
{ KASAN_SHADOW_END, "Kasan shadow end" },
#endif
@@ -383,7 +383,7 @@ void ptdump_check_wx(void)
static int ptdump_init(void)
{
address_markers[PAGE_END_NR].start_address = PAGE_END;
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
address_markers[KASAN_START_NR].start_address = KASAN_SHADOW_START;
#endif
ptdump_initialize();
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 033244462350..c8809959636f 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -362,3 +362,4 @@
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
441 common watch_mount sys_watch_mount
+443 common epoll_pwait2 sys_epoll_pwait2
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index efd3ecb3cdfc..dde585616bf8 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -441,3 +441,4 @@
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
441 common watch_mount sys_watch_mount
+443 common epoll_pwait2 sys_epoll_pwait2
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 67ae5a5e4d21..4c09f27fedd0 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -447,3 +447,4 @@
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
441 common watch_mount sys_watch_mount
+443 common epoll_pwait2 sys_epoll_pwait2
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index c59bc6acc47a..00921244242d 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -380,3 +380,4 @@
439 n32 faccessat2 sys_faccessat2
440 n32 process_madvise sys_process_madvise
441 n32 watch_mount sys_watch_mount
+443 n32 epoll_pwait2 sys_epoll_pwait2
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 50bbb9517052..b7920f76358d 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -356,3 +356,4 @@
439 n64 faccessat2 sys_faccessat2
440 n64 process_madvise sys_process_madvise
441 n64 watch_mount sys_watch_mount
+443 n64 epoll_pwait2 sys_epoll_pwait2
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 1d5644e221b4..2ab5ed500113 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -429,3 +429,4 @@
439 o32 faccessat2 sys_faccessat2
440 o32 process_madvise sys_process_madvise
441 o32 watch_mount sys_watch_mount
+443 o32 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 4492cc2fce23..9caf7d969846 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -439,3 +439,4 @@
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
441 common watch_mount sys_watch_mount
+443 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index ee122bfc2ddc..dce635420226 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -531,3 +531,4 @@
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
441 common watch_mount sys_watch_mount
+443 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h
index 211eb8244a45..1aaf2720b8f6 100644
--- a/arch/riscv/include/asm/set_memory.h
+++ b/arch/riscv/include/asm/set_memory.h
@@ -26,8 +26,8 @@ static inline void protect_kernel_text_data(void) {};
static inline int set_memory_rw_nx(unsigned long addr, int numpages) { return 0; }
#endif
-int set_direct_map_invalid_noflush(struct page *page);
-int set_direct_map_default_noflush(struct page *page);
+int set_direct_map_invalid_noflush(struct page *page, int numpages);
+int set_direct_map_default_noflush(struct page *page, int numpages);
bool kernel_page_present(struct page *page);
#endif /* __ASSEMBLY__ */
diff --git a/arch/riscv/include/asm/unistd.h b/arch/riscv/include/asm/unistd.h
index 977ee6181dab..6c316093a1e5 100644
--- a/arch/riscv/include/asm/unistd.h
+++ b/arch/riscv/include/asm/unistd.h
@@ -9,6 +9,7 @@
*/
#define __ARCH_WANT_SYS_CLONE
+#define __ARCH_WANT_MEMFD_SECRET
#include <uapi/asm/unistd.h>
diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c
index 5e49e4b4a4cc..9618181b70be 100644
--- a/arch/riscv/mm/pageattr.c
+++ b/arch/riscv/mm/pageattr.c
@@ -156,11 +156,11 @@ int set_memory_nx(unsigned long addr, int numpages)
return __set_memory(addr, numpages, __pgprot(0), __pgprot(_PAGE_EXEC));
}
-int set_direct_map_invalid_noflush(struct page *page)
+int set_direct_map_invalid_noflush(struct page *page, int numpages)
{
int ret;
unsigned long start = (unsigned long)page_address(page);
- unsigned long end = start + PAGE_SIZE;
+ unsigned long end = start + PAGE_SIZE * numpages;
struct pageattr_masks masks = {
.set_mask = __pgprot(0),
.clear_mask = __pgprot(_PAGE_PRESENT)
@@ -173,11 +173,11 @@ int set_direct_map_invalid_noflush(struct page *page)
return ret;
}
-int set_direct_map_default_noflush(struct page *page)
+int set_direct_map_default_noflush(struct page *page, int numpages)
{
int ret;
unsigned long start = (unsigned long)page_address(page);
- unsigned long end = start + PAGE_SIZE;
+ unsigned long end = start + PAGE_SIZE * numpages;
struct pageattr_masks masks = {
.set_mask = PAGE_KERNEL,
.clear_mask = __pgprot(0)
diff --git a/arch/s390/boot/string.c b/arch/s390/boot/string.c
index b11e8108773a..faccb33b462c 100644
--- a/arch/s390/boot/string.c
+++ b/arch/s390/boot/string.c
@@ -3,6 +3,7 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#undef CONFIG_KASAN
+#undef CONFIG_KASAN_GENERIC
#include "../lib/string.c"
int strncmp(const char *cs, const char *ct, size_t count)
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index b467b57d66d1..dedfa6db9f8d 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -444,3 +444,4 @@
439 common faccessat2 sys_faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise sys_process_madvise
441 common watch_mount sys_watch_mount sys_watch_mount
+443 common epoll_pwait2 sys_epoll_pwait2 sys_epoll_pwait2
diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c
index de3bdbed8881..18f2d10c3176 100644
--- a/arch/s390/pci/pci_mmio.c
+++ b/arch/s390/pci/pci_mmio.c
@@ -170,7 +170,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
if (!(vma->vm_flags & VM_WRITE))
goto out_unlock_mmap;
- ret = follow_pte_pmd(vma->vm_mm, mmio_addr, NULL, &ptep, NULL, &ptl);
+ ret = follow_pte(vma->vm_mm, mmio_addr, NULL, &ptep, NULL, &ptl);
if (ret)
goto out_unlock_mmap;
@@ -311,7 +311,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
if (!(vma->vm_flags & VM_WRITE))
goto out_unlock_mmap;
- ret = follow_pte_pmd(vma->vm_mm, mmio_addr, NULL, &ptep, NULL, &ptl);
+ ret = follow_pte(vma->vm_mm, mmio_addr, NULL, &ptep, NULL, &ptl);
if (ret)
goto out_unlock_mmap;
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index f9c7ca5cb969..f7a383ae2c48 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -444,3 +444,4 @@
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
441 common watch_mount sys_watch_mount
+443 common epoll_pwait2 sys_epoll_pwait2
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 341fd4ba053b..a6cdf450d0f4 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -487,3 +487,4 @@
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
441 common watch_mount sys_watch_mount
+443 common epoll_pwait2 sys_epoll_pwait2
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6ea13f4ad07e..42d84d86f1f4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -42,7 +42,7 @@ config FORCE_DYNAMIC_FTRACE
in order to test the non static function tracing in the
generic code, as other architectures still use it. But we
only need to keep it around for x86_64. No need to keep it
- for x86_32. For x86_32, force DYNAMIC_FTRACE.
+ for x86_32. For x86_32, force DYNAMIC_FTRACE.
#
# Arch settings
#
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index d9a631c5973c..901ea5ebec22 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -12,6 +12,7 @@
#undef CONFIG_PARAVIRT_XXL
#undef CONFIG_PARAVIRT_SPINLOCKS
#undef CONFIG_KASAN
+#undef CONFIG_KASAN_GENERIC
/* cpu_feature_enabled() cannot be used this early */
#define USE_EARLY_PGTABLE_L5
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
index 81fc1eaa3229..d38b122f51ef 100644
--- a/arch/x86/boot/compressed/string.c
+++ b/arch/x86/boot/compressed/string.c
@@ -75,7 +75,7 @@ void *memcpy(void *dest, const void *src, size_t n)
}
#ifdef CONFIG_KASAN
-extern void *__memset(void *s, int c, size_t n) __alias(memset);
-extern void *__memmove(void *dest, const void *src, size_t n) __alias(memmove);
-extern void *__memcpy(void *dest, const void *src, size_t n) __alias(memcpy);
+extern void *__memset(void *s, int c, size_t n) __alias("memset");
+extern void *__memmove(void *dest, const void *src, size_t n) __alias("memmove");
+extern void *__memcpy(void *dest, const void *src, size_t n) __alias("memcpy");
#endif
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index c52ab1c4a755..5eec79578bba 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -446,3 +446,5 @@
439 i386 faccessat2 sys_faccessat2
440 i386 process_madvise sys_process_madvise
441 i386 watch_mount sys_watch_mount
+442 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
+443 i386 memfd_secret sys_memfd_secret
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f3270a9ef467..4507caf8d63c 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -363,6 +363,8 @@
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
441 common watch_mount sys_watch_mount
+442 common epoll_pwait2 sys_epoll_pwait2
+443 common memfd_secret sys_memfd_secret
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 4352f08bfbb5..6224cb291f6c 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -80,8 +80,8 @@ int set_pages_wb(struct page *page, int numpages);
int set_pages_ro(struct page *page, int numpages);
int set_pages_rw(struct page *page, int numpages);
-int set_direct_map_invalid_noflush(struct page *page);
-int set_direct_map_default_noflush(struct page *page);
+int set_direct_map_invalid_noflush(struct page *page, int numpages);
+int set_direct_map_default_noflush(struct page *page, int numpages);
bool kernel_page_present(struct page *page);
extern int kernel_set_to_readonly;
diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h
index a84333adeef2..f19d1bbbff3d 100644
--- a/arch/x86/include/asm/syscall_wrapper.h
+++ b/arch/x86/include/asm/syscall_wrapper.h
@@ -69,7 +69,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
long __##abi##_##name(const struct pt_regs *regs); \
ALLOW_ERROR_INJECTION(__##abi##_##name, ERRNO); \
long __##abi##_##name(const struct pt_regs *regs) \
- __alias(__do_##name);
+ __alias("__do_" #name);
#define __SYS_STUBx(abi, name, ...) \
long __##abi##_##name(const struct pt_regs *regs); \
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index c8daa92f38dc..5d3a0b8fd379 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -112,7 +112,7 @@ SYM_FUNC_START(do_suspend_lowlevel)
movq pt_regs_r14(%rax), %r14
movq pt_regs_r15(%rax), %r15
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK
/*
* The suspend path may have poisoned some areas deeper in the stack,
* which we now need to unpoison.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0d2ad1e8a802..02d71b3b7bd4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9877,7 +9877,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
r = -ENOMEM;
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!page)
goto fail_free_lapic;
vcpu->arch.pio_data = page_address(page);
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 16f878c26667..d157fd617c99 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2184,14 +2184,14 @@ static int __set_pages_np(struct page *page, int numpages)
return __change_page_attr_set_clr(&cpa, 0);
}
-int set_direct_map_invalid_noflush(struct page *page)
+int set_direct_map_invalid_noflush(struct page *page, int numpages)
{
- return __set_pages_np(page, 1);
+ return __set_pages_np(page, numpages);
}
-int set_direct_map_default_noflush(struct page *page)
+int set_direct_map_default_noflush(struct page *page, int numpages)
{
- return __set_pages_p(page, 1);
+ return __set_pages_p(page, numpages);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
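
With the new numpages argument, callers can flip a whole physically contiguous range of the direct map in one call. A hedged sketch of such a caller follows; hide_from_direct_map() is illustrative, not an existing kernel function:

#include <linux/mm.h>
#include <linux/set_memory.h>
#include <asm/tlbflush.h>

/* Illustrative only: drop `nr` contiguous pages from the direct map. */
static int hide_from_direct_map(struct page *page, int nr)
{
	unsigned long start = (unsigned long)page_address(page);
	int err = set_direct_map_invalid_noflush(page, nr);

	if (err)
		return err;
	/* The *_noflush variants leave TLB maintenance to the caller. */
	flush_tlb_kernel_range(start, start + nr * PAGE_SIZE);
	return 0;
}
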
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 6225d87be81f..1c8ae5fd026e 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -412,3 +412,4 @@
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
441 common watch_mount sys_watch_mount
+443 common epoll_pwait2 sys_epoll_pwait2
diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c
index 1410beaef5c3..14e380ac65d4 100644
--- a/drivers/firmware/efi/runtime-wrappers.c
+++ b/drivers/firmware/efi/runtime-wrappers.c
@@ -162,7 +162,7 @@ static DEFINE_SEMAPHORE(efi_runtime_lock);
* Expose the EFI runtime lock to the UV platform
*/
#ifdef CONFIG_X86_UV
-extern struct semaphore __efi_uv_runtime_lock __alias(efi_runtime_lock);
+extern struct semaphore __efi_uv_runtime_lock __alias("efi_runtime_lock");
#endif
/*
diff --git a/fs/dax.c b/fs/dax.c
index 5b47834f2e1b..0f109eb16196 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -49,9 +49,6 @@ static inline unsigned int pe_order(enum page_entry_size pe_size)
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
-/* The order of a PMD entry */
-#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
-
static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
static int __init init_dax_wait_table(void)
@@ -98,7 +95,7 @@ static bool dax_is_locked(void *entry)
static unsigned int dax_entry_order(void *entry)
{
if (xa_to_value(entry) & DAX_PMD)
- return PMD_ORDER;
+ return PMD_PAGE_ORDER;
return 0;
}
@@ -810,12 +807,11 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
address = pgoff_address(index, vma);
/*
- * Note because we provide range to follow_pte_pmd it will
- * call mmu_notifier_invalidate_range_start() on our behalf
- * before taking any lock.
+ * Note because we provide range to follow_pte it will call
+ * mmu_notifier_invalidate_range_start() on our behalf before
+ * taking any lock.
*/
- if (follow_pte_pmd(vma->vm_mm, address, &range,
- &ptep, &pmdp, &ptl))
+ if (follow_pte(vma->vm_mm, address, &range, &ptep, &pmdp, &ptl))
continue;
/*
@@ -1471,7 +1467,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping;
- XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
+ XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_PAGE_ORDER);
unsigned long pmd_addr = vmf->address & PMD_MASK;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool sync;
@@ -1530,7 +1526,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* entry is already in the array, for instance), it will return
* VM_FAULT_FALLBACK.
*/
- entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
+ entry = grab_mapping_entry(&xas, mapping, PMD_PAGE_ORDER);
if (xa_is_internal(entry)) {
result = xa_to_internal(entry);
goto fallback;
@@ -1696,7 +1692,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
if (order == 0)
ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
#ifdef CONFIG_FS_DAX_PMD
- else if (order == PMD_ORDER)
+ else if (order == PMD_PAGE_ORDER)
ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
#endif
else
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 10b81e69db74..a829af074eb5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -389,19 +389,24 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
*
* we must do our busy polling with irqs enabled
*/
-static void ep_busy_loop(struct eventpoll *ep, int nonblock)
+static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
unsigned int napi_id = READ_ONCE(ep->napi_id);
- if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
+ if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
BUSY_POLL_BUDGET);
-}
-
-static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
-{
- if (ep->napi_id)
+ if (ep_events_available(ep))
+ return true;
+ /*
+ * Busy poll timed out. Drop NAPI ID for now, we can add
+ * it back in when we have moved a socket with a valid NAPI
+ * ID onto the ready list.
+ */
ep->napi_id = 0;
+ return false;
+ }
+ return false;
}
/*
@@ -441,12 +446,9 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
#else
-static inline void ep_busy_loop(struct eventpoll *ep, int nonblock)
-{
-}
-
-static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
+static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
+ return false;
}
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
@@ -1625,6 +1627,14 @@ static int ep_send_events(struct eventpoll *ep,
poll_table pt;
int res = 0;
+ /*
+ * Always short-circuit for fatal signals to allow threads to make a
+ * timely exit without the chance of finding more events available and
+ * fetching repeatedly.
+ */
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
init_poll_funcptr(&pt, NULL);
mutex_lock(&ep->mtx);
@@ -1702,15 +1712,25 @@ static int ep_send_events(struct eventpoll *ep,
return res;
}
-static inline struct timespec64 ep_set_mstimeout(long ms)
+static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
{
- struct timespec64 now, ts = {
- .tv_sec = ms / MSEC_PER_SEC,
- .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
- };
+ struct timespec64 now;
+
+ if (ms < 0)
+ return NULL;
+
+ if (!ms) {
+ to->tv_sec = 0;
+ to->tv_nsec = 0;
+ return to;
+ }
+
+ to->tv_sec = ms / MSEC_PER_SEC;
+ to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
ktime_get_ts64(&now);
- return timespec64_add_safe(now, ts);
+ *to = timespec64_add_safe(now, *to);
+ return to;
}
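
The helper above encodes the three timeout behaviours the wrappers below rely on: a negative value blocks indefinitely, zero polls, and a positive value becomes an absolute expiry. A brief usage sketch, as it would appear inside fs/eventpoll.c:

/*
 * Usage sketch:
 *   ms < 0  -> NULL           (ep_poll() blocks indefinitely)
 *   ms == 0 -> zero timespec  (ep_poll() does not block)
 *   ms > 0  -> now + ms       (absolute expiry for the hrtimer)
 */
static void ep_timeout_examples(void)
{
	struct timespec64 to;
	struct timespec64 *p;

	p = ep_timeout_to_timespec(&to, -1);	/* p == NULL */
	p = ep_timeout_to_timespec(&to, 0);	/* p->tv_sec == 0, p->tv_nsec == 0 */
	p = ep_timeout_to_timespec(&to, 1500);	/* deadline 1.5s from now */
	(void)p;
}
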
/**
@@ -1722,8 +1742,8 @@ static inline struct timespec64 ep_set_mstimeout(long ms)
* stored.
* @maxevents: Size (in terms of number of events) of the caller event buffer.
* @timeout: Maximum timeout for the ready events fetch operation, in
- * milliseconds. If the @timeout is zero, the function will not block,
- * while if the @timeout is less than zero, the function will block
+ * timespec. If the timeout is zero, the function will not block,
+ * while if the @timeout ptr is NULL, the function will block
* until at least one event has been retrieved (or an error
* occurred).
*
@@ -1731,55 +1751,59 @@ static inline struct timespec64 ep_set_mstimeout(long ms)
* error code, in case of error.
*/
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
- int maxevents, long timeout)
+ int maxevents, struct timespec64 *timeout)
{
- int res = 0, eavail, timed_out = 0;
+ int res, eavail, timed_out = 0;
u64 slack = 0;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
lockdep_assert_irqs_enabled();
- if (timeout > 0) {
- struct timespec64 end_time = ep_set_mstimeout(timeout);
-
- slack = select_estimate_accuracy(&end_time);
+ if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
+ slack = select_estimate_accuracy(timeout);
to = &expires;
- *to = timespec64_to_ktime(end_time);
- } else if (timeout == 0) {
+ *to = timespec64_to_ktime(*timeout);
+ } else if (timeout) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
- * caller specified a non blocking operation. We still need
- * lock because we could race and not see an epi being added
- * to the ready list while in irq callback. Thus incorrectly
- * returning 0 back to userspace.
+ * caller specified a non blocking operation.
*/
timed_out = 1;
-
- write_lock_irq(&ep->lock);
- eavail = ep_events_available(ep);
- write_unlock_irq(&ep->lock);
-
- goto send_events;
}
-fetch_events:
+ /*
+ * This call is racy: We may or may not see events that are being added
+ * to the ready list under the lock (e.g., in IRQ callbacks). For cases
+ * with a non-zero timeout, this thread will check the ready list under
+ * lock and will add itself to the wait queue. For cases with a zero
+ * timeout, the user by definition should not care and will have to
+ * recheck again.
+ */
+ eavail = ep_events_available(ep);
+
+ while (1) {
+ if (eavail) {
+ /*
+ * Try to transfer events to user space. In case we get
+ * 0 events and there's still timeout left over, we go
+ * trying again in search of more luck.
+ */
+ res = ep_send_events(ep, events, maxevents);
+ if (res)
+ return res;
+ }
- if (!ep_events_available(ep))
- ep_busy_loop(ep, timed_out);
+ if (timed_out)
+ return 0;
- eavail = ep_events_available(ep);
- if (eavail)
- goto send_events;
+ eavail = ep_busy_loop(ep, timed_out);
+ if (eavail)
+ continue;
- /*
- * Busy poll timed out. Drop NAPI ID for now, we can add
- * it back in when we have moved a socket with a valid NAPI
- * ID onto the ready list.
- */
- ep_reset_busy_poll_napi_id(ep);
+ if (signal_pending(current))
+ return -EINTR;
- do {
/*
* Internally init_wait() uses autoremove_wake_function(),
* thus wait entry is removed from the wait queue on each
@@ -1809,55 +1833,38 @@ fetch_events:
* important.
*/
eavail = ep_events_available(ep);
- if (!eavail) {
- if (signal_pending(current))
- res = -EINTR;
- else
- __add_wait_queue_exclusive(&ep->wq, &wait);
- }
- write_unlock_irq(&ep->lock);
-
- if (eavail || res)
- break;
-
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
- timed_out = 1;
- break;
- }
-
- /* We were woken up, thus go and try to harvest some events */
- eavail = 1;
-
- } while (0);
+ if (!eavail)
+ __add_wait_queue_exclusive(&ep->wq, &wait);
- __set_current_state(TASK_RUNNING);
-
- if (!list_empty_careful(&wait.entry)) {
- write_lock_irq(&ep->lock);
- __remove_wait_queue(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
- }
-send_events:
- if (fatal_signal_pending(current)) {
+ if (!eavail)
+ timed_out = !schedule_hrtimeout_range(to, slack,
+ HRTIMER_MODE_ABS);
+ __set_current_state(TASK_RUNNING);
+
/*
- * Always short-circuit for fatal signals to allow
- * threads to make a timely exit without the chance of
- * finding more events available and fetching
- * repeatedly.
+ * We were woken up, thus go and try to harvest some events.
+ * If timed out and still on the wait queue, recheck eavail
+ * carefully under lock, below.
*/
- res = -EINTR;
- }
- /*
- * Try to transfer events to user space. In case we get 0 events and
- * there's still timeout left over, we go trying again in search of
- * more luck.
- */
- if (!res && eavail &&
- !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
- goto fetch_events;
+ eavail = 1;
- return res;
+ if (!list_empty_careful(&wait.entry)) {
+ write_lock_irq(&ep->lock);
+ /*
+ * If the thread timed out and is not on the wait queue,
+ * it means that the thread was woken up after its
+ * timeout expired before it could reacquire the lock.
+ * Thus, when wait.entry is empty, it needs to harvest
+ * events.
+ */
+ if (timed_out)
+ eavail = list_empty(&wait.entry);
+ __remove_wait_queue(&ep->wq, &wait);
+ write_unlock_irq(&ep->lock);
+ }
+ }
}
/**
@@ -2176,7 +2183,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* part of the user space epoll_wait(2).
*/
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
- int maxevents, int timeout)
+ int maxevents, struct timespec64 *to)
{
int error;
struct fd f;
@@ -2210,7 +2217,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
ep = f.file->private_data;
/* Time to fish for events ... */
- error = ep_poll(ep, events, maxevents, timeout);
+ error = ep_poll(ep, events, maxevents, to);
error_fput:
fdput(f);
@@ -2220,16 +2227,19 @@ error_fput:
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout)
{
- return do_epoll_wait(epfd, events, maxevents, timeout);
+ struct timespec64 to;
+
+ return do_epoll_wait(epfd, events, maxevents,
+ ep_timeout_to_timespec(&to, timeout));
}
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_pwait(2).
*/
-SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
- int, maxevents, int, timeout, const sigset_t __user *, sigmask,
- size_t, sigsetsize)
+static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
+ int maxevents, struct timespec64 *to,
+ const sigset_t __user *sigmask, size_t sigsetsize)
{
int error;
@@ -2241,18 +2251,47 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
if (error)
return error;
- error = do_epoll_wait(epfd, events, maxevents, timeout);
+ error = do_epoll_wait(epfd, events, maxevents, to);
+
restore_saved_sigmask_unless(error == -EINTR);
return error;
}
+SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
+ int, maxevents, int, timeout, const sigset_t __user *, sigmask,
+ size_t, sigsetsize)
+{
+ struct timespec64 to;
+
+ return do_epoll_pwait(epfd, events, maxevents,
+ ep_timeout_to_timespec(&to, timeout),
+ sigmask, sigsetsize);
+}
+
+SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
+ int, maxevents, const struct __kernel_timespec __user *, timeout,
+ const sigset_t __user *, sigmask, size_t, sigsetsize)
+{
+ struct timespec64 ts, *to = NULL;
+
+ if (timeout) {
+ if (get_timespec64(&ts, timeout))
+ return -EFAULT;
+ to = &ts;
+ if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+ return -EINVAL;
+ }
+
+ return do_epoll_pwait(epfd, events, maxevents, to,
+ sigmask, sigsetsize);
+}
+
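
Together with the syscall table entries earlier in this merge, the new call can be exercised from userspace roughly as below. This is a sketch that assumes a 64-bit ABI, where struct timespec matches __kernel_timespec, and headers that already define SYS_epoll_pwait2:

#define _GNU_SOURCE
#include <sys/epoll.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <time.h>
#include <stddef.h>

/* Raw wrapper; keeps the current signal mask by passing a NULL sigmask. */
static int epoll_pwait2_raw(int epfd, struct epoll_event *events,
			    int maxevents, const struct timespec *timeout)
{
	return syscall(SYS_epoll_pwait2, epfd, events, maxevents,
		       timeout, NULL, (size_t)0);
}

/* Example: wait at most 250 microseconds, which epoll_wait() cannot express. */
static int wait_250us(int epfd, struct epoll_event *events, int maxevents)
{
	struct timespec to = { .tv_sec = 0, .tv_nsec = 250 * 1000 };

	return epoll_pwait2_raw(epfd, events, maxevents, &to);
}
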
#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
- struct epoll_event __user *, events,
- int, maxevents, int, timeout,
- const compat_sigset_t __user *, sigmask,
- compat_size_t, sigsetsize)
+static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
+ int maxevents, struct timespec64 *timeout,
+ const compat_sigset_t __user *sigmask,
+ compat_size_t sigsetsize)
{
long err;
@@ -2265,10 +2304,46 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
return err;
err = do_epoll_wait(epfd, events, maxevents, timeout);
+
restore_saved_sigmask_unless(err == -EINTR);
return err;
}
+
+COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
+ struct epoll_event __user *, events,
+ int, maxevents, int, timeout,
+ const compat_sigset_t __user *, sigmask,
+ compat_size_t, sigsetsize)
+{
+ struct timespec64 to;
+
+ return do_compat_epoll_pwait(epfd, events, maxevents,
+ ep_timeout_to_timespec(&to, timeout),
+ sigmask, sigsetsize);
+}
+
+COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
+ struct epoll_event __user *, events,
+ int, maxevents,
+ const struct __kernel_timespec __user *, timeout,
+ const compat_sigset_t __user *, sigmask,
+ compat_size_t, sigsetsize)
+{
+ struct timespec64 ts, *to = NULL;
+
+ if (timeout) {
+ if (get_timespec64(&ts, timeout))
+ return -EFAULT;
+ to = &ts;
+ if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+ return -EINVAL;
+ }
+
+ return do_compat_epoll_pwait(epfd, events, maxevents, to,
+ sigmask, sigsetsize);
+}
+
#endif
static int __init eventpoll_init(void)
diff --git a/fs/exec.c b/fs/exec.c
index 5d4d52039105..9870163abdc2 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1001,6 +1001,14 @@ static int exec_mmap(struct mm_struct *mm)
}
}
+#if defined(CONFIG_LOCKDEP) || defined(CONFIG_DEBUG_VM)
+ /*
+ * From here on, the mm may be accessed concurrently, and proper locking
+ * is required for things like get_user_pages_remote().
+ */
+ mm->mmap_lock_required = 1;
+#endif
+
task_lock(tsk);
membarrier_exec_mmap(mm);
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 400c0941c8af..6e65be753603 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -537,6 +537,12 @@ asmlinkage long compat_sys_epoll_pwait(int epfd,
int maxevents, int timeout,
const compat_sigset_t __user *sigmask,
compat_size_t sigsetsize);
+asmlinkage long compat_sys_epoll_pwait2(int epfd,
+ struct epoll_event __user *events,
+ int maxevents,
+ const struct __kernel_timespec __user *timeout,
+ const compat_sigset_t __user *sigmask,
+ compat_size_t sigsetsize);
/* fs/fcntl.c */
asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd,
diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index ea5e04e75845..42bfc97e2d15 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -42,7 +42,7 @@
/*
* gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-alias-function-attribute
*/
-#define __alias(symbol) __attribute__((__alias__(#symbol)))
+#define __alias(symbol) __attribute__((__alias__(symbol)))
/*
* gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-aligned-function-attribute
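
The attribute now takes an ordinary string literal, matching the string.c and runtime-wrappers.c hunks above. A minimal sketch of the new spelling, as it would appear in a kernel compilation unit:

/* Mirrors the arch string.c hunks above: the alias target is now a
 * plain string literal rather than a stringified token. */
void *memset(void *s, int c, size_t n);
extern void *__memset(void *s, int c, size_t n) __alias("memset");
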
diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h
index ac6aba632f2d..ca5e89fb10d3 100644
--- a/include/linux/kasan-checks.h
+++ b/include/linux/kasan-checks.h
@@ -9,7 +9,7 @@
* even in compilation units that selectively disable KASAN, but must use KASAN
* to validate access to an address. Never use these in header files!
*/
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
bool __kasan_check_read(const volatile void *p, unsigned int size);
bool __kasan_check_write(const volatile void *p, unsigned int size);
#else
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 30d343b4a40a..5e0655fb2a6f 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_KASAN_H
#define _LINUX_KASAN_H
+#include <linux/static_key.h>
#include <linux/types.h>
struct kmem_cache;
@@ -11,7 +12,7 @@ struct task_struct;
#ifdef CONFIG_KASAN
-#include <linux/pgtable.h>
+#include <linux/linkage.h>
#include <asm/kasan.h>
/* kasan_data struct is used in KUnit tests for KASAN expected failures */
@@ -20,6 +21,20 @@ struct kunit_kasan_expectation {
bool report_found;
};
+#endif
+
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
+
+#include <linux/pgtable.h>
+
+/* Software KASAN implementations use shadow memory. */
+
+#ifdef CONFIG_KASAN_SW_TAGS
+#define KASAN_SHADOW_INIT 0xFF
+#else
+#define KASAN_SHADOW_INIT 0
+#endif
+
extern unsigned char kasan_early_shadow_page[PAGE_SIZE];
extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE];
extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD];
@@ -35,88 +50,219 @@ static inline void *kasan_mem_to_shadow(const void *addr)
+ KASAN_SHADOW_OFFSET;
}
+int kasan_add_zero_shadow(void *start, unsigned long size);
+void kasan_remove_zero_shadow(void *start, unsigned long size);
+
/* Enable reporting bugs after kasan_disable_current() */
extern void kasan_enable_current(void);
/* Disable reporting bugs for current task */
extern void kasan_disable_current(void);
-void kasan_unpoison_shadow(const void *address, size_t size);
-
-void kasan_unpoison_task_stack(struct task_struct *task);
-
-void kasan_alloc_pages(struct page *page, unsigned int order);
-void kasan_free_pages(struct page *page, unsigned int order);
+#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
-void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
- slab_flags_t *flags);
+static inline int kasan_add_zero_shadow(void *start, unsigned long size)
+{
+ return 0;
+}
+static inline void kasan_remove_zero_shadow(void *start,
+ unsigned long size)
+{}
-void kasan_poison_slab(struct page *page);
-void kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
-void kasan_poison_object_data(struct kmem_cache *cache, void *object);
-void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
- const void *object);
+static inline void kasan_enable_current(void) {}
+static inline void kasan_disable_current(void) {}
-void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
- gfp_t flags);
-void kasan_kfree_large(void *ptr, unsigned long ip);
-void kasan_poison_kfree(void *ptr, unsigned long ip);
-void * __must_check kasan_kmalloc(struct kmem_cache *s, const void *object,
- size_t size, gfp_t flags);
-void * __must_check kasan_krealloc(const void *object, size_t new_size,
- gfp_t flags);
+#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
-void * __must_check kasan_slab_alloc(struct kmem_cache *s, void *object,
- gfp_t flags);
-bool kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip);
+#ifdef CONFIG_KASAN
struct kasan_cache {
int alloc_meta_offset;
int free_meta_offset;
};
-/*
- * These functions provide a special case to support backing module
- * allocations with real shadow memory. With KASAN vmalloc, the special
- * case is unnecessary, as the work is handled in the generic case.
- */
-#ifndef CONFIG_KASAN_VMALLOC
-int kasan_module_alloc(void *addr, size_t size);
-void kasan_free_shadow(const struct vm_struct *vm);
-#else
-static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
-static inline void kasan_free_shadow(const struct vm_struct *vm) {}
-#endif
+#ifdef CONFIG_KASAN_HW_TAGS
-int kasan_add_zero_shadow(void *start, unsigned long size);
-void kasan_remove_zero_shadow(void *start, unsigned long size);
+DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled);
-size_t __ksize(const void *);
-static inline void kasan_unpoison_slab(const void *ptr)
+static __always_inline bool kasan_enabled(void)
{
- kasan_unpoison_shadow(ptr, __ksize(ptr));
+ return static_branch_likely(&kasan_flag_enabled);
}
-size_t kasan_metadata_size(struct kmem_cache *cache);
-bool kasan_save_enable_multi_shot(void);
-void kasan_restore_multi_shot(bool enabled);
+#else /* CONFIG_KASAN_HW_TAGS */
-#else /* CONFIG_KASAN */
+static inline bool kasan_enabled(void)
+{
+ return true;
+}
-static inline void kasan_unpoison_shadow(const void *address, size_t size) {}
+#endif /* CONFIG_KASAN_HW_TAGS */
-static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
+slab_flags_t __kasan_never_merge(void);
+static __always_inline slab_flags_t kasan_never_merge(void)
+{
+ if (kasan_enabled())
+ return __kasan_never_merge();
+ return 0;
+}
-static inline void kasan_enable_current(void) {}
-static inline void kasan_disable_current(void) {}
+void __kasan_unpoison_range(const void *addr, size_t size);
+static __always_inline void kasan_unpoison_range(const void *addr, size_t size)
+{
+ if (kasan_enabled())
+ __kasan_unpoison_range(addr, size);
+}
+
+void __kasan_alloc_pages(struct page *page, unsigned int order);
+static __always_inline void kasan_alloc_pages(struct page *page,
+ unsigned int order)
+{
+ if (kasan_enabled())
+ __kasan_alloc_pages(page, order);
+}
+
+void __kasan_free_pages(struct page *page, unsigned int order);
+static __always_inline void kasan_free_pages(struct page *page,
+ unsigned int order)
+{
+ if (kasan_enabled())
+ __kasan_free_pages(page, order);
+}
+
+void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
+ slab_flags_t *flags);
+static __always_inline void kasan_cache_create(struct kmem_cache *cache,
+ unsigned int *size, slab_flags_t *flags)
+{
+ if (kasan_enabled())
+ __kasan_cache_create(cache, size, flags);
+}
+
+size_t __kasan_metadata_size(struct kmem_cache *cache);
+static __always_inline size_t kasan_metadata_size(struct kmem_cache *cache)
+{
+ if (kasan_enabled())
+ return __kasan_metadata_size(cache);
+ return 0;
+}
+
+void __kasan_poison_slab(struct page *page);
+static __always_inline void kasan_poison_slab(struct page *page)
+{
+ if (kasan_enabled())
+ __kasan_poison_slab(page);
+}
+
+void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
+static __always_inline void kasan_unpoison_object_data(struct kmem_cache *cache,
+ void *object)
+{
+ if (kasan_enabled())
+ __kasan_unpoison_object_data(cache, object);
+}
+
+void __kasan_poison_object_data(struct kmem_cache *cache, void *object);
+static __always_inline void kasan_poison_object_data(struct kmem_cache *cache,
+ void *object)
+{
+ if (kasan_enabled())
+ __kasan_poison_object_data(cache, object);
+}
+
+void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
+ const void *object);
+static __always_inline void * __must_check kasan_init_slab_obj(
+ struct kmem_cache *cache, const void *object)
+{
+ if (kasan_enabled())
+ return __kasan_init_slab_obj(cache, object);
+ return (void *)object;
+}
+
+bool __kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip);
+static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object,
+ unsigned long ip)
+{
+ if (kasan_enabled())
+ return __kasan_slab_free(s, object, ip);
+ return false;
+}
+
+void __kasan_slab_free_mempool(void *ptr, unsigned long ip);
+static __always_inline void kasan_slab_free_mempool(void *ptr, unsigned long ip)
+{
+ if (kasan_enabled())
+ __kasan_slab_free_mempool(ptr, ip);
+}
+void * __must_check __kasan_slab_alloc(struct kmem_cache *s,
+ void *object, gfp_t flags);
+static __always_inline void * __must_check kasan_slab_alloc(
+ struct kmem_cache *s, void *object, gfp_t flags)
+{
+ if (kasan_enabled())
+ return __kasan_slab_alloc(s, object, flags);
+ return object;
+}
+
+void * __must_check __kasan_kmalloc(struct kmem_cache *s, const void *object,
+ size_t size, gfp_t flags);
+static __always_inline void * __must_check kasan_kmalloc(struct kmem_cache *s,
+ const void *object, size_t size, gfp_t flags)
+{
+ if (kasan_enabled())
+ return __kasan_kmalloc(s, object, size, flags);
+ return (void *)object;
+}
+
+void * __must_check __kasan_kmalloc_large(const void *ptr,
+ size_t size, gfp_t flags);
+static __always_inline void * __must_check kasan_kmalloc_large(const void *ptr,
+ size_t size, gfp_t flags)
+{
+ if (kasan_enabled())
+ return __kasan_kmalloc_large(ptr, size, flags);
+ return (void *)ptr;
+}
+
+void * __must_check __kasan_krealloc(const void *object,
+ size_t new_size, gfp_t flags);
+static __always_inline void * __must_check kasan_krealloc(const void *object,
+ size_t new_size, gfp_t flags)
+{
+ if (kasan_enabled())
+ return __kasan_krealloc(object, new_size, flags);
+ return (void *)object;
+}
+
+void __kasan_kfree_large(void *ptr, unsigned long ip);
+static __always_inline void kasan_kfree_large(void *ptr, unsigned long ip)
+{
+ if (kasan_enabled())
+ __kasan_kfree_large(ptr, ip);
+}
+
+bool kasan_save_enable_multi_shot(void);
+void kasan_restore_multi_shot(bool enabled);
+
+#else /* CONFIG_KASAN */
+
+static inline bool kasan_enabled(void)
+{
+ return false;
+}
+static inline slab_flags_t kasan_never_merge(void)
+{
+ return 0;
+}
+static inline void kasan_unpoison_range(const void *address, size_t size) {}
static inline void kasan_alloc_pages(struct page *page, unsigned int order) {}
static inline void kasan_free_pages(struct page *page, unsigned int order) {}
-
static inline void kasan_cache_create(struct kmem_cache *cache,
unsigned int *size,
slab_flags_t *flags) {}
-
+static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
static inline void kasan_poison_slab(struct page *page) {}
static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
void *object) {}
@@ -127,54 +273,42 @@ static inline void *kasan_init_slab_obj(struct kmem_cache *cache,
{
return (void *)object;
}
-
-static inline void *kasan_kmalloc_large(void *ptr, size_t size, gfp_t flags)
+static inline bool kasan_slab_free(struct kmem_cache *s, void *object,
+ unsigned long ip)
{
- return ptr;
+ return false;
+}
+static inline void kasan_slab_free_mempool(void *ptr, unsigned long ip) {}
+static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object,
+ gfp_t flags)
+{
+ return object;
}
-static inline void kasan_kfree_large(void *ptr, unsigned long ip) {}
-static inline void kasan_poison_kfree(void *ptr, unsigned long ip) {}
static inline void *kasan_kmalloc(struct kmem_cache *s, const void *object,
size_t size, gfp_t flags)
{
return (void *)object;
}
+static inline void *kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
+{
+ return (void *)ptr;
+}
static inline void *kasan_krealloc(const void *object, size_t new_size,
gfp_t flags)
{
return (void *)object;
}
-
-static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object,
- gfp_t flags)
-{
- return object;
-}
-static inline bool kasan_slab_free(struct kmem_cache *s, void *object,
- unsigned long ip)
-{
- return false;
-}
-
-static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
-static inline void kasan_free_shadow(const struct vm_struct *vm) {}
-
-static inline int kasan_add_zero_shadow(void *start, unsigned long size)
-{
- return 0;
-}
-static inline void kasan_remove_zero_shadow(void *start,
- unsigned long size)
-{}
-
-static inline void kasan_unpoison_slab(const void *ptr) { }
-static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
+static inline void kasan_kfree_large(void *ptr, unsigned long ip) {}
#endif /* CONFIG_KASAN */
-#ifdef CONFIG_KASAN_GENERIC
+#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK
+void kasan_unpoison_task_stack(struct task_struct *task);
+#else
+static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
+#endif
-#define KASAN_SHADOW_INIT 0
+#ifdef CONFIG_KASAN_GENERIC
void kasan_cache_shrink(struct kmem_cache *cache);
void kasan_cache_shutdown(struct kmem_cache *cache);
@@ -188,36 +322,50 @@ static inline void kasan_record_aux_stack(void *ptr) {}
#endif /* CONFIG_KASAN_GENERIC */
-#ifdef CONFIG_KASAN_SW_TAGS
-
-#define KASAN_SHADOW_INIT 0xFF
-
-void kasan_init_tags(void);
+#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
-void *kasan_reset_tag(const void *addr);
+static inline void *kasan_reset_tag(const void *addr)
+{
+ return (void *)arch_kasan_reset_tag(addr);
+}
bool kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
-#else /* CONFIG_KASAN_SW_TAGS */
-
-static inline void kasan_init_tags(void) { }
+#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
static inline void *kasan_reset_tag(const void *addr)
{
return (void *)addr;
}
-#endif /* CONFIG_KASAN_SW_TAGS */
+#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS*/
+
+#ifdef CONFIG_KASAN_SW_TAGS
+void __init kasan_init_sw_tags(void);
+#else
+static inline void kasan_init_sw_tags(void) { }
+#endif
+
+#ifdef CONFIG_KASAN_HW_TAGS
+void kasan_init_hw_tags_cpu(void);
+void __init kasan_init_hw_tags(void);
+#else
+static inline void kasan_init_hw_tags_cpu(void) { }
+static inline void kasan_init_hw_tags(void) { }
+#endif
#ifdef CONFIG_KASAN_VMALLOC
+
int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
void kasan_poison_vmalloc(const void *start, unsigned long size);
void kasan_unpoison_vmalloc(const void *start, unsigned long size);
void kasan_release_vmalloc(unsigned long start, unsigned long end,
unsigned long free_region_start,
unsigned long free_region_end);
-#else
+
+#else /* CONFIG_KASAN_VMALLOC */
+
static inline int kasan_populate_vmalloc(unsigned long start,
unsigned long size)
{
@@ -232,7 +380,26 @@ static inline void kasan_release_vmalloc(unsigned long start,
unsigned long end,
unsigned long free_region_start,
unsigned long free_region_end) {}
-#endif
+
+#endif /* CONFIG_KASAN_VMALLOC */
+
+#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \
+ !defined(CONFIG_KASAN_VMALLOC)
+
+/*
+ * These functions provide a special case to support backing module
+ * allocations with real shadow memory. With KASAN vmalloc, the special
+ * case is unnecessary, as the work is handled in the generic case.
+ */
+int kasan_module_alloc(void *addr, size_t size);
+void kasan_free_shadow(const struct vm_struct *vm);
+
+#else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */
+
+static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
+static inline void kasan_free_shadow(const struct vm_struct *vm) {}
+
+#endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */
#ifdef CONFIG_KASAN_INLINE
void kasan_non_canonical_hook(unsigned long addr);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 08ed57e02b73..3febf64d1b80 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -475,19 +475,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
-/*
- * set_page_objcgs - associate a page with a object cgroups vector
- * @page: a pointer to the page struct
- * @objcgs: a pointer to the object cgroups vector
- *
- * Atomically associates a page with a vector of object cgroups.
- */
-static inline bool set_page_objcgs(struct page *page,
- struct obj_cgroup **objcgs)
-{
- return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs |
- MEMCG_DATA_OBJCGS);
-}
#else
static inline struct obj_cgroup **page_objcgs(struct page *page)
{
@@ -498,12 +485,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
{
return NULL;
}
-
-static inline bool set_page_objcgs(struct page *page,
- struct obj_cgroup **objcgs)
-{
- return true;
-}
#endif
static __always_inline bool memcg_stat_item_in_bytes(int idx)
@@ -620,9 +601,10 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
/**
* mem_cgroup_lruvec - get the lru list vector for a memcg & node
* @memcg: memcg of the wanted lruvec
+ * @pgdat: pglist_data
*
* Returns the lru list vector holding pages for a given @memcg &
- * @node combination. This can be the node lruvec, if the memory
+ * @pgdat combination. This can be the node lruvec, if the memory
* controller is disabled.
*/
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
@@ -652,7 +634,21 @@ out:
return lruvec;
}
-struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
+/**
+ * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
+ * @page: the page
+ * @pgdat: pgdat of the page
+ *
+ * This function relies on page->mem_cgroup being stable.
+ */
+static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
+ struct pglist_data *pgdat)
+{
+ struct mem_cgroup *memcg = page_memcg(page);
+
+ VM_WARN_ON_ONCE_PAGE(!memcg, page);
+ return mem_cgroup_lruvec(memcg, pgdat);
+}
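With mem_cgroup_page_lruvec() now open-coded as page_memcg() plus mem_cgroup_lruvec(), a caller only needs the page and its node to find the right LRU vector. A minimal sketch, with locking and list manipulation omitted, assuming the page's memcg association is stable (for example because the page is isolated):

static struct lruvec *page_lruvec_sketch(struct page *page)
{
	/* page_memcg(page) must not change under us here. */
	return mem_cgroup_page_lruvec(page, page_pgdat(page));
}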
static inline bool lruvec_holds_page_lru_lock(struct page *page,
struct lruvec *lruvec)
@@ -913,41 +909,6 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
local_irq_restore(flags);
}
-/**
- * mod_memcg_page_state - update page state statistics
- * @page: the page
- * @idx: page state item to account
- * @val: number of pages (positive or negative)
- *
- * The @page must be locked or the caller must use lock_page_memcg()
- * to prevent double accounting when the page is concurrently being
- * moved to another memcg:
- *
- * lock_page(page) or lock_page_memcg(page)
- * if (TestClearPageState(page))
- * mod_memcg_page_state(page, state, -1);
- * unlock_page(page) or unlock_page_memcg(page)
- *
- * Kernel pages are an exception to this, since they'll never move.
- */
-static inline void __mod_memcg_page_state(struct page *page,
- int idx, int val)
-{
- struct mem_cgroup *memcg = page_memcg(page);
-
- if (memcg)
- __mod_memcg_state(memcg, idx, val);
-}
-
-static inline void mod_memcg_page_state(struct page *page,
- int idx, int val)
-{
- struct mem_cgroup *memcg = page_memcg(page);
-
- if (memcg)
- mod_memcg_state(memcg, idx, val);
-}
-
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
enum node_stat_item idx)
{
@@ -1395,18 +1356,6 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
{
}
-static inline void __mod_memcg_page_state(struct page *page,
- int idx,
- int nr)
-{
-}
-
-static inline void mod_memcg_page_state(struct page *page,
- int idx,
- int nr)
-{
-}
-
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
enum node_stat_item idx)
{
@@ -1479,34 +1428,6 @@ static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
}
#endif /* CONFIG_MEMCG */
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void __inc_memcg_state(struct mem_cgroup *memcg,
- int idx)
-{
- __mod_memcg_state(memcg, idx, 1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void __dec_memcg_state(struct mem_cgroup *memcg,
- int idx)
-{
- __mod_memcg_state(memcg, idx, -1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void __inc_memcg_page_state(struct page *page,
- int idx)
-{
- __mod_memcg_page_state(page, idx, 1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void __dec_memcg_page_state(struct page *page,
- int idx)
-{
- __mod_memcg_page_state(page, idx, -1);
-}
-
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
{
__mod_lruvec_kmem_state(p, idx, 1);
@@ -1517,34 +1438,6 @@ static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx)
__mod_lruvec_kmem_state(p, idx, -1);
}
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void inc_memcg_state(struct mem_cgroup *memcg,
- int idx)
-{
- mod_memcg_state(memcg, idx, 1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void dec_memcg_state(struct mem_cgroup *memcg,
- int idx)
-{
- mod_memcg_state(memcg, idx, -1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void inc_memcg_page_state(struct page *page,
- int idx)
-{
- mod_memcg_page_state(page, idx, 1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void dec_memcg_page_state(struct page *page,
- int idx)
-{
- mod_memcg_page_state(page, idx, -1);
-}
-
static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
{
struct mem_cgroup *memcg;
@@ -1733,21 +1626,6 @@ static inline void memcg_kmem_uncharge_page(struct page *page, int order)
__memcg_kmem_uncharge_page(page, order);
}
-static inline int memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned int nr_pages)
-{
- if (memcg_kmem_enabled())
- return __memcg_kmem_charge(memcg, gfp, nr_pages);
- return 0;
-}
-
-static inline void memcg_kmem_uncharge(struct mem_cgroup *memcg,
- unsigned int nr_pages)
-{
- if (memcg_kmem_enabled())
- __memcg_kmem_uncharge(memcg, nr_pages);
-}
-
/*
* A helper for accessing memcg's kmem_id, used for getting
* corresponding LRU lists.
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8dbaae697428..5299b90a6c40 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -31,6 +31,7 @@
#include <linux/sizes.h>
#include <linux/sched.h>
#include <linux/pgtable.h>
+#include <linux/kasan.h>
struct mempolicy;
struct anon_vma;
@@ -1421,23 +1422,31 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
}
#endif /* CONFIG_NUMA_BALANCING */
-#ifdef CONFIG_KASAN_SW_TAGS
+#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
+
static inline u8 page_kasan_tag(const struct page *page)
{
- return (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
+ if (kasan_enabled())
+ return (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
+ return 0xff;
}
static inline void page_kasan_tag_set(struct page *page, u8 tag)
{
- page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
- page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
+ if (kasan_enabled()) {
+ page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
+ page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
+ }
}
static inline void page_kasan_tag_reset(struct page *page)
{
- page_kasan_tag_set(page, 0xff);
+ if (kasan_enabled())
+ page_kasan_tag_set(page, 0xff);
}
-#else
+
+#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
+
static inline u8 page_kasan_tag(const struct page *page)
{
return 0xff;
@@ -1445,7 +1454,8 @@ static inline u8 page_kasan_tag(const struct page *page)
static inline void page_kasan_tag_set(struct page *page, u8 tag) { }
static inline void page_kasan_tag_reset(struct page *page) { }
-#endif
+
+#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
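The accessors above are now gated on kasan_enabled(), which for the hardware tag-based mode is presumably a static-key check, so the tag bits in page->flags are only touched when KASAN was actually initialized at boot. A short sketch of the resulting caller pattern:

/* Reset the KASAN tag of every page in an order-N block. */
static void reset_block_tags_sketch(struct page *page, unsigned int order)
{
	unsigned long i;

	for (i = 0; i < (1UL << order); i++)
		page_kasan_tag_reset(page + i);	/* no-op unless kasan_enabled() */
}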
static inline struct zone *page_zone(const struct page *page)
{
@@ -1641,9 +1651,9 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
-int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range,
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp);
+int follow_pte(struct mm_struct *mm, unsigned long address,
+ struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp,
+ spinlock_t **ptlp);
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn);
int follow_phys(struct vm_area_struct *vma, unsigned long address,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 07d9acb5b19c..65df8abd90bd 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -561,6 +561,16 @@ struct mm_struct {
#ifdef CONFIG_IOMMU_SUPPORT
u32 pasid;
#endif
+
+#if defined(CONFIG_LOCKDEP) || defined(CONFIG_DEBUG_VM)
+ /*
+ * Notes whether this mm has been installed on a process yet.
+ * If not, only the task going through execve() can access this
+ * mm, and no locking is needed around get_user_pages_remote().
+ * This flag is only used for debug checks.
+ */
+ bool mmap_lock_required;
+#endif
} __randomize_layout;
/*
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 0540f0156f58..fe8193d94921 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -161,14 +161,22 @@ static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
static inline void mmap_assert_locked(struct mm_struct *mm)
{
- lockdep_assert_held(&mm->mmap_lock);
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+#if defined(CONFIG_LOCKDEP) || defined(CONFIG_DEBUG_VM)
+ if (mm->mmap_lock_required) {
+ lockdep_assert_held(&mm->mmap_lock);
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+ }
+#endif
}
static inline void mmap_assert_write_locked(struct mm_struct *mm)
{
- lockdep_assert_held_write(&mm->mmap_lock);
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+#if defined(CONFIG_LOCKDEP) || defined(CONFIG_DEBUG_VM)
+ if (mm->mmap_lock_required) {
+ lockdep_assert_held_write(&mm->mmap_lock);
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+ }
+#endif
}
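Together with the new mm_struct::mmap_lock_required flag, these assertions only fire once an mm can actually be reached by other tasks, as suggested by the comment in the mm_types.h hunk. A sketch of the intended split; the helper that sets the flag is hypothetical, since this diff only adds the field and the checks:

/* Page-table walkers keep asserting the lock as before ... */
static long walk_user_pages_sketch(struct mm_struct *mm)
{
	mmap_assert_locked(mm);	/* skipped while mmap_lock_required is false */
	/* ... follow_page()/handle_mm_fault() style work would go here ... */
	return 0;
}

/* ... while execve would mark the mm once other tasks can reach it. */
static void mark_mm_installed_sketch(struct mm_struct *mm)
{
#if defined(CONFIG_LOCKDEP) || defined(CONFIG_DEBUG_VM)
	mm->mmap_lock_required = true;	/* hypothetical setter, not in this hunk */
#endif
}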
static inline int mmap_lock_is_contended(struct mm_struct *mm)
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 2ad72d2c8cc5..5d0767cb424a 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -37,6 +37,18 @@ void dump_mm(const struct mm_struct *mm);
BUG(); \
} \
} while (0)
+#define VM_WARN_ON_ONCE_PAGE(cond, page) ({ \
+ static bool __section(".data.once") __warned; \
+ int __ret_warn_once = !!(cond); \
+ \
+ if (unlikely(__ret_warn_once && !__warned)) { \
+ dump_page(page, "VM_WARN_ON_ONCE_PAGE(" __stringify(cond)")");\
+ __warned = true; \
+ WARN_ON(1); \
+ } \
+ unlikely(__ret_warn_once); \
+})
+
#define VM_WARN_ON(cond) (void)WARN_ON(cond)
#define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond)
#define VM_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format)
@@ -48,6 +60,7 @@ void dump_mm(const struct mm_struct *mm);
#define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond)
#define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
+#define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
#endif
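VM_WARN_ON_ONCE_PAGE() behaves like VM_WARN_ON_ONCE() but also dumps the offending page the first time the condition trips, and without CONFIG_DEBUG_VM it reduces to a build-time check of the condition. A minimal usage sketch in the style of the mem_cgroup_page_lruvec() hunk earlier in this diff:

static bool page_has_memcg_sketch(struct page *page)
{
	struct mem_cgroup *memcg = page_memcg(page);

	/* Warn once and dump_page() if the page unexpectedly has no memcg. */
	VM_WARN_ON_ONCE_PAGE(!memcg, page);
	return memcg != NULL;
}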
diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index 4fa67a8b2265..9e09d11ffe5b 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -96,7 +96,8 @@ void module_arch_cleanup(struct module *mod);
/* Any cleanup before freeing mod->module_init */
void module_arch_freeing_init(struct module *mod);
-#if defined(CONFIG_KASAN) && !defined(CONFIG_KASAN_VMALLOC)
+#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \
+ !defined(CONFIG_KASAN_VMALLOC)
#include <linux/kasan.h>
#define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT)
#else
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index e200eef6a7fd..7d4ec26d8a3e 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -77,7 +77,7 @@
#define LAST_CPUPID_SHIFT 0
#endif
-#ifdef CONFIG_KASAN_SW_TAGS
+#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
#define KASAN_TAG_WIDTH 8
#else
#define KASAN_TAG_WIDTH 0
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 8fcdfa52eb4b..ea5c4102c23e 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -28,6 +28,9 @@
#define USER_PGTABLES_CEILING 0UL
#endif
+/* Order (log2 of the number of base pages) of a second level leaf page */
+#define PMD_PAGE_ORDER (PMD_SHIFT - PAGE_SHIFT)
+
/*
* A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD]
*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dfadcf0f9e30..e5ad6d354b7b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1235,7 +1235,7 @@ struct task_struct {
u64 timer_slack_ns;
u64 default_timer_slack_ns;
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
unsigned int kasan_depth;
#endif
diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
new file mode 100644
index 000000000000..907a6734059c
--- /dev/null
+++ b/include/linux/secretmem.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _LINUX_SECRETMEM_H
+#define _LINUX_SECRETMEM_H
+
+#ifdef CONFIG_SECRETMEM
+
+bool vma_is_secretmem(struct vm_area_struct *vma);
+bool page_is_secretmem(struct page *page);
+bool secretmem_active(void);
+
+#else
+
+static inline bool vma_is_secretmem(struct vm_area_struct *vma)
+{
+ return false;
+}
+
+static inline bool page_is_secretmem(struct page *page)
+{
+ return false;
+}
+
+static inline bool secretmem_active(void)
+{
+ return false;
+}
+
+#endif /* CONFIG_SECRETMEM */
+
+#endif /* _LINUX_SECRETMEM_H */
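The stubs above let generic code test for secret-memory VMAs and pages unconditionally; without CONFIG_SECRETMEM they all fold to false. A sketch of the gating pattern that the gup.c and filemap.c hunks later in this diff apply:

static struct page *grab_page_unless_secret_sketch(struct vm_area_struct *vma,
						   struct page *page)
{
	/* Both helpers are constant-false stubs when SECRETMEM is disabled. */
	if (vma_is_secretmem(vma) || page_is_secretmem(page))
		return NULL;
	get_page(page);
	return page;
}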
diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
index fe1aa4e54680..7b4b6626032d 100644
--- a/include/linux/set_memory.h
+++ b/include/linux/set_memory.h
@@ -15,11 +15,11 @@ static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
#endif
#ifndef CONFIG_ARCH_HAS_SET_DIRECT_MAP
-static inline int set_direct_map_invalid_noflush(struct page *page)
+static inline int set_direct_map_invalid_noflush(struct page *page, int numpages)
{
return 0;
}
-static inline int set_direct_map_default_noflush(struct page *page)
+static inline int set_direct_map_default_noflush(struct page *page, int numpages)
{
return 0;
}
@@ -28,7 +28,19 @@ static inline bool kernel_page_present(struct page *page)
{
return true;
}
+#else /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */
+/*
+ * Some architectures, e.g. ARM64, can disable direct map modifications at
+ * boot time. Let them override this query.
+ */
+#ifndef can_set_direct_map
+static inline bool can_set_direct_map(void)
+{
+ return true;
+}
+#define can_set_direct_map can_set_direct_map
#endif
+#endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */
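set_direct_map_{invalid,default}_noflush() now operate on a run of pages, and architectures that may freeze the direct map at boot can override can_set_direct_map(). A hedged sketch of how a caller such as secretmem or hibernation might combine the two; the error handling is illustrative only:

static int unmap_from_direct_map_sketch(struct page *page, int numpages)
{
	if (!can_set_direct_map())
		return 0;	/* direct map is fixed; nothing to do */

	return set_direct_map_invalid_noflush(page, numpages);
}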
#ifndef set_mce_nospec
static inline int set_mce_nospec(unsigned long pfn, bool unmap)
diff --git a/include/linux/string.h b/include/linux/string.h
index 1cd63a8a23ab..4fcfb56abcf5 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -267,7 +267,7 @@ void __write_overflow(void) __compiletime_error("detected write beyond size of o
#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr);
extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp);
extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 7c4d99d1f74e..fb040fc0f940 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -362,6 +362,11 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
int maxevents, int timeout,
const sigset_t __user *sigmask,
size_t sigsetsize);
+asmlinkage long sys_epoll_pwait2(int epfd, struct epoll_event __user *events,
+ int maxevents,
+ const struct __kernel_timespec __user *timeout,
+ const sigset_t __user *sigmask,
+ size_t sigsetsize);
/* fs/fcntl.c */
asmlinkage long sys_dup(unsigned int fildes);
@@ -1010,6 +1015,7 @@ asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
asmlinkage long sys_pidfd_getfd(int pidfd, int fd, unsigned int flags);
asmlinkage long sys_watch_mount(int dfd, const char __user *path,
unsigned int at_flags, int watch_fd, int watch_id);
+asmlinkage long sys_memfd_secret(unsigned long flags);
/*
* Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 52c39a1daecf..9344654a2bb1 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -861,9 +861,15 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2)
__SYSCALL(__NR_process_madvise, sys_process_madvise)
#define __NR_watch_mount 441
__SYSCALL(__NR_watch_mount, sys_watch_mount)
+#define __NR_epoll_pwait2 442
+__SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2)
+#ifdef __ARCH_WANT_MEMFD_SECRET
+#define __NR_memfd_secret 443
+__SYSCALL(__NR_memfd_secret, sys_memfd_secret)
+#endif
#undef __NR_syscalls
-#define __NR_syscalls 442
+#define __NR_syscalls 444
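The table above reserves 442 for epoll_pwait2 and, behind __ARCH_WANT_MEMFD_SECRET, 443 for memfd_secret. Until libc grows wrappers, userspace would reach them through syscall(2); a sketch assuming the asm-generic numbering, since other architectures may wire different numbers:

#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_memfd_secret
#define __NR_memfd_secret 443	/* asm-generic number from this patch */
#endif

static long memfd_secret_sketch(unsigned long flags)
{
	return syscall(__NR_memfd_secret, flags);
}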
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index f3956fc11de6..35687dcb1a42 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -97,5 +97,6 @@
#define DEVMEM_MAGIC 0x454d444d /* "DMEM" */
#define Z3FOLD_MAGIC 0x33
#define PPC_CMM_MAGIC 0xc7571590
+#define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
#endif /* __LINUX_MAGIC_H__ */
diff --git a/init/init_task.c b/init/init_task.c
index 15f6eb93a04f..8a992d73e6fb 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -176,7 +176,7 @@ struct task_struct init_task
.numa_group = NULL,
.numa_faults = NULL,
#endif
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
.kasan_depth = 1,
#endif
#ifdef CONFIG_KCSAN
diff --git a/kernel/fork.c b/kernel/fork.c
index 41906a52a764..5a4fbd75e9b2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -77,6 +77,7 @@
#include <linux/random.h>
#include <linux/tty.h>
#include <linux/blkdev.h>
+#include <linux/highmem.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
@@ -225,8 +226,8 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!s)
continue;
- /* Clear the KASAN shadow of the stack. */
- kasan_unpoison_shadow(s->addr, THREAD_SIZE);
+ /* Mark stack accessible for KASAN. */
+ kasan_unpoison_range(s->addr, THREAD_SIZE);
/* Clear stale pointers from reused stack. */
memset(s->addr, 0, THREAD_SIZE);
@@ -932,7 +933,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
account_kernel_stack(tsk, 1);
kcov_task_init(tsk);
- kmap_local_fork(tsk);
+// kmap_local_fork(tsk);
#ifdef CONFIG_FAULT_INJECTION
tsk->fail_nth = 0;
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 3994a217bde7..465f6cfc317c 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -814,7 +814,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
} \
EXPORT_SYMBOL(__tsan_read##size); \
void __tsan_unaligned_read##size(void *ptr) \
- __alias(__tsan_read##size); \
+ __alias("__tsan_read" #size); \
EXPORT_SYMBOL(__tsan_unaligned_read##size); \
void __tsan_write##size(void *ptr); \
void __tsan_write##size(void *ptr) \
@@ -823,7 +823,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
} \
EXPORT_SYMBOL(__tsan_write##size); \
void __tsan_unaligned_write##size(void *ptr) \
- __alias(__tsan_write##size); \
+ __alias("__tsan_write" #size); \
EXPORT_SYMBOL(__tsan_unaligned_write##size); \
void __tsan_read_write##size(void *ptr); \
void __tsan_read_write##size(void *ptr) \
@@ -833,7 +833,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
} \
EXPORT_SYMBOL(__tsan_read_write##size); \
void __tsan_unaligned_read_write##size(void *ptr) \
- __alias(__tsan_read_write##size); \
+ __alias("__tsan_read_write" #size); \
EXPORT_SYMBOL(__tsan_unaligned_read_write##size)
DEFINE_TSAN_READ_WRITE(1);
@@ -877,7 +877,7 @@ EXPORT_SYMBOL(__tsan_write_range);
} \
EXPORT_SYMBOL(__tsan_volatile_read##size); \
void __tsan_unaligned_volatile_read##size(void *ptr) \
- __alias(__tsan_volatile_read##size); \
+ __alias("__tsan_volatile_read" #size); \
EXPORT_SYMBOL(__tsan_unaligned_volatile_read##size); \
void __tsan_volatile_write##size(void *ptr); \
void __tsan_volatile_write##size(void *ptr) \
@@ -892,7 +892,7 @@ EXPORT_SYMBOL(__tsan_write_range);
} \
EXPORT_SYMBOL(__tsan_volatile_write##size); \
void __tsan_unaligned_volatile_write##size(void *ptr) \
- __alias(__tsan_volatile_write##size); \
+ __alias("__tsan_volatile_write" #size); \
EXPORT_SYMBOL(__tsan_unaligned_volatile_write##size)
DEFINE_TSAN_VOLATILE_READ_WRITE(1);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index da0b41914177..559acef3fddb 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -31,6 +31,7 @@
#include <linux/genhd.h>
#include <linux/ktime.h>
#include <linux/security.h>
+#include <linux/secretmem.h>
#include <trace/events/power.h>
#include "power.h"
@@ -81,7 +82,9 @@ void hibernate_release(void)
bool hibernation_available(void)
{
- return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION);
+ return nohibernate == 0 &&
+ !security_locked_down(LOCKDOWN_HIBERNATION) &&
+ !secretmem_active();
}
/**
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d63560e1cf87..64b7aab9aee4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -86,7 +86,7 @@ static inline void hibernate_restore_unprotect_page(void *page_address) {}
static inline void hibernate_map_page(struct page *page)
{
if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) {
- int ret = set_direct_map_default_noflush(page);
+ int ret = set_direct_map_default_noflush(page, 1);
if (ret)
pr_warn_once("Failed to remap page\n");
@@ -99,7 +99,7 @@ static inline void hibernate_unmap_page(struct page *page)
{
if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) {
unsigned long addr = (unsigned long)page_address(page);
- int ret = set_direct_map_invalid_noflush(page);
+ int ret = set_direct_map_invalid_noflush(page, 1);
if (ret)
pr_warn_once("Failed to remap page\n");
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 21b548b69455..fca7d323dde4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4094,16 +4094,16 @@ static inline void finish_lock_switch(struct rq *rq)
static inline void kmap_local_sched_out(void)
{
#ifdef CONFIG_KMAP_LOCAL
- if (unlikely(current->kmap_ctrl.idx))
- __kmap_local_sched_out();
+// if (unlikely(current->kmap_ctrl.idx))
+// __kmap_local_sched_out();
#endif
}
static inline void kmap_local_sched_in(void)
{
#ifdef CONFIG_KMAP_LOCAL
- if (unlikely(current->kmap_ctrl.idx))
- __kmap_local_sched_in();
+// if (unlikely(current->kmap_ctrl.idx))
+// __kmap_local_sched_in();
#endif
}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 2dd6cbb8cabc..869aa6b5bf34 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -68,6 +68,8 @@ COND_SYSCALL(epoll_create1);
COND_SYSCALL(epoll_ctl);
COND_SYSCALL(epoll_pwait);
COND_SYSCALL_COMPAT(epoll_pwait);
+COND_SYSCALL(epoll_pwait2);
+COND_SYSCALL_COMPAT(epoll_pwait2);
/* fs/fcntl.c */
@@ -353,6 +355,8 @@ COND_SYSCALL(pkey_mprotect);
COND_SYSCALL(pkey_alloc);
COND_SYSCALL(pkey_free);
+/* memfd_secret */
+COND_SYSCALL(memfd_secret);
/*
* Architecture specific weak syscall entries.
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index 8fb097057fec..f5fa4ba126bf 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -6,7 +6,10 @@ config HAVE_ARCH_KASAN
config HAVE_ARCH_KASAN_SW_TAGS
bool
-config HAVE_ARCH_KASAN_VMALLOC
+config HAVE_ARCH_KASAN_HW_TAGS
+ bool
+
+config HAVE_ARCH_KASAN_VMALLOC
bool
config CC_HAS_KASAN_GENERIC
@@ -15,15 +18,20 @@ config CC_HAS_KASAN_GENERIC
config CC_HAS_KASAN_SW_TAGS
def_bool $(cc-option, -fsanitize=kernel-hwaddress)
+# This option is only required for software KASAN modes.
+# Old GCC versions don't have proper support for no_sanitize_address.
+# See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89124 for details.
config CC_HAS_WORKING_NOSANITIZE_ADDRESS
def_bool !CC_IS_GCC || GCC_VERSION >= 80300
menuconfig KASAN
bool "KASAN: runtime memory debugger"
- depends on (HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC) || \
- (HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS)
+ depends on (((HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC) || \
+ (HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS)) && \
+ CC_HAS_WORKING_NOSANITIZE_ADDRESS) || \
+ HAVE_ARCH_KASAN_HW_TAGS
depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB)
- depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS
+ select STACKDEPOT
help
Enables KASAN (KernelAddressSANitizer) - runtime memory debugger,
designed to find out-of-bounds accesses and use-after-free bugs.
@@ -35,21 +43,24 @@ choice
prompt "KASAN mode"
default KASAN_GENERIC
help
- KASAN has two modes: generic KASAN (similar to userspace ASan,
- x86_64/arm64/xtensa, enabled with CONFIG_KASAN_GENERIC) and
- software tag-based KASAN (a version based on software memory
- tagging, arm64 only, similar to userspace HWASan, enabled with
- CONFIG_KASAN_SW_TAGS).
+ KASAN has three modes:
+ 1. generic KASAN (similar to userspace ASan,
+ x86_64/arm64/xtensa, enabled with CONFIG_KASAN_GENERIC),
+ 2. software tag-based KASAN (arm64 only, based on software
+ memory tagging (similar to userspace HWASan), enabled with
+ CONFIG_KASAN_SW_TAGS), and
+ 3. hardware tag-based KASAN (arm64 only, based on hardware
+ memory tagging, enabled with CONFIG_KASAN_HW_TAGS).
+
+ All KASAN modes are strictly debugging features.
- Both generic and tag-based KASAN are strictly debugging features.
+ For better error reports enable CONFIG_STACKTRACE.
config KASAN_GENERIC
bool "Generic mode"
depends on HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC
- depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB)
select SLUB_DEBUG if SLUB
select CONSTRUCTORS
- select STACKDEPOT
help
Enables generic KASAN mode.
@@ -62,23 +73,22 @@ config KASAN_GENERIC
and introduces an overhead of ~x1.5 for the rest of the allocations.
The performance slowdown is ~x3.
- For better error detection enable CONFIG_STACKTRACE.
-
Currently CONFIG_KASAN_GENERIC doesn't work with CONFIG_DEBUG_SLAB
(the resulting kernel does not boot).
config KASAN_SW_TAGS
bool "Software tag-based mode"
depends on HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS
- depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB)
select SLUB_DEBUG if SLUB
select CONSTRUCTORS
- select STACKDEPOT
help
Enables software tag-based KASAN mode.
- This mode requires Top Byte Ignore support by the CPU and therefore
- is only supported for arm64. This mode requires Clang.
+	  This mode requires software memory tagging support in the form of
+ HWASan-like compiler instrumentation.
+
+ Currently this mode is only implemented for arm64 CPUs and relies on
+ Top Byte Ignore. This mode requires Clang.
This mode consumes about 1/16th of available memory at kernel start
and introduces an overhead of ~20% for the rest of the allocations.
@@ -86,15 +96,27 @@ config KASAN_SW_TAGS
casting and comparison, as it embeds tags into the top byte of each
pointer.
- For better error detection enable CONFIG_STACKTRACE.
-
Currently CONFIG_KASAN_SW_TAGS doesn't work with CONFIG_DEBUG_SLAB
(the resulting kernel does not boot).
+config KASAN_HW_TAGS
+ bool "Hardware tag-based mode"
+ depends on HAVE_ARCH_KASAN_HW_TAGS
+ depends on SLUB
+ help
+ Enables hardware tag-based KASAN mode.
+
+ This mode requires hardware memory tagging support, and can be used
+ by any architecture that provides it.
+
+ Currently this mode is only implemented for arm64 CPUs starting from
+ ARMv8.5 and relies on Memory Tagging Extension and Top Byte Ignore.
+
endchoice
choice
prompt "Instrumentation type"
+ depends on KASAN_GENERIC || KASAN_SW_TAGS
default KASAN_OUTLINE
config KASAN_OUTLINE
@@ -118,6 +140,7 @@ endchoice
config KASAN_STACK_ENABLE
bool "Enable stack instrumentation (unsafe)" if CC_IS_CLANG && !COMPILE_TEST
+ depends on KASAN_GENERIC || KASAN_SW_TAGS
help
 	  The LLVM stack address sanitizer has a known problem that
causes excessive stack usage in a lot of functions, see
@@ -146,7 +169,7 @@ config KASAN_SW_TAGS_IDENTIFY
config KASAN_VMALLOC
bool "Back mappings in vmalloc space with real shadow memory"
- depends on HAVE_ARCH_KASAN_VMALLOC
+ depends on KASAN_GENERIC && HAVE_ARCH_KASAN_VMALLOC
help
By default, the shadow region for vmalloc space is the read-only
zero page. This means that KASAN cannot detect errors involving
diff --git a/lib/crc32.c b/lib/crc32.c
index 2a68dfd3b96c..373a17aaa432 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -206,8 +206,8 @@ u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len)
EXPORT_SYMBOL(crc32_le);
EXPORT_SYMBOL(__crc32c_le);
-u32 __pure crc32_le_base(u32, unsigned char const *, size_t) __alias(crc32_le);
-u32 __pure __crc32c_le_base(u32, unsigned char const *, size_t) __alias(__crc32c_le);
+u32 __pure crc32_le_base(u32, unsigned char const *, size_t) __alias("crc32_le");
+u32 __pure __crc32c_le_base(u32, unsigned char const *, size_t) __alias("__crc32c_le");
/*
* This multiplies the polynomials x and y modulo the given modulus.
diff --git a/lib/crypto/aes.c b/lib/crypto/aes.c
index 827fe89922ff..5b80514595c2 100644
--- a/lib/crypto/aes.c
+++ b/lib/crypto/aes.c
@@ -82,8 +82,8 @@ static volatile const u8 __cacheline_aligned aes_inv_sbox[] = {
0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d,
};
-extern const u8 crypto_aes_sbox[256] __alias(aes_sbox);
-extern const u8 crypto_aes_inv_sbox[256] __alias(aes_inv_sbox);
+extern const u8 crypto_aes_sbox[256] __alias("aes_sbox");
+extern const u8 crypto_aes_inv_sbox[256] __alias("aes_inv_sbox");
EXPORT_SYMBOL(crypto_aes_sbox);
EXPORT_SYMBOL(crypto_aes_inv_sbox);
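The crc32 and AES hunks switch __alias() users to quoted symbol names, which implies the macro now passes a string literal straight to the compiler's alias attribute instead of stringifying its argument. A standalone illustration of that attribute, independent of the kernel macro (the my_alias name is made up for the example):

#define my_alias(name) __attribute__((__alias__(name)))

int real_impl(int x) { return x + 1; }
/* alias_impl resolves to the very same code as real_impl. */
int alias_impl(int x) my_alias("real_impl");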
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index 662f862702fc..2947274cc2d3 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -25,7 +25,7 @@
#include "../mm/kasan/kasan.h"
-#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_SHADOW_SCALE_SIZE)
+#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE)
/*
* We assign some test results to these globals to make sure the tests
diff --git a/lib/test_kasan_module.c b/lib/test_kasan_module.c
index 62a87854b120..3b4cc77992d2 100644
--- a/lib/test_kasan_module.c
+++ b/lib/test_kasan_module.c
@@ -15,7 +15,7 @@
#include "../mm/kasan/kasan.h"
-#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_SHADOW_SCALE_SIZE)
+#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE)
static noinline void __init copy_user_test(void)
{
diff --git a/mm/Kconfig b/mm/Kconfig
index 4275c25b5d8a..7af1a55b708e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -875,4 +875,9 @@ config MAPPING_DIRTY_HELPERS
config KMAP_LOCAL
bool
+config SECRETMEM
+ def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
+ select GENERIC_ALLOCATOR
+ select CMA
+
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index a1af02ba8f3f..6b581f8337e8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -121,3 +121,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
+obj-$(CONFIG_SECRETMEM) += secretmem.o
diff --git a/mm/filemap.c b/mm/filemap.c
index 9c28c3f3a8a3..13ef23f72330 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,6 +42,7 @@
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
+#include <linux/secretmem.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -844,7 +845,7 @@ noinline int __add_to_page_cache_locked(struct page *page,
page->mapping = mapping;
page->index = offset;
- if (!huge) {
+ if (!huge && !page_is_secretmem(page)) {
error = mem_cgroup_charge(page, current->mm, gfp);
if (error)
goto error;
@@ -1359,7 +1360,7 @@ static int __wait_on_page_locked_async(struct page *page,
else
ret = PageLocked(page);
/*
- * If we were succesful now, we know we're still on the
+ * If we were successful now, we know we're still on the
* waitqueue as we're still under the lock. This means it's
* safe to remove and return success, we know the callback
* isn't going to trigger.
diff --git a/mm/gup.c b/mm/gup.c
index e4c224cd9661..4f3cf14ac958 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -10,6 +10,7 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/secretmem.h>
#include <linux/sched/signal.h>
#include <linux/rwsem.h>
@@ -759,6 +760,9 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
struct follow_page_context ctx = { NULL };
struct page *page;
+ if (vma_is_secretmem(vma))
+ return NULL;
+
page = follow_page_mask(vma, address, foll_flags, &ctx);
if (ctx.pgmap)
put_dev_pagemap(ctx.pgmap);
@@ -892,6 +896,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
return -EOPNOTSUPP;
+ if (vma_is_secretmem(vma))
+ return -EFAULT;
+
if (write) {
if (!(vm_flags & VM_WRITE)) {
if (!(gup_flags & FOLL_FORCE))
@@ -996,6 +1003,8 @@ static long __get_user_pages(struct mm_struct *mm,
struct vm_area_struct *vma = NULL;
struct follow_page_context ctx = { NULL };
+ mmap_assert_locked(mm);
+
if (!nr_pages)
return 0;
@@ -2031,6 +2040,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
+ if (page_is_secretmem(page))
+ goto pte_unmap;
+
head = try_grab_compound_head(page, 1, flags);
if (!head)
goto pte_unmap;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f6067ca17bd1..10dd3cae5f53 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2393,7 +2393,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
* Clone page flags before unfreezing refcount.
*
* After successful get_page_unless_zero() might follow flags change,
- * for exmaple lock_page() which set PG_waiters.
+ * for example lock_page() which set PG_waiters.
*/
page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (head->flags &
diff --git a/mm/internal.h b/mm/internal.h
index 9902648f2206..8e9c660f33ca 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -353,6 +353,9 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
extern void mlock_vma_page(struct page *page);
extern unsigned int munlock_vma_page(struct page *page);
+extern int mlock_future_check(struct mm_struct *mm, unsigned long flags,
+ unsigned long len);
+
/*
* Clear the page's PageMlocked(). This can be useful in a situation where
* we want to unconditionally remove a page from the pagecache -- e.g.,
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 370d970e5ab5..9fe39a66388a 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -6,12 +6,15 @@ KCOV_INSTRUMENT := n
# Disable ftrace to avoid recursion.
CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_generic.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_generic_report.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_quarantine.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_tags.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_tags_report.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_report_generic.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_report_hw_tags.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_report_sw_tags.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_shadow.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_hw_tags.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_sw_tags.o = $(CC_FLAGS_FTRACE)
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
@@ -22,13 +25,17 @@ CC_FLAGS_KASAN_RUNTIME += -DDISABLE_BRANCH_PROFILING
CFLAGS_common.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_generic.o := $(CC_FLAGS_KASAN_RUNTIME)
-CFLAGS_generic_report.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_init.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_quarantine.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_report.o := $(CC_FLAGS_KASAN_RUNTIME)
-CFLAGS_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
-CFLAGS_tags_report.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_report_generic.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_report_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_report_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
-obj-$(CONFIG_KASAN) := common.o init.o report.o
-obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o
-obj-$(CONFIG_KASAN_SW_TAGS) += tags.o tags_report.o
+obj-$(CONFIG_KASAN) := common.o report.o
+obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o
+obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o
+obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index de92da1b637a..da79f340f3a6 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -1,17 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This file contains common generic and tag-based KASAN code.
+ * This file contains common KASAN code.
*
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
*
* Some code borrowed from https://github.com/xairy/kasan-prototype by
* Andrey Konovalov <andreyknvl@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#include <linux/export.h>
@@ -19,7 +14,6 @@
#include <linux/kasan.h>
#include <linux/kernel.h>
#include <linux/kfence.h>
-#include <linux/kmemleak.h>
#include <linux/linkage.h>
#include <linux/memblock.h>
#include <linux/memory.h>
@@ -32,12 +26,8 @@
#include <linux/stacktrace.h>
#include <linux/string.h>
#include <linux/types.h>
-#include <linux/vmalloc.h>
#include <linux/bug.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-
#include "kasan.h"
#include "../slab.h"
@@ -57,6 +47,7 @@ void kasan_set_track(struct kasan_track *track, gfp_t flags)
track->stack = kasan_save_stack(flags);
}
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
void kasan_enable_current(void)
{
current->kasan_depth++;
@@ -66,118 +57,20 @@ void kasan_disable_current(void)
{
current->kasan_depth--;
}
+#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
-bool __kasan_check_read(const volatile void *p, unsigned int size)
-{
- return check_memory_region((unsigned long)p, size, false, _RET_IP_);
-}
-EXPORT_SYMBOL(__kasan_check_read);
-
-bool __kasan_check_write(const volatile void *p, unsigned int size)
-{
- return check_memory_region((unsigned long)p, size, true, _RET_IP_);
-}
-EXPORT_SYMBOL(__kasan_check_write);
-
-#undef memset
-void *memset(void *addr, int c, size_t len)
-{
- if (!check_memory_region((unsigned long)addr, len, true, _RET_IP_))
- return NULL;
-
- return __memset(addr, c, len);
-}
-
-#ifdef __HAVE_ARCH_MEMMOVE
-#undef memmove
-void *memmove(void *dest, const void *src, size_t len)
+void __kasan_unpoison_range(const void *address, size_t size)
{
- if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
- !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
- return NULL;
-
- return __memmove(dest, src, len);
-}
-#endif
-
-#undef memcpy
-void *memcpy(void *dest, const void *src, size_t len)
-{
- if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
- !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
- return NULL;
-
- return __memcpy(dest, src, len);
-}
-
-/*
- * Poisons the shadow memory for 'size' bytes starting from 'addr'.
- * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
- */
-void kasan_poison_shadow(const void *address, size_t size, u8 value)
-{
- void *shadow_start, *shadow_end;
-
- /*
- * Perform shadow offset calculation based on untagged address, as
- * some of the callers (e.g. kasan_poison_object_data) pass tagged
- * addresses to this function.
- */
- address = reset_tag(address);
-
- /* Skip KFENCE memory if called explicitly outside of sl*b. */
- if (is_kfence_address(address))
- return;
-
- shadow_start = kasan_mem_to_shadow(address);
- shadow_end = kasan_mem_to_shadow(address + size);
-
- __memset(shadow_start, value, shadow_end - shadow_start);
-}
-
-void kasan_unpoison_shadow(const void *address, size_t size)
-{
- u8 tag = get_tag(address);
-
- /*
- * Perform shadow offset calculation based on untagged address, as
- * some of the callers (e.g. kasan_unpoison_object_data) pass tagged
- * addresses to this function.
- */
- address = reset_tag(address);
-
- /*
- * Skip KFENCE memory if called explicitly outside of sl*b. Also note
- * that calls to ksize(), where size is not a multiple of machine-word
- * size, would otherwise poison the invalid portion of the word.
- */
- if (is_kfence_address(address))
- return;
-
- kasan_poison_shadow(address, size, tag);
-
- if (size & KASAN_SHADOW_MASK) {
- u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
-
- if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- *shadow = tag;
- else
- *shadow = size & KASAN_SHADOW_MASK;
- }
-}
-
-static void __kasan_unpoison_stack(struct task_struct *task, const void *sp)
-{
- void *base = task_stack_page(task);
- size_t size = sp - base;
-
- kasan_unpoison_shadow(base, size);
+ unpoison_range(address, size);
}
+#if CONFIG_KASAN_STACK
/* Unpoison the entire stack for a task. */
void kasan_unpoison_task_stack(struct task_struct *task)
{
- __kasan_unpoison_stack(task, task_stack_page(task) + THREAD_SIZE);
+ void *base = task_stack_page(task);
+
+ unpoison_range(base, THREAD_SIZE);
}
/* Unpoison the stack for the current task beyond a watermark sp value. */
@@ -190,10 +83,22 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
*/
void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1));
- kasan_unpoison_shadow(base, watermark - base);
+ unpoison_range(base, watermark - base);
+}
+#endif /* CONFIG_KASAN_STACK */
+
+/*
+ * Only allow cache merging when stack collection is disabled and no metadata
+ * is present.
+ */
+slab_flags_t __kasan_never_merge(void)
+{
+ if (kasan_stack_collection_enabled())
+ return SLAB_KASAN;
+ return 0;
}
-void kasan_alloc_pages(struct page *page, unsigned int order)
+void __kasan_alloc_pages(struct page *page, unsigned int order)
{
u8 tag;
unsigned long i;
@@ -204,13 +109,13 @@ void kasan_alloc_pages(struct page *page, unsigned int order)
tag = random_tag();
for (i = 0; i < (1 << order); i++)
page_kasan_tag_set(page + i, tag);
- kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
+ unpoison_range(page_address(page), PAGE_SIZE << order);
}
-void kasan_free_pages(struct page *page, unsigned int order)
+void __kasan_free_pages(struct page *page, unsigned int order)
{
if (likely(!PageHighMem(page)))
- kasan_poison_shadow(page_address(page),
+ poison_range(page_address(page),
PAGE_SIZE << order,
KASAN_FREE_PAGE);
}
@@ -221,9 +126,6 @@ void kasan_free_pages(struct page *page, unsigned int order)
*/
static inline unsigned int optimal_redzone(unsigned int object_size)
{
- if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- return 0;
-
return
object_size <= 64 - 16 ? 16 :
object_size <= 128 - 32 ? 32 :
@@ -234,88 +136,129 @@ static inline unsigned int optimal_redzone(unsigned int object_size)
object_size <= (1 << 16) - 1024 ? 1024 : 2048;
}
-void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
- slab_flags_t *flags)
+void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
+ slab_flags_t *flags)
{
- unsigned int orig_size = *size;
- unsigned int redzone_size;
- int redzone_adjust;
+ unsigned int ok_size;
+ unsigned int optimal_size;
- /* Add alloc meta. */
- cache->kasan_info.alloc_meta_offset = *size;
- *size += sizeof(struct kasan_alloc_meta);
+ /*
+ * SLAB_KASAN is used to mark caches as ones that are sanitized by
+ * KASAN. Currently this flag is used in two places:
+ * 1. In slab_ksize() when calculating the size of the accessible
+ * memory within the object.
+ * 2. In slab_common.c to prevent merging of sanitized caches.
+ */
+ *flags |= SLAB_KASAN;
- /* Add free meta. */
- if (IS_ENABLED(CONFIG_KASAN_GENERIC) &&
- (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor ||
- cache->object_size < sizeof(struct kasan_free_meta))) {
- cache->kasan_info.free_meta_offset = *size;
- *size += sizeof(struct kasan_free_meta);
- }
+ if (!kasan_stack_collection_enabled())
+ return;
- redzone_size = optimal_redzone(cache->object_size);
- redzone_adjust = redzone_size - (*size - cache->object_size);
- if (redzone_adjust > 0)
- *size += redzone_adjust;
+ ok_size = *size;
- *size = min_t(unsigned int, KMALLOC_MAX_SIZE,
- max(*size, cache->object_size + redzone_size));
+ /* Add alloc meta into redzone. */
+ cache->kasan_info.alloc_meta_offset = *size;
+ *size += sizeof(struct kasan_alloc_meta);
/*
- * If the metadata doesn't fit, don't enable KASAN at all.
+ * If alloc meta doesn't fit, don't add it.
+ * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal
+ * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for
+ * larger sizes.
*/
- if (*size <= cache->kasan_info.alloc_meta_offset ||
- *size <= cache->kasan_info.free_meta_offset) {
+ if (*size > KMALLOC_MAX_SIZE) {
cache->kasan_info.alloc_meta_offset = 0;
- cache->kasan_info.free_meta_offset = 0;
- *size = orig_size;
+ *size = ok_size;
+ /* Continue, since free meta might still fit. */
+ }
+
+ /* Only the generic mode uses free meta or flexible redzones. */
+ if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) {
+ cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META;
return;
}
- *flags |= SLAB_KASAN;
+ /*
+ * Add free meta into redzone when it's not possible to store
+ * it in the object. This is the case when:
+ * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can
+ * be touched after it was freed, or
+ * 2. Object has a constructor, which means it's expected to
+ * retain its content until the next allocation, or
+ * 3. Object is too small.
+ * Otherwise cache->kasan_info.free_meta_offset = 0 is implied.
+ */
+ if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor ||
+ cache->object_size < sizeof(struct kasan_free_meta)) {
+ ok_size = *size;
+
+ cache->kasan_info.free_meta_offset = *size;
+ *size += sizeof(struct kasan_free_meta);
+
+ /* If free meta doesn't fit, don't add it. */
+ if (*size > KMALLOC_MAX_SIZE) {
+ cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META;
+ *size = ok_size;
+ }
+ }
+
+ /* Calculate size with optimal redzone. */
+ optimal_size = cache->object_size + optimal_redzone(cache->object_size);
+ /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */
+ if (optimal_size > KMALLOC_MAX_SIZE)
+ optimal_size = KMALLOC_MAX_SIZE;
+ /* Use optimal size if the size with added metas is not large enough. */
+ if (*size < optimal_size)
+ *size = optimal_size;
}
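To make the new layout logic above concrete, here is a small worked example; the numbers are assumed, not taken from the patch itself:

/*
 * Worked example (assumed): object_size = 96, incoming *size = 96,
 * generic KASAN, stack collection enabled.
 *  - alloc meta is placed at offset 96 and *size grows by
 *    sizeof(struct kasan_alloc_meta);
 *  - the cache has no constructor, is not SLAB_TYPESAFE_BY_RCU and the
 *    object is larger than struct kasan_free_meta, so free meta stays
 *    inside the freed object and free_meta_offset remains 0;
 *  - optimal_redzone(96) is 32, so optimal_size is 128; *size is only
 *    raised to 128 if the added metadata did not already push it past that.
 */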
-size_t kasan_metadata_size(struct kmem_cache *cache)
+size_t __kasan_metadata_size(struct kmem_cache *cache)
{
+ if (!kasan_stack_collection_enabled())
+ return 0;
return (cache->kasan_info.alloc_meta_offset ?
sizeof(struct kasan_alloc_meta) : 0) +
(cache->kasan_info.free_meta_offset ?
sizeof(struct kasan_free_meta) : 0);
}
-struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
- const void *object)
+struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
+ const void *object)
{
- return (void *)object + cache->kasan_info.alloc_meta_offset;
+ if (!cache->kasan_info.alloc_meta_offset)
+ return NULL;
+ return kasan_reset_tag(object) + cache->kasan_info.alloc_meta_offset;
}
-struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
- const void *object)
+#ifdef CONFIG_KASAN_GENERIC
+struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
+ const void *object)
{
BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
- return (void *)object + cache->kasan_info.free_meta_offset;
+ if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META)
+ return NULL;
+ return kasan_reset_tag(object) + cache->kasan_info.free_meta_offset;
}
+#endif
-void kasan_poison_slab(struct page *page)
+void __kasan_poison_slab(struct page *page)
{
unsigned long i;
for (i = 0; i < compound_nr(page); i++)
page_kasan_tag_reset(page + i);
- kasan_poison_shadow(page_address(page), page_size(page),
- KASAN_KMALLOC_REDZONE);
+ poison_range(page_address(page), page_size(page),
+ KASAN_KMALLOC_REDZONE);
}
-void kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
+void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
{
- kasan_unpoison_shadow(object, cache->object_size);
+ unpoison_range(object, cache->object_size);
}
-void kasan_poison_object_data(struct kmem_cache *cache, void *object)
+void __kasan_poison_object_data(struct kmem_cache *cache, void *object)
{
- kasan_poison_shadow(object,
- round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
- KASAN_KMALLOC_REDZONE);
+ poison_range(object, cache->object_size, KASAN_KMALLOC_REDZONE);
}
/*
@@ -335,6 +278,9 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
static u8 assign_tag(struct kmem_cache *cache, const void *object,
bool init, bool keep_tag)
{
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ return 0xff;
+
/*
* 1. When an object is kmalloc()'ed, two hooks are called:
* kasan_slab_alloc() and kasan_kmalloc(). We assign the
@@ -364,50 +310,32 @@ static u8 assign_tag(struct kmem_cache *cache, const void *object,
#endif
}
-void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
+void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
const void *object)
{
- struct kasan_alloc_meta *alloc_info;
-
- if (!(cache->flags & SLAB_KASAN))
- return (void *)object;
+ struct kasan_alloc_meta *alloc_meta;
- alloc_info = get_alloc_info(cache, object);
- __memset(alloc_info, 0, sizeof(*alloc_info));
+ if (kasan_stack_collection_enabled()) {
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (alloc_meta)
+ __memset(alloc_meta, 0, sizeof(*alloc_meta));
+ }
- if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- object = set_tag(object,
- assign_tag(cache, object, true, false));
+ /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */
+ object = set_tag(object, assign_tag(cache, object, true, false));
return (void *)object;
}
-static inline bool shadow_invalid(u8 tag, s8 shadow_byte)
-{
- if (IS_ENABLED(CONFIG_KASAN_GENERIC))
- return shadow_byte < 0 ||
- shadow_byte >= KASAN_SHADOW_SCALE_SIZE;
-
- /* else CONFIG_KASAN_SW_TAGS: */
- if ((u8)shadow_byte == KASAN_TAG_INVALID)
- return true;
- if ((tag != KASAN_TAG_KERNEL) && (tag != (u8)shadow_byte))
- return true;
-
- return false;
-}
-
-static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
+static bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
unsigned long ip, bool quarantine)
{
- s8 shadow_byte;
u8 tag;
void *tagged_object;
- unsigned long rounded_up_size;
tag = get_tag(object);
tagged_object = object;
- object = reset_tag(object);
+ object = kasan_reset_tag(object);
if (is_kfence_address(object))
return false;
@@ -422,37 +350,67 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
return false;
- shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
- if (shadow_invalid(tag, shadow_byte)) {
+ if (check_invalid_free(tagged_object)) {
kasan_report_invalid_free(tagged_object, ip);
return true;
}
- rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE);
- kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
+ poison_range(object, cache->object_size, KASAN_KMALLOC_FREE);
- if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine) ||
- unlikely(!(cache->flags & SLAB_KASAN)))
+ if (!kasan_stack_collection_enabled())
+ return false;
+
+ if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine))
return false;
kasan_set_free_info(cache, object, tag);
- quarantine_put(get_free_info(cache, object), cache);
+ return quarantine_put(cache, object);
+}
+
+bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
+{
+ return ____kasan_slab_free(cache, object, ip, true);
+}
+
+void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
+{
+ struct page *page;
+
+ page = virt_to_head_page(ptr);
- return IS_ENABLED(CONFIG_KASAN_GENERIC);
+ /*
+ * Even though this function is only called for kmem_cache_alloc and
+ * kmalloc backed mempool allocations, those allocations can still be
+ * !PageSlab() when the size provided to kmalloc is larger than
+ * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc.
+ */
+ if (unlikely(!PageSlab(page))) {
+ if (ptr != page_address(page)) {
+ kasan_report_invalid_free(ptr, ip);
+ return;
+ }
+ poison_range(ptr, page_size(page), KASAN_FREE_PAGE);
+ } else {
+ ____kasan_slab_free(page->slab_cache, ptr, ip, false);
+ }
}
-bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
+static void set_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
{
- return __kasan_slab_free(cache, object, ip, true);
+ struct kasan_alloc_meta *alloc_meta;
+
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (alloc_meta)
+ kasan_set_track(&alloc_meta->alloc_track, flags);
}
-static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
+static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object,
size_t size, gfp_t flags, bool keep_tag)
{
unsigned long redzone_start;
unsigned long redzone_end;
- u8 tag = 0xff;
+ u8 tag;
if (gfpflags_allow_blocking(flags))
quarantine_reduce();
@@ -464,38 +422,36 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
return (void *)object;
redzone_start = round_up((unsigned long)(object + size),
- KASAN_SHADOW_SCALE_SIZE);
+ KASAN_GRANULE_SIZE);
redzone_end = round_up((unsigned long)object + cache->object_size,
- KASAN_SHADOW_SCALE_SIZE);
+ KASAN_GRANULE_SIZE);
+ tag = assign_tag(cache, object, false, keep_tag);
- if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- tag = assign_tag(cache, object, false, keep_tag);
+ /* Tag is ignored in set_tag without CONFIG_KASAN_SW/HW_TAGS */
+ unpoison_range(set_tag(object, tag), size);
+ poison_range((void *)redzone_start, redzone_end - redzone_start,
+ KASAN_KMALLOC_REDZONE);
- /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */
- kasan_unpoison_shadow(set_tag(object, tag), size);
- kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
- KASAN_KMALLOC_REDZONE);
-
- if (cache->flags & SLAB_KASAN)
- kasan_set_track(&get_alloc_info(cache, object)->alloc_track, flags);
+ if (kasan_stack_collection_enabled())
+ set_alloc_info(cache, (void *)object, flags);
return set_tag(object, tag);
}
-void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object,
- gfp_t flags)
+void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
+ void *object, gfp_t flags)
{
- return __kasan_kmalloc(cache, object, cache->object_size, flags, false);
+ return ____kasan_kmalloc(cache, object, cache->object_size, flags, false);
}
-void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
- size_t size, gfp_t flags)
+void * __must_check __kasan_kmalloc(struct kmem_cache *cache, const void *object,
+ size_t size, gfp_t flags)
{
- return __kasan_kmalloc(cache, object, size, flags, true);
+ return ____kasan_kmalloc(cache, object, size, flags, true);
}
-EXPORT_SYMBOL(kasan_kmalloc);
+EXPORT_SYMBOL(__kasan_kmalloc);
-void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
+void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
gfp_t flags)
{
struct page *page;
@@ -510,17 +466,17 @@ void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
page = virt_to_page(ptr);
redzone_start = round_up((unsigned long)(ptr + size),
- KASAN_SHADOW_SCALE_SIZE);
+ KASAN_GRANULE_SIZE);
redzone_end = (unsigned long)ptr + page_size(page);
- kasan_unpoison_shadow(ptr, size);
- kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
- KASAN_PAGE_REDZONE);
+ unpoison_range(ptr, size);
+ poison_range((void *)redzone_start, redzone_end - redzone_start,
+ KASAN_PAGE_REDZONE);
return (void *)ptr;
}
-void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags)
+void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flags)
{
struct page *page;
@@ -530,421 +486,15 @@ void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags)
page = virt_to_head_page(object);
if (unlikely(!PageSlab(page)))
- return kasan_kmalloc_large(object, size, flags);
+ return __kasan_kmalloc_large(object, size, flags);
else
- return __kasan_kmalloc(page->slab_cache, object, size,
+ return ____kasan_kmalloc(page->slab_cache, object, size,
flags, true);
}
-void kasan_poison_kfree(void *ptr, unsigned long ip)
-{
- struct page *page;
-
- page = virt_to_head_page(ptr);
-
- if (unlikely(!PageSlab(page))) {
- if (ptr != page_address(page)) {
- kasan_report_invalid_free(ptr, ip);
- return;
- }
- kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE);
- } else {
- __kasan_slab_free(page->slab_cache, ptr, ip, false);
- }
-}
-
-void kasan_kfree_large(void *ptr, unsigned long ip)
+void __kasan_kfree_large(void *ptr, unsigned long ip)
{
if (ptr != page_address(virt_to_head_page(ptr)))
kasan_report_invalid_free(ptr, ip);
- /* The object will be poisoned by page_alloc. */
+ /* The object will be poisoned by kasan_free_pages(). */
}
-
-#ifndef CONFIG_KASAN_VMALLOC
-int kasan_module_alloc(void *addr, size_t size)
-{
- void *ret;
- size_t scaled_size;
- size_t shadow_size;
- unsigned long shadow_start;
-
- shadow_start = (unsigned long)kasan_mem_to_shadow(addr);
- scaled_size = (size + KASAN_SHADOW_MASK) >> KASAN_SHADOW_SCALE_SHIFT;
- shadow_size = round_up(scaled_size, PAGE_SIZE);
-
- if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
- return -EINVAL;
-
- ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
- shadow_start + shadow_size,
- GFP_KERNEL,
- PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
- __builtin_return_address(0));
-
- if (ret) {
- __memset(ret, KASAN_SHADOW_INIT, shadow_size);
- find_vm_area(addr)->flags |= VM_KASAN;
- kmemleak_ignore(ret);
- return 0;
- }
-
- return -ENOMEM;
-}
-
-void kasan_free_shadow(const struct vm_struct *vm)
-{
- if (vm->flags & VM_KASAN)
- vfree(kasan_mem_to_shadow(vm->addr));
-}
-#endif
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-static bool shadow_mapped(unsigned long addr)
-{
- pgd_t *pgd = pgd_offset_k(addr);
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- if (pgd_none(*pgd))
- return false;
- p4d = p4d_offset(pgd, addr);
- if (p4d_none(*p4d))
- return false;
- pud = pud_offset(p4d, addr);
- if (pud_none(*pud))
- return false;
-
- /*
-	 * We can't use pud_large() or pud_huge(): the former is
-	 * arch-specific and the latter depends on HUGETLB_PAGE. So let's
-	 * abuse pud_bad(): if a pud is bad, it's bad because it's huge.
- */
- if (pud_bad(*pud))
- return true;
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd))
- return false;
-
- if (pmd_bad(*pmd))
- return true;
- pte = pte_offset_kernel(pmd, addr);
- return !pte_none(*pte);
-}
-
-static int __meminit kasan_mem_notifier(struct notifier_block *nb,
- unsigned long action, void *data)
-{
- struct memory_notify *mem_data = data;
- unsigned long nr_shadow_pages, start_kaddr, shadow_start;
- unsigned long shadow_end, shadow_size;
-
- nr_shadow_pages = mem_data->nr_pages >> KASAN_SHADOW_SCALE_SHIFT;
- start_kaddr = (unsigned long)pfn_to_kaddr(mem_data->start_pfn);
- shadow_start = (unsigned long)kasan_mem_to_shadow((void *)start_kaddr);
- shadow_size = nr_shadow_pages << PAGE_SHIFT;
- shadow_end = shadow_start + shadow_size;
-
- if (WARN_ON(mem_data->nr_pages % KASAN_SHADOW_SCALE_SIZE) ||
- WARN_ON(start_kaddr % (KASAN_SHADOW_SCALE_SIZE << PAGE_SHIFT)))
- return NOTIFY_BAD;
-
- switch (action) {
- case MEM_GOING_ONLINE: {
- void *ret;
-
- /*
-		 * If the shadow is already mapped, then it must have been
-		 * mapped during boot. This can happen when we are onlining
-		 * previously offlined memory.
- */
- if (shadow_mapped(shadow_start))
- return NOTIFY_OK;
-
- ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start,
- shadow_end, GFP_KERNEL,
- PAGE_KERNEL, VM_NO_GUARD,
- pfn_to_nid(mem_data->start_pfn),
- __builtin_return_address(0));
- if (!ret)
- return NOTIFY_BAD;
-
- kmemleak_ignore(ret);
- return NOTIFY_OK;
- }
- case MEM_CANCEL_ONLINE:
- case MEM_OFFLINE: {
- struct vm_struct *vm;
-
- /*
- * shadow_start was either mapped during boot by kasan_init()
- * or during memory online by __vmalloc_node_range().
- * In the latter case we can use vfree() to free shadow.
- * Non-NULL result of the find_vm_area() will tell us if
- * that was the second case.
- *
- * Currently it's not possible to free shadow mapped
- * during boot by kasan_init(). It's because the code
- * to do that hasn't been written yet. So we'll just
- * leak the memory.
- */
- vm = find_vm_area((void *)shadow_start);
- if (vm)
- vfree((void *)shadow_start);
- }
- }
-
- return NOTIFY_OK;
-}
-
-static int __init kasan_memhotplug_init(void)
-{
- hotplug_memory_notifier(kasan_mem_notifier, 0);
-
- return 0;
-}
-
-core_initcall(kasan_memhotplug_init);
-#endif
-
-#ifdef CONFIG_KASAN_VMALLOC
-static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
- void *unused)
-{
- unsigned long page;
- pte_t pte;
-
- if (likely(!pte_none(*ptep)))
- return 0;
-
- page = __get_free_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
-
- memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
- pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
-
- spin_lock(&init_mm.page_table_lock);
- if (likely(pte_none(*ptep))) {
- set_pte_at(&init_mm, addr, ptep, pte);
- page = 0;
- }
- spin_unlock(&init_mm.page_table_lock);
- if (page)
- free_page(page);
- return 0;
-}
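The helper above follows a common "allocate optimistically, re-check under the lock" pattern: the shadow page is allocated before taking init_mm.page_table_lock and is freed again if another CPU populated the PTE first. A minimal userspace sketch of the same idea (illustration only; a pthread mutex and a plain pointer stand in for the page-table lock and the PTE):

#include <pthread.h>
#include <stdlib.h>

static void *slot;					/* stands in for the pte */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static int populate_slot(void)
{
	void *page;

	if (slot)					/* fast path, no lock taken */
		return 0;

	page = calloc(1, 4096);				/* allocate before locking */
	if (!page)
		return -1;

	pthread_mutex_lock(&lock);
	if (!slot) {					/* re-check: we may have raced */
		slot = page;
		page = NULL;
	}
	pthread_mutex_unlock(&lock);

	free(page);					/* lost the race: drop our copy */
	return 0;
}

int main(void)
{
	return populate_slot();
}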
-
-int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
-{
- unsigned long shadow_start, shadow_end;
- int ret;
-
- if (!is_vmalloc_or_module_addr((void *)addr))
- return 0;
-
- shadow_start = (unsigned long)kasan_mem_to_shadow((void *)addr);
- shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE);
- shadow_end = (unsigned long)kasan_mem_to_shadow((void *)addr + size);
- shadow_end = ALIGN(shadow_end, PAGE_SIZE);
-
- ret = apply_to_page_range(&init_mm, shadow_start,
- shadow_end - shadow_start,
- kasan_populate_vmalloc_pte, NULL);
- if (ret)
- return ret;
-
- flush_cache_vmap(shadow_start, shadow_end);
-
- /*
- * We need to be careful about inter-cpu effects here. Consider:
- *
- * CPU#0 CPU#1
- * WRITE_ONCE(p, vmalloc(100)); while (x = READ_ONCE(p)) ;
- * p[99] = 1;
- *
- * With compiler instrumentation, that ends up looking like this:
- *
- * CPU#0 CPU#1
- * // vmalloc() allocates memory
- * // let a = area->addr
- * // we reach kasan_populate_vmalloc
- * // and call kasan_unpoison_shadow:
- * STORE shadow(a), unpoison_val
- * ...
- * STORE shadow(a+99), unpoison_val x = LOAD p
- * // rest of vmalloc process <data dependency>
- * STORE p, a LOAD shadow(x+99)
- *
-	 * If there is no barrier between the end of unpoisoning the shadow
- * and the store of the result to p, the stores could be committed
- * in a different order by CPU#0, and CPU#1 could erroneously observe
- * poison in the shadow.
- *
- * We need some sort of barrier between the stores.
- *
- * In the vmalloc() case, this is provided by a smp_wmb() in
- * clear_vm_uninitialized_flag(). In the per-cpu allocator and in
- * get_vm_area() and friends, the caller gets shadow allocated but
- * doesn't have any pages mapped into the virtual address space that
- * has been reserved. Mapping those pages in will involve taking and
- * releasing a page-table lock, which will provide the barrier.
- */
-
- return 0;
-}
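The ordering requirement described in the comment above can be reduced to a small userspace sketch using C11 atomics (an illustration only, not kernel code; the release store here plays the role of the smp_wmb() mentioned for clear_vm_uninitialized_flag()):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <string.h>

static char shadow[100];		/* stands in for the shadow bytes */
static char area[100];			/* stands in for the vmalloc'ed memory */
static _Atomic(char *) p;		/* stands in for the published pointer */

static void *producer(void *arg)
{
	(void)arg;
	memset(shadow, 0, sizeof(shadow));	/* "unpoison" the shadow */
	/* Release ordering: the shadow stores are visible before p is. */
	atomic_store_explicit(&p, area, memory_order_release);
	return NULL;
}

static void *consumer(void *arg)
{
	char *a;

	(void)arg;
	while (!(a = atomic_load_explicit(&p, memory_order_acquire)))
		;
	a[99] = 1;				/* the instrumented p[99] = 1 */
	printf("shadow[99] = %d\n", shadow[99]);	/* guaranteed to be 0 */
	return NULL;
}

int main(void)
{
	pthread_t prod, cons;

	memset(shadow, 0xfe, sizeof(shadow));	/* start out "poisoned" */
	pthread_create(&prod, NULL, producer, NULL);
	pthread_create(&cons, NULL, consumer, NULL);
	pthread_join(prod, NULL);
	pthread_join(cons, NULL);
	return 0;
}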
-
-/*
- * Poison the shadow for a vmalloc region. Called as part of the
- * freeing process at the time the region is freed.
- */
-void kasan_poison_vmalloc(const void *start, unsigned long size)
-{
- if (!is_vmalloc_or_module_addr(start))
- return;
-
- size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
- kasan_poison_shadow(start, size, KASAN_VMALLOC_INVALID);
-}
-
-void kasan_unpoison_vmalloc(const void *start, unsigned long size)
-{
- if (!is_vmalloc_or_module_addr(start))
- return;
-
- kasan_unpoison_shadow(start, size);
-}
-
-static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
- void *unused)
-{
- unsigned long page;
-
- page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT);
-
- spin_lock(&init_mm.page_table_lock);
-
- if (likely(!pte_none(*ptep))) {
- pte_clear(&init_mm, addr, ptep);
- free_page(page);
- }
- spin_unlock(&init_mm.page_table_lock);
-
- return 0;
-}
-
-/*
- * Release the backing for the vmalloc region [start, end), which
- * lies within the free region [free_region_start, free_region_end).
- *
- * This can be run lazily, long after the region was freed. It runs
- * under vmap_area_lock, so it's not safe to interact with the vmalloc/vmap
- * infrastructure.
- *
- * How does this work?
- * -------------------
- *
- * We have a region that is page aligned, labelled as A.
- * That might not map onto the shadow in a way that is page-aligned:
- *
- * start end
- * v v
- * |????????|????????|AAAAAAAA|AA....AA|AAAAAAAA|????????| < vmalloc
- * -------- -------- -------- -------- --------
- * | | | | |
- * | | | /-------/ |
- * \-------\|/------/ |/---------------/
- * ||| ||
- * |??AAAAAA|AAAAAAAA|AA??????| < shadow
- * (1) (2) (3)
- *
- * First we align the start upwards and the end downwards, so that the
- * shadow of the region aligns with shadow page boundaries. In the
- * example, this gives us the shadow page (2). This is the shadow entirely
- * covered by this allocation.
- *
- * Then we have the tricky bits. We want to know if we can free the
- * partially covered shadow pages - (1) and (3) in the example. For this,
- * we are given the start and end of the free region that contains this
- * allocation. Extending our previous example, we could have:
- *
- * free_region_start free_region_end
- * | start end |
- * v v v v
- * |FFFFFFFF|FFFFFFFF|AAAAAAAA|AA....AA|AAAAAAAA|FFFFFFFF| < vmalloc
- * -------- -------- -------- -------- --------
- * | | | | |
- * | | | /-------/ |
- * \-------\|/------/ |/---------------/
- * ||| ||
- * |FFAAAAAA|AAAAAAAA|AAF?????| < shadow
- * (1) (2) (3)
- *
- * Once again, we align the start of the free region up, and the end of
- * the free region down so that the shadow is page aligned. So we can free
- * page (1) - we know no allocation currently uses anything in that page,
- * because all of it is in the vmalloc free region. But we cannot free
- * page (3), because we can't be sure that the rest of it is unused.
- *
- * We only consider pages that contain part of the original region for
- * freeing: we don't try to free other pages from the free region or we'd
- * end up trying to free huge chunks of virtual address space.
- *
- * Concurrency
- * -----------
- *
- * How do we know that we're not freeing a page that is simultaneously
- * being used for a fresh allocation in kasan_populate_vmalloc(_pte)?
- *
- * We _can_ have kasan_release_vmalloc and kasan_populate_vmalloc running
- * at the same time. While we run under free_vmap_area_lock, the population
- * code does not.
- *
- * free_vmap_area_lock instead operates to ensure that the larger range
- * [free_region_start, free_region_end) is safe: because __alloc_vmap_area and
- * the per-cpu region-finding algorithm both run under free_vmap_area_lock,
- * no space identified as free will become used while we are running. This
- * means that so long as we are careful with alignment and only free shadow
- * pages entirely covered by the free region, we will not run into any
- * trouble - any simultaneous allocations will be for disjoint regions.
- */
-void kasan_release_vmalloc(unsigned long start, unsigned long end,
- unsigned long free_region_start,
- unsigned long free_region_end)
-{
- void *shadow_start, *shadow_end;
- unsigned long region_start, region_end;
- unsigned long size;
-
- region_start = ALIGN(start, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
- region_end = ALIGN_DOWN(end, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
-
- free_region_start = ALIGN(free_region_start,
- PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
-
- if (start != region_start &&
- free_region_start < region_start)
- region_start -= PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE;
-
- free_region_end = ALIGN_DOWN(free_region_end,
- PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
-
- if (end != region_end &&
- free_region_end > region_end)
- region_end += PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE;
-
- shadow_start = kasan_mem_to_shadow((void *)region_start);
- shadow_end = kasan_mem_to_shadow((void *)region_end);
-
- if (shadow_end > shadow_start) {
- size = shadow_end - shadow_start;
- apply_to_existing_page_range(&init_mm,
- (unsigned long)shadow_start,
- size, kasan_depopulate_vmalloc_pte,
- NULL);
- flush_tlb_kernel_range((unsigned long)shadow_start,
- (unsigned long)shadow_end);
- }
-}
-#endif
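For reference, the alignment rules spelled out in the kasan_release_vmalloc() comment above can be checked with a small standalone sketch (userspace illustration; 4K pages, an 8-byte shadow scale and the sample addresses are assumptions):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define SCALE		8UL			/* KASAN_SHADOW_SCALE_SIZE */
#define CHUNK		(PAGE_SIZE * SCALE)	/* memory covered by one shadow page */

#define ALIGN_UP(x)	(((x) + CHUNK - 1) & ~(CHUNK - 1))
#define ALIGN_DOWN(x)	((x) & ~(CHUNK - 1))

int main(void)
{
	/* Hypothetical vmalloc region and the free region containing it. */
	unsigned long start = 0x100000UL + PAGE_SIZE;
	unsigned long end = start + 10 * CHUNK;
	unsigned long free_start = 0x100000UL;
	unsigned long free_end = end + 3 * PAGE_SIZE;

	unsigned long region_start = ALIGN_UP(start);
	unsigned long region_end = ALIGN_DOWN(end);

	/*
	 * Grow the region over a partially covered shadow page only when the
	 * free region fully covers that page, mirroring the checks above.
	 */
	if (start != region_start && ALIGN_UP(free_start) < region_start)
		region_start -= CHUNK;
	if (end != region_end && ALIGN_DOWN(free_end) > region_end)
		region_end += CHUNK;

	printf("free shadow backing the range [%#lx, %#lx)\n",
	       region_start, region_end);
	return 0;
}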
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 9bcb4b8d7320..90d74c521317 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -7,15 +7,8 @@
*
* Some code borrowed from https://github.com/xairy/kasan-prototype by
* Andrey Konovalov <andreyknvl@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/init.h>
@@ -52,7 +45,7 @@ static __always_inline bool memory_is_poisoned_1(unsigned long addr)
s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
if (unlikely(shadow_value)) {
- s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
+ s8 last_accessible_byte = addr & KASAN_GRANULE_MASK;
return unlikely(last_accessible_byte >= shadow_value);
}
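The check above relies on the generic-mode shadow encoding: a shadow byte of 0 means the whole 8-byte granule is accessible, a value of 1..7 means only that many leading bytes are accessible, and a negative value marks the granule as poisoned. A standalone sketch of the same comparison (userspace illustration; the 8-byte granule matches KASAN_SHADOW_SCALE_SHIFT == 3):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define GRANULE	8

/*
 * shadow == 0: whole granule accessible;
 * 1..7:        only the first 'shadow' bytes accessible;
 * negative:    whole granule poisoned (redzone, freed, ...).
 */
static bool byte_is_poisoned(uint64_t addr, int8_t shadow)
{
	if (!shadow)
		return false;
	return (int8_t)(addr & (GRANULE - 1)) >= shadow;
}

int main(void)
{
	/* A granule where only the first 5 bytes belong to an object. */
	int8_t shadow = 5;

	printf("offset 4: %s\n", byte_is_poisoned(0x1004, shadow) ? "bad" : "ok");
	printf("offset 5: %s\n", byte_is_poisoned(0x1005, shadow) ? "bad" : "ok");
	return 0;
}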
@@ -68,7 +61,7 @@ static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr,
	 * The access crosses an 8-byte (shadow granule) boundary. Such an
	 * access maps into 2 shadow bytes, so we need to check them both.
*/
- if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1))
+ if (unlikely(((addr + size - 1) & KASAN_GRANULE_MASK) < size - 1))
return *shadow_addr || memory_is_poisoned_1(addr + size - 1);
return memory_is_poisoned_1(addr + size - 1);
@@ -79,7 +72,7 @@ static __always_inline bool memory_is_poisoned_16(unsigned long addr)
u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
/* Unaligned 16-bytes access maps into 3 shadow bytes. */
- if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
+ if (unlikely(!IS_ALIGNED(addr, KASAN_GRANULE_SIZE)))
return *shadow_addr || memory_is_poisoned_1(addr + 15);
return *shadow_addr;
@@ -140,7 +133,7 @@ static __always_inline bool memory_is_poisoned_n(unsigned long addr,
s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
if (unlikely(ret != (unsigned long)last_shadow ||
- ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
+ ((long)(last_byte & KASAN_GRANULE_MASK) >= *last_shadow)))
return true;
}
return false;
@@ -193,6 +186,13 @@ bool check_memory_region(unsigned long addr, size_t size, bool write,
return check_memory_region_inline(addr, size, write, ret_ip);
}
+bool check_invalid_free(void *addr)
+{
+ s8 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr));
+
+ return shadow_byte < 0 || shadow_byte >= KASAN_GRANULE_SIZE;
+}
+
void kasan_cache_shrink(struct kmem_cache *cache)
{
quarantine_remove_cache(cache);
@@ -206,13 +206,13 @@ void kasan_cache_shutdown(struct kmem_cache *cache)
static void register_global(struct kasan_global *global)
{
- size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
+ size_t aligned_size = round_up(global->size, KASAN_GRANULE_SIZE);
- kasan_unpoison_shadow(global->beg, global->size);
+ unpoison_range(global->beg, global->size);
- kasan_poison_shadow(global->beg + aligned_size,
- global->size_with_redzone - aligned_size,
- KASAN_GLOBAL_REDZONE);
+ poison_range(global->beg + aligned_size,
+ global->size_with_redzone - aligned_size,
+ KASAN_GLOBAL_REDZONE);
}
void __asan_register_globals(struct kasan_global *globals, size_t size)
@@ -235,7 +235,7 @@ EXPORT_SYMBOL(__asan_unregister_globals);
check_memory_region_inline(addr, size, false, _RET_IP_);\
} \
EXPORT_SYMBOL(__asan_load##size); \
- __alias(__asan_load##size) \
+ __alias("__asan_load" #size) \
void __asan_load##size##_noabort(unsigned long); \
EXPORT_SYMBOL(__asan_load##size##_noabort); \
void __asan_store##size(unsigned long addr) \
@@ -243,7 +243,7 @@ EXPORT_SYMBOL(__asan_unregister_globals);
check_memory_region_inline(addr, size, true, _RET_IP_); \
} \
EXPORT_SYMBOL(__asan_store##size); \
- __alias(__asan_store##size) \
+ __alias("__asan_store" #size) \
void __asan_store##size##_noabort(unsigned long); \
EXPORT_SYMBOL(__asan_store##size##_noabort)
@@ -259,7 +259,7 @@ void __asan_loadN(unsigned long addr, size_t size)
}
EXPORT_SYMBOL(__asan_loadN);
-__alias(__asan_loadN)
+__alias("__asan_loadN")
void __asan_loadN_noabort(unsigned long, size_t);
EXPORT_SYMBOL(__asan_loadN_noabort);
@@ -269,7 +269,7 @@ void __asan_storeN(unsigned long addr, size_t size)
}
EXPORT_SYMBOL(__asan_storeN);
-__alias(__asan_storeN)
+__alias("__asan_storeN")
void __asan_storeN_noabort(unsigned long, size_t);
EXPORT_SYMBOL(__asan_storeN_noabort);
@@ -280,10 +280,10 @@ EXPORT_SYMBOL(__asan_handle_no_return);
/* Emitted by compiler to poison alloca()ed objects. */
void __asan_alloca_poison(unsigned long addr, size_t size)
{
- size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
+ size_t rounded_up_size = round_up(size, KASAN_GRANULE_SIZE);
size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) -
rounded_up_size;
- size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE);
+ size_t rounded_down_size = round_down(size, KASAN_GRANULE_SIZE);
const void *left_redzone = (const void *)(addr -
KASAN_ALLOCA_REDZONE_SIZE);
@@ -291,13 +291,12 @@ void __asan_alloca_poison(unsigned long addr, size_t size)
WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
- kasan_unpoison_shadow((const void *)(addr + rounded_down_size),
- size - rounded_down_size);
- kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
- KASAN_ALLOCA_LEFT);
- kasan_poison_shadow(right_redzone,
- padding_size + KASAN_ALLOCA_REDZONE_SIZE,
- KASAN_ALLOCA_RIGHT);
+ unpoison_range((const void *)(addr + rounded_down_size),
+ size - rounded_down_size);
+ poison_range(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
+ KASAN_ALLOCA_LEFT);
+ poison_range(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE,
+ KASAN_ALLOCA_RIGHT);
}
EXPORT_SYMBOL(__asan_alloca_poison);
@@ -307,7 +306,7 @@ void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
if (unlikely(!stack_top || stack_top > stack_bottom))
return;
- kasan_unpoison_shadow(stack_top, stack_bottom - stack_top);
+ unpoison_range(stack_top, stack_bottom - stack_top);
}
EXPORT_SYMBOL(__asan_allocas_unpoison);
@@ -330,7 +329,7 @@ void kasan_record_aux_stack(void *addr)
{
struct page *page = kasan_addr_to_page(addr);
struct kmem_cache *cache;
- struct kasan_alloc_meta *alloc_info;
+ struct kasan_alloc_meta *alloc_meta;
void *object;
if (is_kfence_address(addr) || !(page && PageSlab(page)))
@@ -338,10 +337,10 @@ void kasan_record_aux_stack(void *addr)
cache = page->slab_cache;
object = nearest_obj(cache, page, addr);
- alloc_info = get_alloc_info(cache, object);
+ alloc_meta = kasan_get_alloc_meta(cache, object);
- alloc_info->aux_stack[1] = alloc_info->aux_stack[0];
- alloc_info->aux_stack[0] = kasan_save_stack(GFP_NOWAIT);
+ alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0];
+ alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT);
}
void kasan_set_free_info(struct kmem_cache *cache,
@@ -349,12 +348,12 @@ void kasan_set_free_info(struct kmem_cache *cache,
{
struct kasan_free_meta *free_meta;
- free_meta = get_free_info(cache, object);
- kasan_set_track(&free_meta->free_track, GFP_NOWAIT);
+ free_meta = kasan_get_free_meta(cache, object);
+ if (!free_meta)
+ return;
- /*
- * the object was freed and has free track set
- */
+ kasan_set_track(&free_meta->free_track, GFP_NOWAIT);
+ /* The object was freed and has free track set. */
*(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREETRACK;
}
@@ -363,5 +362,6 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
{
if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_KMALLOC_FREETRACK)
return NULL;
- return &get_free_info(cache, object)->free_track;
+ /* Free meta must be present with KASAN_KMALLOC_FREETRACK. */
+ return &kasan_get_free_meta(cache, object)->free_track;
}
diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c
deleted file mode 100644
index a38c7a9e192a..000000000000
--- a/mm/kasan/generic_report.c
+++ /dev/null
@@ -1,165 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * This file contains generic KASAN specific error reporting code.
- *
- * Copyright (c) 2014 Samsung Electronics Co., Ltd.
- * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
- *
- * Some code borrowed from https://github.com/xairy/kasan-prototype by
- * Andrey Konovalov <andreyknvl@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/bitops.h>
-#include <linux/ftrace.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/printk.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/stackdepot.h>
-#include <linux/stacktrace.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/kasan.h>
-#include <linux/module.h>
-
-#include <asm/sections.h>
-
-#include "kasan.h"
-#include "../slab.h"
-
-void *find_first_bad_addr(void *addr, size_t size)
-{
- void *p = addr;
-
- while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p)))
- p += KASAN_SHADOW_SCALE_SIZE;
- return p;
-}
-
-static const char *get_shadow_bug_type(struct kasan_access_info *info)
-{
- const char *bug_type = "unknown-crash";
- u8 *shadow_addr;
-
- shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
-
- /*
- * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look
- * at the next shadow byte to determine the type of the bad access.
- */
- if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1)
- shadow_addr++;
-
- switch (*shadow_addr) {
- case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
- /*
- * In theory it's still possible to see these shadow values
- * due to a data race in the kernel code.
- */
- bug_type = "out-of-bounds";
- break;
- case KASAN_PAGE_REDZONE:
- case KASAN_KMALLOC_REDZONE:
- bug_type = "slab-out-of-bounds";
- break;
- case KASAN_GLOBAL_REDZONE:
- bug_type = "global-out-of-bounds";
- break;
- case KASAN_STACK_LEFT:
- case KASAN_STACK_MID:
- case KASAN_STACK_RIGHT:
- case KASAN_STACK_PARTIAL:
- bug_type = "stack-out-of-bounds";
- break;
- case KASAN_FREE_PAGE:
- case KASAN_KMALLOC_FREE:
- case KASAN_KMALLOC_FREETRACK:
- bug_type = "use-after-free";
- break;
- case KASAN_ALLOCA_LEFT:
- case KASAN_ALLOCA_RIGHT:
- bug_type = "alloca-out-of-bounds";
- break;
- case KASAN_VMALLOC_INVALID:
- bug_type = "vmalloc-out-of-bounds";
- break;
- }
-
- return bug_type;
-}
-
-static const char *get_wild_bug_type(struct kasan_access_info *info)
-{
- const char *bug_type = "unknown-crash";
-
- if ((unsigned long)info->access_addr < PAGE_SIZE)
- bug_type = "null-ptr-deref";
- else if ((unsigned long)info->access_addr < TASK_SIZE)
- bug_type = "user-memory-access";
- else
- bug_type = "wild-memory-access";
-
- return bug_type;
-}
-
-const char *get_bug_type(struct kasan_access_info *info)
-{
- /*
- * If access_size is a negative number, then it has reason to be
- * defined as out-of-bounds bug type.
- *
- * Casting negative numbers to size_t would indeed turn up as
- * a large size_t and its value will be larger than ULONG_MAX/2,
- * so that this can qualify as out-of-bounds.
- */
- if (info->access_addr + info->access_size < info->access_addr)
- return "out-of-bounds";
-
- if (addr_has_shadow(info->access_addr))
- return get_shadow_bug_type(info);
- return get_wild_bug_type(info);
-}
-
-#define DEFINE_ASAN_REPORT_LOAD(size) \
-void __asan_report_load##size##_noabort(unsigned long addr) \
-{ \
- kasan_report(addr, size, false, _RET_IP_); \
-} \
-EXPORT_SYMBOL(__asan_report_load##size##_noabort)
-
-#define DEFINE_ASAN_REPORT_STORE(size) \
-void __asan_report_store##size##_noabort(unsigned long addr) \
-{ \
- kasan_report(addr, size, true, _RET_IP_); \
-} \
-EXPORT_SYMBOL(__asan_report_store##size##_noabort)
-
-DEFINE_ASAN_REPORT_LOAD(1);
-DEFINE_ASAN_REPORT_LOAD(2);
-DEFINE_ASAN_REPORT_LOAD(4);
-DEFINE_ASAN_REPORT_LOAD(8);
-DEFINE_ASAN_REPORT_LOAD(16);
-DEFINE_ASAN_REPORT_STORE(1);
-DEFINE_ASAN_REPORT_STORE(2);
-DEFINE_ASAN_REPORT_STORE(4);
-DEFINE_ASAN_REPORT_STORE(8);
-DEFINE_ASAN_REPORT_STORE(16);
-
-void __asan_report_load_n_noabort(unsigned long addr, size_t size)
-{
- kasan_report(addr, size, false, _RET_IP_);
-}
-EXPORT_SYMBOL(__asan_report_load_n_noabort);
-
-void __asan_report_store_n_noabort(unsigned long addr, size_t size)
-{
- kasan_report(addr, size, true, _RET_IP_);
-}
-EXPORT_SYMBOL(__asan_report_store_n_noabort);
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
new file mode 100644
index 000000000000..55bd6f09c70f
--- /dev/null
+++ b/mm/kasan/hw_tags.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains core hardware tag-based KASAN code.
+ *
+ * Copyright (c) 2020 Google, Inc.
+ * Author: Andrey Konovalov <andreyknvl@google.com>
+ */
+
+#define pr_fmt(fmt) "kasan: " fmt
+
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/static_key.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "kasan.h"
+
+enum kasan_arg_mode {
+ KASAN_ARG_MODE_DEFAULT,
+ KASAN_ARG_MODE_OFF,
+ KASAN_ARG_MODE_PROD,
+ KASAN_ARG_MODE_FULL,
+};
+
+enum kasan_arg_stacktrace {
+ KASAN_ARG_STACKTRACE_DEFAULT,
+ KASAN_ARG_STACKTRACE_OFF,
+ KASAN_ARG_STACKTRACE_ON,
+};
+
+enum kasan_arg_fault {
+ KASAN_ARG_FAULT_DEFAULT,
+ KASAN_ARG_FAULT_REPORT,
+ KASAN_ARG_FAULT_PANIC,
+};
+
+static enum kasan_arg_mode kasan_arg_mode __ro_after_init;
+static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init;
+static enum kasan_arg_fault kasan_arg_fault __ro_after_init;
+
+/* Whether KASAN is enabled at all. */
+DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
+EXPORT_SYMBOL(kasan_flag_enabled);
+
+/* Whether to collect alloc/free stack traces. */
+DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
+
+/* Whether to panic or disable tag checking on fault. */
+bool kasan_flag_panic __ro_after_init;
+
+/* kasan.mode=off/prod/full */
+static int __init early_kasan_mode(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "off"))
+ kasan_arg_mode = KASAN_ARG_MODE_OFF;
+ else if (!strcmp(arg, "prod"))
+ kasan_arg_mode = KASAN_ARG_MODE_PROD;
+ else if (!strcmp(arg, "full"))
+ kasan_arg_mode = KASAN_ARG_MODE_FULL;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan.mode", early_kasan_mode);
+
+/* kasan.stacktrace=off/on */
+static int __init early_kasan_flag_stacktrace(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "off"))
+ kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF;
+ else if (!strcmp(arg, "on"))
+ kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan.stacktrace", early_kasan_flag_stacktrace);
+
+/* kasan.fault=report/panic */
+static int __init early_kasan_fault(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "report"))
+ kasan_arg_fault = KASAN_ARG_FAULT_REPORT;
+ else if (!strcmp(arg, "panic"))
+ kasan_arg_fault = KASAN_ARG_FAULT_PANIC;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan.fault", early_kasan_fault);
+
+/* kasan_init_hw_tags_cpu() is called for each CPU. */
+void kasan_init_hw_tags_cpu(void)
+{
+ /*
+ * There's no need to check that the hardware is MTE-capable here,
+ * as this function is only called for MTE-capable hardware.
+ */
+
+ /* If KASAN is disabled, do nothing. */
+ if (kasan_arg_mode == KASAN_ARG_MODE_OFF)
+ return;
+
+ hw_init_tags(KASAN_TAG_MAX);
+ hw_enable_tagging();
+}
+
+/* kasan_init_hw_tags() is called once on boot CPU. */
+void __init kasan_init_hw_tags(void)
+{
+ /* If hardware doesn't support MTE, do nothing. */
+ if (!system_supports_mte())
+ return;
+
+ /* Choose KASAN mode if kasan boot parameter is not provided. */
+ if (kasan_arg_mode == KASAN_ARG_MODE_DEFAULT) {
+ if (IS_ENABLED(CONFIG_DEBUG_KERNEL))
+ kasan_arg_mode = KASAN_ARG_MODE_FULL;
+ else
+ kasan_arg_mode = KASAN_ARG_MODE_PROD;
+ }
+
+ /* Preset parameter values based on the mode. */
+ switch (kasan_arg_mode) {
+ case KASAN_ARG_MODE_DEFAULT:
+ /* Shouldn't happen as per the check above. */
+ WARN_ON(1);
+ return;
+ case KASAN_ARG_MODE_OFF:
+ /* If KASAN is disabled, do nothing. */
+ return;
+ case KASAN_ARG_MODE_PROD:
+ static_branch_enable(&kasan_flag_enabled);
+ break;
+ case KASAN_ARG_MODE_FULL:
+ static_branch_enable(&kasan_flag_enabled);
+ static_branch_enable(&kasan_flag_stacktrace);
+ break;
+ }
+
+ /* Now, optionally override the presets. */
+
+ switch (kasan_arg_stacktrace) {
+ case KASAN_ARG_STACKTRACE_DEFAULT:
+ break;
+ case KASAN_ARG_STACKTRACE_OFF:
+ static_branch_disable(&kasan_flag_stacktrace);
+ break;
+ case KASAN_ARG_STACKTRACE_ON:
+ static_branch_enable(&kasan_flag_stacktrace);
+ break;
+ }
+
+ switch (kasan_arg_fault) {
+ case KASAN_ARG_FAULT_DEFAULT:
+ break;
+ case KASAN_ARG_FAULT_REPORT:
+ kasan_flag_panic = false;
+ break;
+ case KASAN_ARG_FAULT_PANIC:
+ kasan_flag_panic = true;
+ break;
+ }
+
+ pr_info("KernelAddressSanitizer initialized\n");
+}
+
+void kasan_set_free_info(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (alloc_meta)
+ kasan_set_track(&alloc_meta->free_track[0], GFP_NOWAIT);
+}
+
+struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (!alloc_meta)
+ return NULL;
+
+ return &alloc_meta->free_track[0];
+}
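The boot parameters introduced in hw_tags.c above compose as follows: kasan.mode selects a preset (off, prod or full, defaulting to full on CONFIG_DEBUG_KERNEL builds and prod otherwise), and kasan.stacktrace / kasan.fault then override individual flags. A standalone sketch of that resolution logic (illustration only; the struct and function names here are made up, only the decision table mirrors kasan_init_hw_tags()):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct hw_tags_config {
	bool enabled;
	bool stacktrace;
	bool panic_on_fault;
};

/*
 * Resolve the presets the way kasan_init_hw_tags() above does; NULL means
 * "parameter not given on the command line".
 */
static struct hw_tags_config resolve(const char *mode, const char *stacktrace,
				     const char *fault, bool debug_kernel)
{
	struct hw_tags_config cfg = { 0 };

	if (!mode)
		mode = debug_kernel ? "full" : "prod";
	if (!strcmp(mode, "off"))
		return cfg;

	cfg.enabled = true;
	cfg.stacktrace = !strcmp(mode, "full");

	if (stacktrace)
		cfg.stacktrace = !strcmp(stacktrace, "on");
	if (fault)
		cfg.panic_on_fault = !strcmp(fault, "panic");

	return cfg;
}

int main(void)
{
	/* kasan.mode=prod kasan.stacktrace=on kasan.fault=panic */
	struct hw_tags_config cfg = resolve("prod", "on", "panic", false);

	printf("enabled=%d stacktrace=%d panic=%d\n",
	       cfg.enabled, cfg.stacktrace, cfg.panic_on_fault);
	return 0;
}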
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index fe6be0be1f76..bc0ad208b3a7 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -1,14 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This file contains some kasan initialization code.
+ * This file contains KASAN shadow initialization code.
*
* Copyright (c) 2015 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#include <linux/memblock.h>
@@ -446,9 +441,8 @@ void kasan_remove_zero_shadow(void *start, unsigned long size)
addr = (unsigned long)kasan_mem_to_shadow(start);
end = addr + (size >> KASAN_SHADOW_SCALE_SHIFT);
- if (WARN_ON((unsigned long)start %
- (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) ||
- WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)))
+ if (WARN_ON((unsigned long)start % KASAN_MEMORY_PER_SHADOW_PAGE) ||
+ WARN_ON(size % KASAN_MEMORY_PER_SHADOW_PAGE))
return;
for (; addr < end; addr = next) {
@@ -481,9 +475,8 @@ int kasan_add_zero_shadow(void *start, unsigned long size)
shadow_start = kasan_mem_to_shadow(start);
shadow_end = shadow_start + (size >> KASAN_SHADOW_SCALE_SHIFT);
- if (WARN_ON((unsigned long)start %
- (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) ||
- WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)))
+ if (WARN_ON((unsigned long)start % KASAN_MEMORY_PER_SHADOW_PAGE) ||
+ WARN_ON(size % KASAN_MEMORY_PER_SHADOW_PAGE))
return -EINVAL;
ret = kasan_populate_early_shadow(shadow_start, shadow_end);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index ac499456740f..725a472e8ea7 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -3,10 +3,35 @@
#define __MM_KASAN_KASAN_H
#include <linux/kasan.h>
+#include <linux/kfence.h>
#include <linux/stackdepot.h>
-#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
-#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
+#ifdef CONFIG_KASAN_HW_TAGS
+#include <linux/static_key.h>
+DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
+static inline bool kasan_stack_collection_enabled(void)
+{
+ return static_branch_unlikely(&kasan_flag_stacktrace);
+}
+#else
+static inline bool kasan_stack_collection_enabled(void)
+{
+ return true;
+}
+#endif
+
+extern bool kasan_flag_panic __ro_after_init;
+
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
+#define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
+#else
+#include <asm/mte-kasan.h>
+#define KASAN_GRANULE_SIZE MTE_GRANULE_SIZE
+#endif
+
+#define KASAN_GRANULE_MASK (KASAN_GRANULE_SIZE - 1)
+
+#define KASAN_MEMORY_PER_SHADOW_PAGE (KASAN_GRANULE_SIZE << PAGE_SHIFT)
#define KASAN_TAG_KERNEL 0xFF /* native kernel pointers tag */
#define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */
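A quick numeric illustration of the granule macros above (userspace sketch; the 8-byte generic/SW_TAGS granule and 4K pages are assumptions):

#include <stdio.h>

#define KASAN_GRANULE_SIZE	8UL
#define KASAN_GRANULE_MASK	(KASAN_GRANULE_SIZE - 1)
#define PAGE_SHIFT		12
#define KASAN_MEMORY_PER_SHADOW_PAGE	(KASAN_GRANULE_SIZE << PAGE_SHIFT)

int main(void)
{
	unsigned long size = 13;

	/* Poisoning always works on whole granules: 13 rounds up to 16. */
	printf("rounded size: %lu\n",
	       (size + KASAN_GRANULE_MASK) & ~KASAN_GRANULE_MASK);
	/* One 4K shadow page describes 32K of memory. */
	printf("memory per shadow page: %lu\n", KASAN_MEMORY_PER_SHADOW_PAGE);
	return 0;
}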
@@ -56,6 +81,13 @@
#define KASAN_ABI_VERSION 1
#endif
+/* Metadata layout customization. */
+#define META_BYTES_PER_BLOCK 1
+#define META_BLOCKS_PER_ROW 16
+#define META_BYTES_PER_ROW (META_BLOCKS_PER_ROW * META_BYTES_PER_BLOCK)
+#define META_MEM_BYTES_PER_ROW (META_BYTES_PER_ROW * KASAN_GRANULE_SIZE)
+#define META_ROWS_AROUND_ADDR 2
+
struct kasan_access_info {
const void *access_addr;
const void *first_bad_addr;
@@ -124,20 +156,33 @@ struct kasan_alloc_meta {
struct qlist_node {
struct qlist_node *next;
};
+
+/*
+ * Generic mode either stores free meta in the object itself or in the redzone
+ * after the object. In the former case the free meta offset is 0; in the
+ * latter it is a sane value smaller than INT_MAX. Use INT_MAX as the free
+ * meta offset when free meta isn't present at all.
+ */
+#define KASAN_NO_FREE_META INT_MAX
+
struct kasan_free_meta {
+#ifdef CONFIG_KASAN_GENERIC
/* This field is used while the object is in the quarantine.
* Otherwise it might be used for the allocator freelist.
*/
struct qlist_node quarantine_link;
-#ifdef CONFIG_KASAN_GENERIC
struct kasan_track free_track;
#endif
};
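The KASAN_NO_FREE_META sentinel can be illustrated with a trivial standalone helper (sketch only; free_meta_of() and its offset parameter are stand-ins for this example, not the kernel's actual accessor):

#include <limits.h>
#include <stdio.h>

#define KASAN_NO_FREE_META	INT_MAX

/*
 * 0 means "free meta lives inside the object", a small positive value means
 * "in the redzone after the object", INT_MAX means "no free meta at all".
 */
static void *free_meta_of(void *object, int free_meta_offset)
{
	if (free_meta_offset == KASAN_NO_FREE_META)
		return NULL;
	return (char *)object + free_meta_offset;
}

int main(void)
{
	char object[32];

	printf("in object:  %p\n", free_meta_of(object, 0));
	printf("in redzone: %p\n", free_meta_of(object, 32));
	printf("none:       %p\n", free_meta_of(object, KASAN_NO_FREE_META));
	return 0;
}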
-struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
- const void *object);
-struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
- const void *object);
+struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
+ const void *object);
+#ifdef CONFIG_KASAN_GENERIC
+struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
+ const void *object);
+#endif
+
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
{
@@ -145,13 +190,11 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
<< KASAN_SHADOW_SCALE_SHIFT);
}
-static inline bool addr_has_shadow(const void *addr)
+static inline bool addr_has_metadata(const void *addr)
{
return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
}
-void kasan_poison_shadow(const void *address, size_t size, u8 value);
-
/**
 * check_memory_region - Check a memory region and report invalid accesses.
* @addr: the accessed address
@@ -163,8 +206,30 @@ void kasan_poison_shadow(const void *address, size_t size, u8 value);
bool check_memory_region(unsigned long addr, size_t size, bool write,
unsigned long ret_ip);
+#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
+
+static inline bool addr_has_metadata(const void *addr)
+{
+ return true;
+}
+
+#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
+
+#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
+void print_tags(u8 addr_tag, const void *addr);
+#else
+static inline void print_tags(u8 addr_tag, const void *addr) { }
+#endif
+
void *find_first_bad_addr(void *addr, size_t size);
const char *get_bug_type(struct kasan_access_info *info);
+void metadata_fetch_row(char *buffer, void *row);
+
+#if defined(CONFIG_KASAN_GENERIC) && CONFIG_KASAN_STACK
+void print_address_stack_frame(const void *addr);
+#else
+static inline void print_address_stack_frame(const void *addr) { }
+#endif
bool kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
@@ -180,49 +245,100 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
#if defined(CONFIG_KASAN_GENERIC) && \
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
-void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
+bool quarantine_put(struct kmem_cache *cache, void *object);
void quarantine_reduce(void);
void quarantine_remove_cache(struct kmem_cache *cache);
#else
-static inline void quarantine_put(struct kasan_free_meta *info,
- struct kmem_cache *cache) { }
+static inline bool quarantine_put(struct kmem_cache *cache, void *object) { return false; }
static inline void quarantine_reduce(void) { }
static inline void quarantine_remove_cache(struct kmem_cache *cache) { }
#endif
-#ifdef CONFIG_KASAN_SW_TAGS
+#ifndef arch_kasan_set_tag
+static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
+{
+ return addr;
+}
+#endif
+#ifndef arch_kasan_get_tag
+#define arch_kasan_get_tag(addr) 0
+#endif
-void print_tags(u8 addr_tag, const void *addr);
+#define set_tag(addr, tag) ((void *)arch_kasan_set_tag((addr), (tag)))
+#define get_tag(addr) arch_kasan_get_tag(addr)
-u8 random_tag(void);
+#ifdef CONFIG_KASAN_HW_TAGS
+#ifndef arch_enable_tagging
+#define arch_enable_tagging()
+#endif
+#ifndef arch_init_tags
+#define arch_init_tags(max_tag)
+#endif
+#ifndef arch_get_random_tag
+#define arch_get_random_tag() (0xFF)
+#endif
+#ifndef arch_get_mem_tag
+#define arch_get_mem_tag(addr) (0xFF)
+#endif
+#ifndef arch_set_mem_tag_range
+#define arch_set_mem_tag_range(addr, size, tag) ((void *)(addr))
+#endif
+
+#define hw_enable_tagging() arch_enable_tagging()
+#define hw_init_tags(max_tag) arch_init_tags(max_tag)
+#define hw_get_random_tag() arch_get_random_tag()
+#define hw_get_mem_tag(addr) arch_get_mem_tag(addr)
+#define hw_set_mem_tag_range(addr, size, tag) arch_set_mem_tag_range((addr), (size), (tag))
+
+#endif /* CONFIG_KASAN_HW_TAGS */
+
+#ifdef CONFIG_KASAN_SW_TAGS
+u8 random_tag(void);
+#elif defined(CONFIG_KASAN_HW_TAGS)
+static inline u8 random_tag(void) { return hw_get_random_tag(); }
#else
+static inline u8 random_tag(void) { return 0; }
+#endif
-static inline void print_tags(u8 addr_tag, const void *addr) { }
+#ifdef CONFIG_KASAN_HW_TAGS
-static inline u8 random_tag(void)
+static inline void poison_range(const void *address, size_t size, u8 value)
{
- return 0;
+ /* Skip KFENCE memory if called explicitly outside of sl*b. */
+ if (is_kfence_address(address))
+ return;
+
+ hw_set_mem_tag_range(kasan_reset_tag(address),
+ round_up(size, KASAN_GRANULE_SIZE), value);
}
-#endif
+static inline void unpoison_range(const void *address, size_t size)
+{
+ /* Skip KFENCE memory if called explicitly outside of sl*b. */
+ if (is_kfence_address(address))
+ return;
-#ifndef arch_kasan_set_tag
-static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
+ hw_set_mem_tag_range(kasan_reset_tag(address),
+ round_up(size, KASAN_GRANULE_SIZE), get_tag(address));
+}
+
+static inline bool check_invalid_free(void *addr)
{
- return addr;
+ u8 ptr_tag = get_tag(addr);
+ u8 mem_tag = hw_get_mem_tag(addr);
+
+ return (mem_tag == KASAN_TAG_INVALID) ||
+ (ptr_tag != KASAN_TAG_KERNEL && ptr_tag != mem_tag);
}
-#endif
-#ifndef arch_kasan_reset_tag
-#define arch_kasan_reset_tag(addr) ((void *)(addr))
-#endif
-#ifndef arch_kasan_get_tag
-#define arch_kasan_get_tag(addr) 0
-#endif
-#define set_tag(addr, tag) ((void *)arch_kasan_set_tag((addr), (tag)))
-#define reset_tag(addr) ((void *)arch_kasan_reset_tag(addr))
-#define get_tag(addr) arch_kasan_get_tag(addr)
+#else /* CONFIG_KASAN_HW_TAGS */
+
+void poison_range(const void *address, size_t size, u8 value);
+void unpoison_range(const void *address, size_t size);
+bool check_invalid_free(void *addr);
+
+#endif /* CONFIG_KASAN_HW_TAGS */
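The hardware tag-based check_invalid_free() rule above boils down to a small tag comparison. A userspace sketch (illustration only; the 0xFF match-all and 0xFE invalid values follow the KASAN_TAG_* definitions earlier in this header):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define KASAN_TAG_KERNEL	0xFF	/* native, untagged kernel pointers */
#define KASAN_TAG_INVALID	0xFE	/* freed or otherwise inaccessible */

static bool invalid_free(uint8_t ptr_tag, uint8_t mem_tag)
{
	/*
	 * Freeing already-invalid memory, or freeing via a mismatching
	 * pointer tag (unless the pointer carries the match-all kernel
	 * tag), is reported as an invalid free.
	 */
	return mem_tag == KASAN_TAG_INVALID ||
	       (ptr_tag != KASAN_TAG_KERNEL && ptr_tag != mem_tag);
}

int main(void)
{
	printf("%d\n", invalid_free(0x3a, 0x3a));	/* 0: tags match */
	printf("%d\n", invalid_free(0x3a, 0x17));	/* 1: mismatch */
	printf("%d\n", invalid_free(0xFF, 0x17));	/* 0: match-all pointer */
	printf("%d\n", invalid_free(0xFF, 0xFE));	/* 1: memory already invalid */
	return 0;
}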
/*
* Exported functions for interfaces called from assembly or from generated
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 0e3f8494628f..55783125a767 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -6,16 +6,6 @@
* Copyright (C) 2016 Google, Inc.
*
* Based on code by Dmitry Chernenkov.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
*/
#include <linux/gfp.h>
@@ -147,7 +137,12 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
if (IS_ENABLED(CONFIG_SLAB))
local_irq_save(flags);
+ /*
+	 * As the object now gets freed from the quarantine, assume that its
+ * free track is no longer valid.
+ */
*(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE;
+
___cache_free(cache, object, _THIS_IP_);
if (IS_ENABLED(CONFIG_SLAB))
@@ -173,11 +168,19 @@ static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache)
qlist_init(q);
}
-void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
+bool quarantine_put(struct kmem_cache *cache, void *object)
{
unsigned long flags;
struct qlist_head *q;
struct qlist_head temp = QLIST_INIT;
+ struct kasan_free_meta *meta = kasan_get_free_meta(cache, object);
+
+ /*
+ * If there's no metadata for this object, don't put it into
+ * quarantine.
+ */
+ if (!meta)
+ return false;
/*
* Note: irq must be disabled until after we move the batch to the
@@ -192,9 +195,9 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
q = this_cpu_ptr(&cpu_quarantine);
if (q->offline) {
local_irq_restore(flags);
- return;
+ return false;
}
- qlist_put(q, &info->quarantine_link, cache->size);
+ qlist_put(q, &meta->quarantine_link, cache->size);
if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) {
qlist_move_all(q, &temp);
@@ -215,6 +218,8 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
}
local_irq_restore(flags);
+
+ return true;
}
void quarantine_reduce(void)
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 5a0102f37171..c0fb21797550 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -1,17 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This file contains common generic and tag-based KASAN error reporting code.
+ * This file contains common KASAN error reporting code.
*
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
*
* Some code borrowed from https://github.com/xairy/kasan-prototype by
* Andrey Konovalov <andreyknvl@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#include <linux/bitops.h>
@@ -38,12 +33,6 @@
#include "kasan.h"
#include "../slab.h"
-/* Shadow layout customization. */
-#define SHADOW_BYTES_PER_BLOCK 1
-#define SHADOW_BLOCKS_PER_ROW 16
-#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK)
-#define SHADOW_ROWS_AROUND_ADDR 2
-
static unsigned long kasan_flags;
#define KASAN_BIT_REPORTED 0
@@ -73,9 +62,14 @@ static void print_error_description(struct kasan_access_info *info)
{
pr_err("BUG: KASAN: %s in %pS\n",
get_bug_type(info), (void *)info->ip);
- pr_err("%s of size %zu at addr %px by task %s/%d\n",
- info->is_write ? "Write" : "Read", info->access_size,
- info->access_addr, current->comm, task_pid_nr(current));
+ if (info->access_size)
+ pr_err("%s of size %zu at addr %px by task %s/%d\n",
+ info->is_write ? "Write" : "Read", info->access_size,
+ info->access_addr, current->comm, task_pid_nr(current));
+ else
+ pr_err("%s at addr %px by task %s/%d\n",
+ info->is_write ? "Write" : "Read",
+ info->access_addr, current->comm, task_pid_nr(current));
}
static DEFINE_SPINLOCK(report_lock);
@@ -105,6 +99,10 @@ static void end_report(unsigned long *flags)
panic_on_warn = 0;
panic("panic_on_warn set ...\n");
}
+#ifdef CONFIG_KASAN_HW_TAGS
+ if (kasan_flag_panic)
+ panic("kasan.fault=panic set ...\n");
+#endif
kasan_enable_current();
}
@@ -167,36 +165,45 @@ static void describe_object_addr(struct kmem_cache *cache, void *object,
(void *)(object_addr + cache->object_size));
}
-static void describe_object(struct kmem_cache *cache, void *object,
- const void *addr, u8 tag)
+static void describe_object_stacks(struct kmem_cache *cache, void *object,
+ const void *addr, u8 tag)
{
- struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
+ struct kasan_alloc_meta *alloc_meta;
+ struct kasan_track *free_track;
- if (cache->flags & SLAB_KASAN) {
- struct kasan_track *free_track;
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (alloc_meta) {
+ print_track(&alloc_meta->alloc_track, "Allocated");
+ pr_err("\n");
+ }
- print_track(&alloc_info->alloc_track, "Allocated");
+ free_track = kasan_get_free_track(cache, object, tag);
+ if (free_track) {
+ print_track(free_track, "Freed");
pr_err("\n");
- free_track = kasan_get_free_track(cache, object, tag);
- if (free_track) {
- print_track(free_track, "Freed");
- pr_err("\n");
- }
+ }
#ifdef CONFIG_KASAN_GENERIC
- if (alloc_info->aux_stack[0]) {
- pr_err("Last potentially related work creation:\n");
- print_stack(alloc_info->aux_stack[0]);
- pr_err("\n");
- }
- if (alloc_info->aux_stack[1]) {
- pr_err("Second to last potentially related work creation:\n");
- print_stack(alloc_info->aux_stack[1]);
- pr_err("\n");
- }
-#endif
+ if (!alloc_meta)
+ return;
+ if (alloc_meta->aux_stack[0]) {
+ pr_err("Last potentially related work creation:\n");
+ print_stack(alloc_meta->aux_stack[0]);
+ pr_err("\n");
}
+ if (alloc_meta->aux_stack[1]) {
+ pr_err("Second to last potentially related work creation:\n");
+ print_stack(alloc_meta->aux_stack[1]);
+ pr_err("\n");
+ }
+#endif
+}
+static void describe_object(struct kmem_cache *cache, void *object,
+ const void *addr, u8 tag)
+{
+ if (kasan_stack_collection_enabled())
+ describe_object_stacks(cache, object, addr, tag);
describe_object_addr(cache, object, addr);
}
@@ -216,168 +223,6 @@ static inline bool init_task_stack_addr(const void *addr)
sizeof(init_thread_union.stack));
}
-static bool __must_check tokenize_frame_descr(const char **frame_descr,
- char *token, size_t max_tok_len,
- unsigned long *value)
-{
- const char *sep = strchr(*frame_descr, ' ');
-
- if (sep == NULL)
- sep = *frame_descr + strlen(*frame_descr);
-
- if (token != NULL) {
- const size_t tok_len = sep - *frame_descr;
-
- if (tok_len + 1 > max_tok_len) {
- pr_err("KASAN internal error: frame description too long: %s\n",
- *frame_descr);
- return false;
- }
-
- /* Copy token (+ 1 byte for '\0'). */
- strlcpy(token, *frame_descr, tok_len + 1);
- }
-
- /* Advance frame_descr past separator. */
- *frame_descr = sep + 1;
-
- if (value != NULL && kstrtoul(token, 10, value)) {
- pr_err("KASAN internal error: not a valid number: %s\n", token);
- return false;
- }
-
- return true;
-}
-
-static void print_decoded_frame_descr(const char *frame_descr)
-{
- /*
- * We need to parse the following string:
- * "n alloc_1 alloc_2 ... alloc_n"
- * where alloc_i looks like
- * "offset size len name"
- * or "offset size len name:line".
- */
-
- char token[64];
- unsigned long num_objects;
-
- if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
- &num_objects))
- return;
-
- pr_err("\n");
- pr_err("this frame has %lu %s:\n", num_objects,
- num_objects == 1 ? "object" : "objects");
-
- while (num_objects--) {
- unsigned long offset;
- unsigned long size;
-
- /* access offset */
- if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
- &offset))
- return;
- /* access size */
- if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
- &size))
- return;
- /* name length (unused) */
- if (!tokenize_frame_descr(&frame_descr, NULL, 0, NULL))
- return;
- /* object name */
- if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
- NULL))
- return;
-
- /* Strip line number; without filename it's not very helpful. */
- strreplace(token, ':', '\0');
-
- /* Finally, print object information. */
- pr_err(" [%lu, %lu) '%s'", offset, offset + size, token);
- }
-}
-
-static bool __must_check get_address_stack_frame_info(const void *addr,
- unsigned long *offset,
- const char **frame_descr,
- const void **frame_pc)
-{
- unsigned long aligned_addr;
- unsigned long mem_ptr;
- const u8 *shadow_bottom;
- const u8 *shadow_ptr;
- const unsigned long *frame;
-
- BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP));
-
- /*
- * NOTE: We currently only support printing frame information for
- * accesses to the task's own stack.
- */
- if (!object_is_on_stack(addr))
- return false;
-
- aligned_addr = round_down((unsigned long)addr, sizeof(long));
- mem_ptr = round_down(aligned_addr, KASAN_SHADOW_SCALE_SIZE);
- shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr);
- shadow_bottom = kasan_mem_to_shadow(end_of_stack(current));
-
- while (shadow_ptr >= shadow_bottom && *shadow_ptr != KASAN_STACK_LEFT) {
- shadow_ptr--;
- mem_ptr -= KASAN_SHADOW_SCALE_SIZE;
- }
-
- while (shadow_ptr >= shadow_bottom && *shadow_ptr == KASAN_STACK_LEFT) {
- shadow_ptr--;
- mem_ptr -= KASAN_SHADOW_SCALE_SIZE;
- }
-
- if (shadow_ptr < shadow_bottom)
- return false;
-
- frame = (const unsigned long *)(mem_ptr + KASAN_SHADOW_SCALE_SIZE);
- if (frame[0] != KASAN_CURRENT_STACK_FRAME_MAGIC) {
- pr_err("KASAN internal error: frame info validation failed; invalid marker: %lu\n",
- frame[0]);
- return false;
- }
-
- *offset = (unsigned long)addr - (unsigned long)frame;
- *frame_descr = (const char *)frame[1];
- *frame_pc = (void *)frame[2];
-
- return true;
-}
-
-static void print_address_stack_frame(const void *addr)
-{
- unsigned long offset;
- const char *frame_descr;
- const void *frame_pc;
-
- if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- return;
-
- if (!get_address_stack_frame_info(addr, &offset, &frame_descr,
- &frame_pc))
- return;
-
- /*
- * get_address_stack_frame_info only returns true if the given addr is
- * on the current task's stack.
- */
- pr_err("\n");
- pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n",
- addr, current->comm, task_pid_nr(current), offset);
- pr_err(" %pS\n", frame_pc);
-
- if (!frame_descr)
- return;
-
- print_decoded_frame_descr(frame_descr);
-}
-
static void print_address_description(void *addr, u8 tag)
{
struct page *page = kasan_addr_to_page(addr);
@@ -405,62 +250,68 @@ static void print_address_description(void *addr, u8 tag)
print_address_stack_frame(addr);
}
-static bool row_is_guilty(const void *row, const void *guilty)
+static bool meta_row_is_guilty(const void *row, const void *addr)
{
- return (row <= guilty) && (guilty < row + SHADOW_BYTES_PER_ROW);
+ return (row <= addr) && (addr < row + META_MEM_BYTES_PER_ROW);
}
-static int shadow_pointer_offset(const void *row, const void *shadow)
+static int meta_pointer_offset(const void *row, const void *addr)
{
- /* The length of ">ff00ff00ff00ff00: " is
- * 3 + (BITS_PER_LONG/8)*2 chars.
+ /*
+ * Memory state around the buggy address:
+ * ff00ff00ff00ff00: 00 00 00 05 fe fe fe fe fe fe fe fe fe fe fe fe
+ * ...
+ *
+ * The length of ">ff00ff00ff00ff00: " is
+ * 3 + (BITS_PER_LONG / 8) * 2 chars.
+	 * Each granule's metadata takes 2 characters in the dump, plus 1
+	 * for the separating space.
*/
- return 3 + (BITS_PER_LONG/8)*2 + (shadow - row)*2 +
- (shadow - row) / SHADOW_BYTES_PER_BLOCK + 1;
+ return 3 + (BITS_PER_LONG / 8) * 2 +
+ (addr - row) / KASAN_GRANULE_SIZE * 3 + 1;
}
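A worked example of the offset computed above (userspace sketch; 64-bit BITS_PER_LONG and an 8-byte granule are assumptions):

#include <stdio.h>

#define BITS_PER_LONG		64
#define KASAN_GRANULE_SIZE	8UL

static int meta_pointer_offset(unsigned long row, unsigned long addr)
{
	/*
	 * 3 chars for the '>' and ": " around the address, 16 for the
	 * address itself, then 3 chars ("xx ") per preceding granule,
	 * plus 1 so the caret lands on the granule's first hex digit.
	 */
	return 3 + (BITS_PER_LONG / 8) * 2 +
	       (addr - row) / KASAN_GRANULE_SIZE * 3 + 1;
}

int main(void)
{
	unsigned long row = 0xffff000012345600UL;

	/* A bad access in the fourth granule of the row: column 29. */
	printf("caret column: %d\n", meta_pointer_offset(row, row + 3 * 8));
	return 0;
}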
-static void print_shadow_for_address(const void *addr)
+static void print_memory_metadata(const void *addr)
{
int i;
- const void *shadow = kasan_mem_to_shadow(addr);
- const void *shadow_row;
+ void *row;
- shadow_row = (void *)round_down((unsigned long)shadow,
- SHADOW_BYTES_PER_ROW)
- - SHADOW_ROWS_AROUND_ADDR * SHADOW_BYTES_PER_ROW;
+ row = (void *)round_down((unsigned long)addr, META_MEM_BYTES_PER_ROW)
+ - META_ROWS_AROUND_ADDR * META_MEM_BYTES_PER_ROW;
pr_err("Memory state around the buggy address:\n");
- for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) {
- const void *kaddr = kasan_shadow_to_mem(shadow_row);
- char buffer[4 + (BITS_PER_LONG/8)*2];
- char shadow_buf[SHADOW_BYTES_PER_ROW];
+ for (i = -META_ROWS_AROUND_ADDR; i <= META_ROWS_AROUND_ADDR; i++) {
+ char buffer[4 + (BITS_PER_LONG / 8) * 2];
+ char metadata[META_BYTES_PER_ROW];
snprintf(buffer, sizeof(buffer),
- (i == 0) ? ">%px: " : " %px: ", kaddr);
+ (i == 0) ? ">%px: " : " %px: ", row);
+
/*
		 * We should not pass a shadow pointer to a generic
		 * function, because generic functions may try to
		 * access the KASAN mapping for the passed address.
*/
- memcpy(shadow_buf, shadow_row, SHADOW_BYTES_PER_ROW);
+ metadata_fetch_row(&metadata[0], row);
+
print_hex_dump(KERN_ERR, buffer,
- DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1,
- shadow_buf, SHADOW_BYTES_PER_ROW, 0);
+ DUMP_PREFIX_NONE, META_BYTES_PER_ROW, 1,
+ metadata, META_BYTES_PER_ROW, 0);
- if (row_is_guilty(shadow_row, shadow))
- pr_err("%*c\n",
- shadow_pointer_offset(shadow_row, shadow),
- '^');
+ if (meta_row_is_guilty(row, addr))
+ pr_err("%*c\n", meta_pointer_offset(row, addr), '^');
- shadow_row += SHADOW_BYTES_PER_ROW;
+ row += META_MEM_BYTES_PER_ROW;
}
}
static bool report_enabled(void)
{
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
if (current->kasan_depth)
return false;
+#endif
if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
return true;
return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
@@ -490,7 +341,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip)
unsigned long flags;
u8 tag = get_tag(object);
- object = reset_tag(object);
+ object = kasan_reset_tag(object);
#if IS_ENABLED(CONFIG_KUNIT)
if (current->kunit_test)
@@ -503,7 +354,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip)
pr_err("\n");
print_address_description(object, tag);
pr_err("\n");
- print_shadow_for_address(object);
+ print_memory_metadata(object);
end_report(&flags);
}
@@ -523,10 +374,10 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write,
disable_trace_on_warning();
tagged_addr = (void *)addr;
- untagged_addr = reset_tag(tagged_addr);
+ untagged_addr = kasan_reset_tag(tagged_addr);
info.access_addr = tagged_addr;
- if (addr_has_shadow(untagged_addr))
+ if (addr_has_metadata(untagged_addr))
info.first_bad_addr = find_first_bad_addr(tagged_addr, size);
else
info.first_bad_addr = untagged_addr;
@@ -537,14 +388,14 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write,
start_report(&flags);
print_error_description(&info);
- if (addr_has_shadow(untagged_addr))
+ if (addr_has_metadata(untagged_addr))
print_tags(get_tag(tagged_addr), info.first_bad_addr);
pr_err("\n");
- if (addr_has_shadow(untagged_addr)) {
+ if (addr_has_metadata(untagged_addr)) {
print_address_description(untagged_addr, get_tag(tagged_addr));
pr_err("\n");
- print_shadow_for_address(info.first_bad_addr);
+ print_memory_metadata(info.first_bad_addr);
} else {
dump_stack();
}
@@ -604,6 +455,6 @@ void kasan_non_canonical_hook(unsigned long addr)
else
bug_type = "maybe wild-memory-access";
pr_alert("KASAN: %s in range [0x%016lx-0x%016lx]\n", bug_type,
- orig_addr, orig_addr + KASAN_SHADOW_MASK);
+ orig_addr, orig_addr + KASAN_GRANULE_SIZE - 1);
}
#endif
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
new file mode 100644
index 000000000000..8a9c889872da
--- /dev/null
+++ b/mm/kasan/report_generic.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains generic KASAN specific error reporting code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ */
+
+#include <linux/bitops.h>
+#include <linux/ftrace.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/slab.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+#include <linux/module.h>
+
+#include <asm/sections.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+void *find_first_bad_addr(void *addr, size_t size)
+{
+ void *p = addr;
+
+ while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p)))
+ p += KASAN_GRANULE_SIZE;
+ return p;
+}
+
+static const char *get_shadow_bug_type(struct kasan_access_info *info)
+{
+ const char *bug_type = "unknown-crash";
+ u8 *shadow_addr;
+
+ shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
+
+ /*
+ * If shadow byte value is in [0, KASAN_GRANULE_SIZE) we can look
+ * at the next shadow byte to determine the type of the bad access.
+ */
+ if (*shadow_addr > 0 && *shadow_addr <= KASAN_GRANULE_SIZE - 1)
+ shadow_addr++;
+
+ switch (*shadow_addr) {
+ case 0 ... KASAN_GRANULE_SIZE - 1:
+ /*
+ * In theory it's still possible to see these shadow values
+ * due to a data race in the kernel code.
+ */
+ bug_type = "out-of-bounds";
+ break;
+ case KASAN_PAGE_REDZONE:
+ case KASAN_KMALLOC_REDZONE:
+ bug_type = "slab-out-of-bounds";
+ break;
+ case KASAN_GLOBAL_REDZONE:
+ bug_type = "global-out-of-bounds";
+ break;
+ case KASAN_STACK_LEFT:
+ case KASAN_STACK_MID:
+ case KASAN_STACK_RIGHT:
+ case KASAN_STACK_PARTIAL:
+ bug_type = "stack-out-of-bounds";
+ break;
+ case KASAN_FREE_PAGE:
+ case KASAN_KMALLOC_FREE:
+ case KASAN_KMALLOC_FREETRACK:
+ bug_type = "use-after-free";
+ break;
+ case KASAN_ALLOCA_LEFT:
+ case KASAN_ALLOCA_RIGHT:
+ bug_type = "alloca-out-of-bounds";
+ break;
+ case KASAN_VMALLOC_INVALID:
+ bug_type = "vmalloc-out-of-bounds";
+ break;
+ }
+
+ return bug_type;
+}
+
+static const char *get_wild_bug_type(struct kasan_access_info *info)
+{
+ const char *bug_type = "unknown-crash";
+
+ if ((unsigned long)info->access_addr < PAGE_SIZE)
+ bug_type = "null-ptr-deref";
+ else if ((unsigned long)info->access_addr < TASK_SIZE)
+ bug_type = "user-memory-access";
+ else
+ bug_type = "wild-memory-access";
+
+ return bug_type;
+}
+
+const char *get_bug_type(struct kasan_access_info *info)
+{
+ /*
+	 * If the original access_size was negative, the cast to size_t turns
+	 * it into a huge value (larger than ULONG_MAX/2), which makes
+	 * access_addr + access_size wrap around. Classify such accesses as
+	 * out-of-bounds.
+ */
+ if (info->access_addr + info->access_size < info->access_addr)
+ return "out-of-bounds";
+
+ if (addr_has_metadata(info->access_addr))
+ return get_shadow_bug_type(info);
+ return get_wild_bug_type(info);
+}
+
+void metadata_fetch_row(char *buffer, void *row)
+{
+ memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW);
+}
+
+#if CONFIG_KASAN_STACK
+static bool __must_check tokenize_frame_descr(const char **frame_descr,
+ char *token, size_t max_tok_len,
+ unsigned long *value)
+{
+ const char *sep = strchr(*frame_descr, ' ');
+
+ if (sep == NULL)
+ sep = *frame_descr + strlen(*frame_descr);
+
+ if (token != NULL) {
+ const size_t tok_len = sep - *frame_descr;
+
+ if (tok_len + 1 > max_tok_len) {
+ pr_err("KASAN internal error: frame description too long: %s\n",
+ *frame_descr);
+ return false;
+ }
+
+ /* Copy token (+ 1 byte for '\0'). */
+ strlcpy(token, *frame_descr, tok_len + 1);
+ }
+
+ /* Advance frame_descr past separator. */
+ *frame_descr = sep + 1;
+
+ if (value != NULL && kstrtoul(token, 10, value)) {
+ pr_err("KASAN internal error: not a valid number: %s\n", token);
+ return false;
+ }
+
+ return true;
+}
+
+static void print_decoded_frame_descr(const char *frame_descr)
+{
+ /*
+ * We need to parse the following string:
+ * "n alloc_1 alloc_2 ... alloc_n"
+ * where alloc_i looks like
+ * "offset size len name"
+ * or "offset size len name:line".
+ */
+
+ char token[64];
+ unsigned long num_objects;
+
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ &num_objects))
+ return;
+
+ pr_err("\n");
+ pr_err("this frame has %lu %s:\n", num_objects,
+ num_objects == 1 ? "object" : "objects");
+
+ while (num_objects--) {
+ unsigned long offset;
+ unsigned long size;
+
+ /* access offset */
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ &offset))
+ return;
+ /* access size */
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ &size))
+ return;
+ /* name length (unused) */
+ if (!tokenize_frame_descr(&frame_descr, NULL, 0, NULL))
+ return;
+ /* object name */
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ NULL))
+ return;
+
+ /* Strip line number; without filename it's not very helpful. */
+ strreplace(token, ':', '\0');
+
+ /* Finally, print object information. */
+ pr_err(" [%lu, %lu) '%s'", offset, offset + size, token);
+ }
+}
+
+static bool __must_check get_address_stack_frame_info(const void *addr,
+ unsigned long *offset,
+ const char **frame_descr,
+ const void **frame_pc)
+{
+ unsigned long aligned_addr;
+ unsigned long mem_ptr;
+ const u8 *shadow_bottom;
+ const u8 *shadow_ptr;
+ const unsigned long *frame;
+
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP));
+
+ /*
+ * NOTE: We currently only support printing frame information for
+ * accesses to the task's own stack.
+ */
+ if (!object_is_on_stack(addr))
+ return false;
+
+ aligned_addr = round_down((unsigned long)addr, sizeof(long));
+ mem_ptr = round_down(aligned_addr, KASAN_GRANULE_SIZE);
+ shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr);
+ shadow_bottom = kasan_mem_to_shadow(end_of_stack(current));
+
+ while (shadow_ptr >= shadow_bottom && *shadow_ptr != KASAN_STACK_LEFT) {
+ shadow_ptr--;
+ mem_ptr -= KASAN_GRANULE_SIZE;
+ }
+
+ while (shadow_ptr >= shadow_bottom && *shadow_ptr == KASAN_STACK_LEFT) {
+ shadow_ptr--;
+ mem_ptr -= KASAN_GRANULE_SIZE;
+ }
+
+ if (shadow_ptr < shadow_bottom)
+ return false;
+
+ frame = (const unsigned long *)(mem_ptr + KASAN_GRANULE_SIZE);
+ if (frame[0] != KASAN_CURRENT_STACK_FRAME_MAGIC) {
+ pr_err("KASAN internal error: frame info validation failed; invalid marker: %lu\n",
+ frame[0]);
+ return false;
+ }
+
+ *offset = (unsigned long)addr - (unsigned long)frame;
+ *frame_descr = (const char *)frame[1];
+ *frame_pc = (void *)frame[2];
+
+ return true;
+}
+
+void print_address_stack_frame(const void *addr)
+{
+ unsigned long offset;
+ const char *frame_descr;
+ const void *frame_pc;
+
+ if (!get_address_stack_frame_info(addr, &offset, &frame_descr,
+ &frame_pc))
+ return;
+
+ /*
+ * get_address_stack_frame_info only returns true if the given addr is
+ * on the current task's stack.
+ */
+ pr_err("\n");
+ pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n",
+ addr, current->comm, task_pid_nr(current), offset);
+ pr_err(" %pS\n", frame_pc);
+
+ if (!frame_descr)
+ return;
+
+ print_decoded_frame_descr(frame_descr);
+}
+#endif /* CONFIG_KASAN_STACK */
+
+#define DEFINE_ASAN_REPORT_LOAD(size) \
+void __asan_report_load##size##_noabort(unsigned long addr) \
+{ \
+ kasan_report(addr, size, false, _RET_IP_); \
+} \
+EXPORT_SYMBOL(__asan_report_load##size##_noabort)
+
+#define DEFINE_ASAN_REPORT_STORE(size) \
+void __asan_report_store##size##_noabort(unsigned long addr) \
+{ \
+ kasan_report(addr, size, true, _RET_IP_); \
+} \
+EXPORT_SYMBOL(__asan_report_store##size##_noabort)
+
+DEFINE_ASAN_REPORT_LOAD(1);
+DEFINE_ASAN_REPORT_LOAD(2);
+DEFINE_ASAN_REPORT_LOAD(4);
+DEFINE_ASAN_REPORT_LOAD(8);
+DEFINE_ASAN_REPORT_LOAD(16);
+DEFINE_ASAN_REPORT_STORE(1);
+DEFINE_ASAN_REPORT_STORE(2);
+DEFINE_ASAN_REPORT_STORE(4);
+DEFINE_ASAN_REPORT_STORE(8);
+DEFINE_ASAN_REPORT_STORE(16);
+
+void __asan_report_load_n_noabort(unsigned long addr, size_t size)
+{
+ kasan_report(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_load_n_noabort);
+
+void __asan_report_store_n_noabort(unsigned long addr, size_t size)
+{
+ kasan_report(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_store_n_noabort);
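The generic report code above classifies bugs from shadow byte values, where each shadow byte describes one KASAN_GRANULE_SIZE-sized granule of memory. The userspace sketch below only illustrates that encoding with assumed constants; it is not taken from this patch.

#include <stdbool.h>
#include <stdint.h>

#define GRANULE_SIZE 8	/* assumed KASAN_GRANULE_SIZE */

/*
 * Shadow byte 0: granule fully accessible; 1..GRANULE_SIZE-1: only that
 * many leading bytes are accessible; "negative" values (0x80 and up as
 * u8) are the redzone/free markers that get_shadow_bug_type() turns
 * into bug-type strings.
 */
bool granule_access_ok(int8_t shadow, unsigned int offset)
{
	if (shadow == 0)
		return true;			/* whole granule accessible */
	if (shadow < 0)
		return false;			/* redzone or freed memory */
	return offset < (unsigned int)shadow;	/* partially accessible */
}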
diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c
new file mode 100644
index 000000000000..57114f0e14d1
--- /dev/null
+++ b/mm/kasan/report_hw_tags.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains hardware tag-based KASAN specific error reporting code.
+ *
+ * Copyright (c) 2020 Google, Inc.
+ * Author: Andrey Konovalov <andreyknvl@google.com>
+ */
+
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "kasan.h"
+
+const char *get_bug_type(struct kasan_access_info *info)
+{
+ return "invalid-access";
+}
+
+void *find_first_bad_addr(void *addr, size_t size)
+{
+ return kasan_reset_tag(addr);
+}
+
+void metadata_fetch_row(char *buffer, void *row)
+{
+ int i;
+
+ for (i = 0; i < META_BYTES_PER_ROW; i++)
+ buffer[i] = hw_get_mem_tag(row + i * KASAN_GRANULE_SIZE);
+}
+
+void print_tags(u8 addr_tag, const void *addr)
+{
+ u8 memory_tag = hw_get_mem_tag((void *)addr);
+
+ pr_err("Pointer tag: [%02x], memory tag: [%02x]\n",
+ addr_tag, memory_tag);
+}
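The hardware tag-based report path only needs to print the pointer tag next to the memory tag, because the mismatch itself is detected by MTE. A rough sketch of the comparison semantics follows; the match-all tag value is an assumption, not something defined in this hunk.

#include <stdbool.h>
#include <stdint.h>

#define TAG_KERNEL 0xff	/* assumed match-all tag */

/* An access is flagged when a real (non match-all) pointer tag differs
 * from the tag stored for the granule being accessed. */
bool tag_check_ok(uint8_t ptr_tag, uint8_t mem_tag)
{
	return ptr_tag == TAG_KERNEL || ptr_tag == mem_tag;
}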
diff --git a/mm/kasan/tags_report.c b/mm/kasan/report_sw_tags.c
index bee43717d6f0..1b026793ad57 100644
--- a/mm/kasan/tags_report.c
+++ b/mm/kasan/report_sw_tags.c
@@ -1,17 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This file contains tag-based KASAN specific error reporting code.
+ * This file contains software tag-based KASAN specific error reporting code.
*
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
*
* Some code borrowed from https://github.com/xairy/kasan-prototype by
* Andrey Konovalov <andreyknvl@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#include <linux/bitops.h>
@@ -46,16 +41,19 @@ const char *get_bug_type(struct kasan_access_info *info)
int i;
tag = get_tag(info->access_addr);
- addr = reset_tag(info->access_addr);
+ addr = kasan_reset_tag(info->access_addr);
page = kasan_addr_to_page(addr);
if (page && PageSlab(page)) {
cache = page->slab_cache;
object = nearest_obj(cache, page, (void *)addr);
- alloc_meta = get_alloc_info(cache, object);
+ alloc_meta = kasan_get_alloc_meta(cache, object);
- for (i = 0; i < KASAN_NR_FREE_STACKS; i++)
- if (alloc_meta->free_pointer_tag[i] == tag)
- return "use-after-free";
+ if (alloc_meta) {
+ for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
+ if (alloc_meta->free_pointer_tag[i] == tag)
+ return "use-after-free";
+ }
+ }
return "out-of-bounds";
}
@@ -77,14 +75,19 @@ const char *get_bug_type(struct kasan_access_info *info)
void *find_first_bad_addr(void *addr, size_t size)
{
u8 tag = get_tag(addr);
- void *p = reset_tag(addr);
+ void *p = kasan_reset_tag(addr);
void *end = p + size;
while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p))
- p += KASAN_SHADOW_SCALE_SIZE;
+ p += KASAN_GRANULE_SIZE;
return p;
}
+void metadata_fetch_row(char *buffer, void *row)
+{
+ memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW);
+}
+
void print_tags(u8 addr_tag, const void *addr)
{
u8 *shadow = (u8 *)kasan_mem_to_shadow(addr);
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
new file mode 100644
index 000000000000..e9efe88f7679
--- /dev/null
+++ b/mm/kasan/shadow.c
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains KASAN runtime code that manages shadow memory for
+ * generic and software tag-based KASAN modes.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ */
+
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/kfence.h>
+#include <linux/kmemleak.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+#include "kasan.h"
+
+bool __kasan_check_read(const volatile void *p, unsigned int size)
+{
+ return check_memory_region((unsigned long)p, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__kasan_check_read);
+
+bool __kasan_check_write(const volatile void *p, unsigned int size)
+{
+ return check_memory_region((unsigned long)p, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__kasan_check_write);
+
+#undef memset
+void *memset(void *addr, int c, size_t len)
+{
+ if (!check_memory_region((unsigned long)addr, len, true, _RET_IP_))
+ return NULL;
+
+ return __memset(addr, c, len);
+}
+
+#ifdef __HAVE_ARCH_MEMMOVE
+#undef memmove
+void *memmove(void *dest, const void *src, size_t len)
+{
+ if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
+ !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
+ return NULL;
+
+ return __memmove(dest, src, len);
+}
+#endif
+
+#undef memcpy
+void *memcpy(void *dest, const void *src, size_t len)
+{
+ if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
+ !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
+ return NULL;
+
+ return __memcpy(dest, src, len);
+}
+
+/*
+ * Poisons the shadow memory for 'size' bytes starting from 'addr'.
+ * Memory addresses should be aligned to KASAN_GRANULE_SIZE.
+ */
+void poison_range(const void *address, size_t size, u8 value)
+{
+ void *shadow_start, *shadow_end;
+
+ /*
+ * Perform shadow offset calculation based on untagged address, as
+ * some of the callers (e.g. kasan_poison_object_data) pass tagged
+ * addresses to this function.
+ */
+ address = kasan_reset_tag(address);
+ size = round_up(size, KASAN_GRANULE_SIZE);
+
+ /* Skip KFENCE memory if called explicitly outside of sl*b. */
+ if (is_kfence_address(address))
+ return;
+
+ shadow_start = kasan_mem_to_shadow(address);
+ shadow_end = kasan_mem_to_shadow(address + size);
+
+ __memset(shadow_start, value, shadow_end - shadow_start);
+}
+
+void unpoison_range(const void *address, size_t size)
+{
+ u8 tag = get_tag(address);
+
+ /*
+ * Perform shadow offset calculation based on untagged address, as
+ * some of the callers (e.g. kasan_unpoison_object_data) pass tagged
+ * addresses to this function.
+ */
+ address = kasan_reset_tag(address);
+
+ /*
+ * Skip KFENCE memory if called explicitly outside of sl*b. Also note
+ * that calls to ksize(), where size is not a multiple of machine-word
+ * size, would otherwise poison the invalid portion of the word.
+ */
+ if (is_kfence_address(address))
+ return;
+
+ poison_range(address, size, tag);
+
+ if (size & KASAN_GRANULE_MASK) {
+ u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
+
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ *shadow = tag;
+ else /* CONFIG_KASAN_GENERIC */
+ *shadow = size & KASAN_GRANULE_MASK;
+ }
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static bool shadow_mapped(unsigned long addr)
+{
+ pgd_t *pgd = pgd_offset_k(addr);
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ if (pgd_none(*pgd))
+ return false;
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ return false;
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
+ return false;
+
+ /*
+	 * We can't use pud_large() or pud_huge(): the former is
+	 * arch-specific and the latter depends on HUGETLB_PAGE. So let's
+	 * abuse pud_bad(): if the pud is bad, it's bad because it's huge.
+ */
+ if (pud_bad(*pud))
+ return true;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ return false;
+
+ if (pmd_bad(*pmd))
+ return true;
+ pte = pte_offset_kernel(pmd, addr);
+ return !pte_none(*pte);
+}
+
+static int __meminit kasan_mem_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct memory_notify *mem_data = data;
+ unsigned long nr_shadow_pages, start_kaddr, shadow_start;
+ unsigned long shadow_end, shadow_size;
+
+ nr_shadow_pages = mem_data->nr_pages >> KASAN_SHADOW_SCALE_SHIFT;
+ start_kaddr = (unsigned long)pfn_to_kaddr(mem_data->start_pfn);
+ shadow_start = (unsigned long)kasan_mem_to_shadow((void *)start_kaddr);
+ shadow_size = nr_shadow_pages << PAGE_SHIFT;
+ shadow_end = shadow_start + shadow_size;
+
+ if (WARN_ON(mem_data->nr_pages % KASAN_GRANULE_SIZE) ||
+ WARN_ON(start_kaddr % KASAN_MEMORY_PER_SHADOW_PAGE))
+ return NOTIFY_BAD;
+
+ switch (action) {
+ case MEM_GOING_ONLINE: {
+ void *ret;
+
+ /*
+		 * If the shadow is already mapped, then it must have been
+		 * mapped during boot. This can happen when onlining
+		 * previously offlined memory.
+ */
+ if (shadow_mapped(shadow_start))
+ return NOTIFY_OK;
+
+ ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start,
+ shadow_end, GFP_KERNEL,
+ PAGE_KERNEL, VM_NO_GUARD,
+ pfn_to_nid(mem_data->start_pfn),
+ __builtin_return_address(0));
+ if (!ret)
+ return NOTIFY_BAD;
+
+ kmemleak_ignore(ret);
+ return NOTIFY_OK;
+ }
+ case MEM_CANCEL_ONLINE:
+ case MEM_OFFLINE: {
+ struct vm_struct *vm;
+
+ /*
+ * shadow_start was either mapped during boot by kasan_init()
+ * or during memory online by __vmalloc_node_range().
+ * In the latter case we can use vfree() to free shadow.
+ * Non-NULL result of the find_vm_area() will tell us if
+ * that was the second case.
+ *
+ * Currently it's not possible to free shadow mapped
+ * during boot by kasan_init(). It's because the code
+ * to do that hasn't been written yet. So we'll just
+ * leak the memory.
+ */
+ vm = find_vm_area((void *)shadow_start);
+ if (vm)
+ vfree((void *)shadow_start);
+ }
+ }
+
+ return NOTIFY_OK;
+}
+
+static int __init kasan_memhotplug_init(void)
+{
+ hotplug_memory_notifier(kasan_mem_notifier, 0);
+
+ return 0;
+}
+
+core_initcall(kasan_memhotplug_init);
+#endif
+
+#ifdef CONFIG_KASAN_VMALLOC
+
+static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
+ void *unused)
+{
+ unsigned long page;
+ pte_t pte;
+
+ if (likely(!pte_none(*ptep)))
+ return 0;
+
+ page = __get_free_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
+ pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
+
+ spin_lock(&init_mm.page_table_lock);
+ if (likely(pte_none(*ptep))) {
+ set_pte_at(&init_mm, addr, ptep, pte);
+ page = 0;
+ }
+ spin_unlock(&init_mm.page_table_lock);
+ if (page)
+ free_page(page);
+ return 0;
+}
+
+int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
+{
+ unsigned long shadow_start, shadow_end;
+ int ret;
+
+ if (!is_vmalloc_or_module_addr((void *)addr))
+ return 0;
+
+ shadow_start = (unsigned long)kasan_mem_to_shadow((void *)addr);
+ shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE);
+ shadow_end = (unsigned long)kasan_mem_to_shadow((void *)addr + size);
+ shadow_end = ALIGN(shadow_end, PAGE_SIZE);
+
+ ret = apply_to_page_range(&init_mm, shadow_start,
+ shadow_end - shadow_start,
+ kasan_populate_vmalloc_pte, NULL);
+ if (ret)
+ return ret;
+
+ flush_cache_vmap(shadow_start, shadow_end);
+
+ /*
+ * We need to be careful about inter-cpu effects here. Consider:
+ *
+ * CPU#0 CPU#1
+ * WRITE_ONCE(p, vmalloc(100)); while (x = READ_ONCE(p)) ;
+ * p[99] = 1;
+ *
+ * With compiler instrumentation, that ends up looking like this:
+ *
+ * CPU#0 CPU#1
+ * // vmalloc() allocates memory
+ * // let a = area->addr
+ * // we reach kasan_populate_vmalloc
+ * // and call unpoison_range:
+ * STORE shadow(a), unpoison_val
+ * ...
+ * STORE shadow(a+99), unpoison_val x = LOAD p
+ * // rest of vmalloc process <data dependency>
+ * STORE p, a LOAD shadow(x+99)
+ *
+	 * If there is no barrier between the end of unpoisoning the shadow
+ * and the store of the result to p, the stores could be committed
+ * in a different order by CPU#0, and CPU#1 could erroneously observe
+ * poison in the shadow.
+ *
+ * We need some sort of barrier between the stores.
+ *
+ * In the vmalloc() case, this is provided by a smp_wmb() in
+ * clear_vm_uninitialized_flag(). In the per-cpu allocator and in
+ * get_vm_area() and friends, the caller gets shadow allocated but
+ * doesn't have any pages mapped into the virtual address space that
+ * has been reserved. Mapping those pages in will involve taking and
+ * releasing a page-table lock, which will provide the barrier.
+ */
+
+ return 0;
+}
+
+/*
+ * Poison the shadow for a vmalloc region. Called as part of the
+ * freeing process at the time the region is freed.
+ */
+void kasan_poison_vmalloc(const void *start, unsigned long size)
+{
+ if (!is_vmalloc_or_module_addr(start))
+ return;
+
+ size = round_up(size, KASAN_GRANULE_SIZE);
+ poison_range(start, size, KASAN_VMALLOC_INVALID);
+}
+
+void kasan_unpoison_vmalloc(const void *start, unsigned long size)
+{
+ if (!is_vmalloc_or_module_addr(start))
+ return;
+
+ unpoison_range(start, size);
+}
+
+static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
+ void *unused)
+{
+ unsigned long page;
+
+ page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT);
+
+ spin_lock(&init_mm.page_table_lock);
+
+ if (likely(!pte_none(*ptep))) {
+ pte_clear(&init_mm, addr, ptep);
+ free_page(page);
+ }
+ spin_unlock(&init_mm.page_table_lock);
+
+ return 0;
+}
+
+/*
+ * Release the backing for the vmalloc region [start, end), which
+ * lies within the free region [free_region_start, free_region_end).
+ *
+ * This can be run lazily, long after the region was freed. It runs
+ * under vmap_area_lock, so it's not safe to interact with the vmalloc/vmap
+ * infrastructure.
+ *
+ * How does this work?
+ * -------------------
+ *
+ * We have a region that is page aligned, labelled as A.
+ * That might not map onto the shadow in a way that is page-aligned:
+ *
+ * start end
+ * v v
+ * |????????|????????|AAAAAAAA|AA....AA|AAAAAAAA|????????| < vmalloc
+ * -------- -------- -------- -------- --------
+ * | | | | |
+ * | | | /-------/ |
+ * \-------\|/------/ |/---------------/
+ * ||| ||
+ * |??AAAAAA|AAAAAAAA|AA??????| < shadow
+ * (1) (2) (3)
+ *
+ * First we align the start upwards and the end downwards, so that the
+ * shadow of the region aligns with shadow page boundaries. In the
+ * example, this gives us the shadow page (2). This is the shadow entirely
+ * covered by this allocation.
+ *
+ * Then we have the tricky bits. We want to know if we can free the
+ * partially covered shadow pages - (1) and (3) in the example. For this,
+ * we are given the start and end of the free region that contains this
+ * allocation. Extending our previous example, we could have:
+ *
+ * free_region_start free_region_end
+ * | start end |
+ * v v v v
+ * |FFFFFFFF|FFFFFFFF|AAAAAAAA|AA....AA|AAAAAAAA|FFFFFFFF| < vmalloc
+ * -------- -------- -------- -------- --------
+ * | | | | |
+ * | | | /-------/ |
+ * \-------\|/------/ |/---------------/
+ * ||| ||
+ * |FFAAAAAA|AAAAAAAA|AAF?????| < shadow
+ * (1) (2) (3)
+ *
+ * Once again, we align the start of the free region up, and the end of
+ * the free region down so that the shadow is page aligned. So we can free
+ * page (1) - we know no allocation currently uses anything in that page,
+ * because all of it is in the vmalloc free region. But we cannot free
+ * page (3), because we can't be sure that the rest of it is unused.
+ *
+ * We only consider pages that contain part of the original region for
+ * freeing: we don't try to free other pages from the free region or we'd
+ * end up trying to free huge chunks of virtual address space.
+ *
+ * Concurrency
+ * -----------
+ *
+ * How do we know that we're not freeing a page that is simultaneously
+ * being used for a fresh allocation in kasan_populate_vmalloc(_pte)?
+ *
+ * We _can_ have kasan_release_vmalloc and kasan_populate_vmalloc running
+ * at the same time. While we run under free_vmap_area_lock, the population
+ * code does not.
+ *
+ * free_vmap_area_lock instead operates to ensure that the larger range
+ * [free_region_start, free_region_end) is safe: because __alloc_vmap_area and
+ * the per-cpu region-finding algorithm both run under free_vmap_area_lock,
+ * no space identified as free will become used while we are running. This
+ * means that so long as we are careful with alignment and only free shadow
+ * pages entirely covered by the free region, we will not run in to any
+ * trouble - any simultaneous allocations will be for disjoint regions.
+ */
+void kasan_release_vmalloc(unsigned long start, unsigned long end,
+ unsigned long free_region_start,
+ unsigned long free_region_end)
+{
+ void *shadow_start, *shadow_end;
+ unsigned long region_start, region_end;
+ unsigned long size;
+
+ region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE);
+ region_end = ALIGN_DOWN(end, KASAN_MEMORY_PER_SHADOW_PAGE);
+
+ free_region_start = ALIGN(free_region_start, KASAN_MEMORY_PER_SHADOW_PAGE);
+
+ if (start != region_start &&
+ free_region_start < region_start)
+ region_start -= KASAN_MEMORY_PER_SHADOW_PAGE;
+
+ free_region_end = ALIGN_DOWN(free_region_end, KASAN_MEMORY_PER_SHADOW_PAGE);
+
+ if (end != region_end &&
+ free_region_end > region_end)
+ region_end += KASAN_MEMORY_PER_SHADOW_PAGE;
+
+ shadow_start = kasan_mem_to_shadow((void *)region_start);
+ shadow_end = kasan_mem_to_shadow((void *)region_end);
+
+ if (shadow_end > shadow_start) {
+ size = shadow_end - shadow_start;
+ apply_to_existing_page_range(&init_mm,
+ (unsigned long)shadow_start,
+ size, kasan_depopulate_vmalloc_pte,
+ NULL);
+ flush_tlb_kernel_range((unsigned long)shadow_start,
+ (unsigned long)shadow_end);
+ }
+}
+
+#else /* CONFIG_KASAN_VMALLOC */
+
+int kasan_module_alloc(void *addr, size_t size)
+{
+ void *ret;
+ size_t scaled_size;
+ size_t shadow_size;
+ unsigned long shadow_start;
+
+ shadow_start = (unsigned long)kasan_mem_to_shadow(addr);
+ scaled_size = (size + KASAN_GRANULE_SIZE - 1) >>
+ KASAN_SHADOW_SCALE_SHIFT;
+ shadow_size = round_up(scaled_size, PAGE_SIZE);
+
+ if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
+ return -EINVAL;
+
+ ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
+ shadow_start + shadow_size,
+ GFP_KERNEL,
+ PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
+ __builtin_return_address(0));
+
+ if (ret) {
+ __memset(ret, KASAN_SHADOW_INIT, shadow_size);
+ find_vm_area(addr)->flags |= VM_KASAN;
+ kmemleak_ignore(ret);
+ return 0;
+ }
+
+ return -ENOMEM;
+}
+
+void kasan_free_shadow(const struct vm_struct *vm)
+{
+ if (vm->flags & VM_KASAN)
+ vfree(kasan_mem_to_shadow(vm->addr));
+}
+
+#endif
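shadow.c is built around the linear memory-to-shadow translation: one shadow byte covers KASAN_GRANULE_SIZE bytes of memory at a fixed offset. The stand-alone sketch below shows the shape of that mapping; the scale and offset constants are placeholders, not the kernel's real KASAN_SHADOW_OFFSET.

#include <stdint.h>

#define SHADOW_SCALE_SHIFT	3			/* 8-byte granules (assumed) */
#define SHADOW_OFFSET		0xdffffc0000000000UL	/* placeholder offset */

/* Same shape as kasan_mem_to_shadow(): scale the address down by the
 * granule size and add the fixed shadow offset. */
static inline uint8_t *mem_to_shadow(uintptr_t addr)
{
	return (uint8_t *)((addr >> SHADOW_SCALE_SHIFT) + SHADOW_OFFSET);
}

/* Inverse mapping, analogous to kasan_shadow_to_mem(). */
static inline uintptr_t shadow_to_mem(const uint8_t *shadow)
{
	return ((uintptr_t)shadow - SHADOW_OFFSET) << SHADOW_SCALE_SHIFT;
}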
diff --git a/mm/kasan/tags.c b/mm/kasan/sw_tags.c
index e02a36a51f42..5dcd830805b2 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/sw_tags.c
@@ -1,17 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This file contains core tag-based KASAN code.
+ * This file contains core software tag-based KASAN code.
*
* Copyright (c) 2018 Google, Inc.
* Author: Andrey Konovalov <andreyknvl@google.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define pr_fmt(fmt) "kasan: " fmt
#include <linux/export.h>
#include <linux/interrupt.h>
@@ -40,12 +35,14 @@
static DEFINE_PER_CPU(u32, prng_state);
-void kasan_init_tags(void)
+void __init kasan_init_sw_tags(void)
{
int cpu;
for_each_possible_cpu(cpu)
per_cpu(prng_state, cpu) = (u32)get_cycles();
+
+ pr_info("KernelAddressSanitizer initialized\n");
}
/*
@@ -70,11 +67,6 @@ u8 random_tag(void)
return (u8)(state % (KASAN_TAG_MAX + 1));
}
-void *kasan_reset_tag(const void *addr)
-{
- return reset_tag(addr);
-}
-
bool check_memory_region(unsigned long addr, size_t size, bool write,
unsigned long ret_ip)
{
@@ -110,7 +102,7 @@ bool check_memory_region(unsigned long addr, size_t size, bool write,
if (tag == KASAN_TAG_KERNEL)
return true;
- untagged_addr = reset_tag((const void *)addr);
+ untagged_addr = kasan_reset_tag((const void *)addr);
if (unlikely(untagged_addr <
kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
return !kasan_report(addr, size, write, ret_ip);
@@ -126,6 +118,15 @@ bool check_memory_region(unsigned long addr, size_t size, bool write,
return true;
}
+bool check_invalid_free(void *addr)
+{
+ u8 tag = get_tag(addr);
+ u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr)));
+
+ return (shadow_byte == KASAN_TAG_INVALID) ||
+ (tag != KASAN_TAG_KERNEL && tag != shadow_byte);
+}
+
#define DEFINE_HWASAN_LOAD_STORE(size) \
void __hwasan_load##size##_noabort(unsigned long addr) \
{ \
@@ -158,7 +159,7 @@ EXPORT_SYMBOL(__hwasan_storeN_noabort);
void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
{
- kasan_poison_shadow((void *)addr, size, tag);
+ poison_range((void *)addr, size, tag);
}
EXPORT_SYMBOL(__hwasan_tag_memory);
@@ -168,7 +169,9 @@ void kasan_set_free_info(struct kmem_cache *cache,
struct kasan_alloc_meta *alloc_meta;
u8 idx = 0;
- alloc_meta = get_alloc_info(cache, object);
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (!alloc_meta)
+ return;
#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
idx = alloc_meta->free_track_idx;
@@ -185,7 +188,9 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
struct kasan_alloc_meta *alloc_meta;
int i = 0;
- alloc_meta = get_alloc_info(cache, object);
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (!alloc_meta)
+ return NULL;
#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
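The software tag-based hunks above lean on get_tag() and kasan_reset_tag(); conceptually, the tag lives in the otherwise-ignored top byte of a 64-bit kernel pointer. A hedged sketch of that encoding follows; the bit position and match-all value are assumptions based on the arm64 top-byte-ignore layout rather than definitions from this patch.

#include <stdint.h>

#define TAG_SHIFT	56			/* tag in the top byte (assumed) */
#define TAG_KERNEL	((uintptr_t)0xff)	/* assumed match-all tag */

static inline uint8_t ptr_get_tag(const void *addr)
{
	return (uint8_t)((uintptr_t)addr >> TAG_SHIFT);
}

/* Replace the random tag with the match-all kernel tag, which is what
 * kasan_reset_tag() effectively does for untagged accesses. */
static inline void *ptr_reset_tag(const void *addr)
{
	uintptr_t a = (uintptr_t)addr & ~((uintptr_t)0xff << TAG_SHIFT);

	return (void *)(a | (TAG_KERNEL << TAG_SHIFT));
}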
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 34548e4fe0fe..494d3cb0b58a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1285,7 +1285,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
* PTEs are armed with uffd write protection.
* Here we can also mark the new huge pmd as
* write protected if any of the small ones is
- * marked but that could bring uknown
+ * marked but that could bring unknown
* userfault messages that falls outside of
* the registered range. So, just be simple.
*/
diff --git a/mm/memblock.c b/mm/memblock.c
index 31898fc7d6de..b8b7be0561c4 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -871,7 +871,7 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
* @base: base address of the region
* @size: size of the region
* @set: set or clear the flag
- * @flag: the flag to udpate
+ * @flag: the flag to update
*
* This function isolates region [@base, @base + @size), and sets/clears flag
*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e3c7ca7dc174..afaa3195ed6f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1343,46 +1343,6 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
#endif
/**
- * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
- * @page: the page
- * @pgdat: pgdat of the page
- *
- * This function relies on page's memcg being stable - see the
- * access rules in commit_charge().
- */
-struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
-{
- struct mem_cgroup_per_node *mz;
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
-
- if (mem_cgroup_disabled()) {
- lruvec = &pgdat->__lruvec;
- goto out;
- }
-
- memcg = page_memcg(page);
- /*
- * Swapcache readahead pages are added to the LRU - and
- * possibly migrated - before they are charged.
- */
- if (!memcg)
- memcg = root_mem_cgroup;
-
- mz = mem_cgroup_page_nodeinfo(memcg, page);
- lruvec = &mz->lruvec;
-out:
- /*
- * Since a node can be onlined after the mem_cgroup was created,
- * we have to be prepared to initialize lruvec->zone here;
- * and if offlined then reonlined, we need to reinitialize it.
- */
- if (unlikely(lruvec->pgdat != pgdat))
- lruvec->pgdat = pgdat;
- return lruvec;
-}
-
-/**
* lock_page_lruvec - lock and return lruvec for a given page.
* @page: the page
*
@@ -2975,9 +2935,10 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
#ifdef CONFIG_MEMCG_KMEM
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
- gfp_t gfp)
+ gfp_t gfp, bool new_page)
{
unsigned int objects = objs_per_slab_page(s, page);
+ unsigned long memcg_data;
void *vec;
vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
@@ -2985,11 +2946,25 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
if (!vec)
return -ENOMEM;
- if (!set_page_objcgs(page, vec))
+ memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
+ if (new_page) {
+ /*
+ * If the slab page is brand new and nobody can yet access
+		 * its memcg_data, no synchronization is required and
+ * memcg_data can be simply assigned.
+ */
+ page->memcg_data = memcg_data;
+ } else if (cmpxchg(&page->memcg_data, 0, memcg_data)) {
+ /*
+ * If the slab page is already in use, somebody can allocate
+ * and assign obj_cgroups in parallel. In this case the existing
+ * objcg vector should be reused.
+ */
kfree(vec);
- else
- kmemleak_not_leak(vec);
+ return 0;
+ }
+ kmemleak_not_leak(vec);
return 0;
}
@@ -6987,6 +6962,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
return;
memcg = page_memcg(oldpage);
+ VM_WARN_ON_ONCE_PAGE(!memcg, oldpage);
if (!memcg)
return;
@@ -7178,12 +7154,15 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
+ if (mem_cgroup_disabled())
+ return;
+
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
memcg = page_memcg(page);
- /* Readahead page, never charged */
+ VM_WARN_ON_ONCE_PAGE(!memcg, page);
if (!memcg)
return;
@@ -7242,12 +7221,15 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
struct mem_cgroup *memcg;
unsigned short oldid;
+ if (mem_cgroup_disabled())
+ return 0;
+
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
memcg = page_memcg(page);
- /* Readahead page, never charged */
+ VM_WARN_ON_ONCE_PAGE(!memcg, page);
if (!memcg)
return 0;
diff --git a/mm/memory.c b/mm/memory.c
index 4a42a74a2240..7d608765932b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4707,9 +4707,9 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
}
#endif /* __PAGETABLE_PMD_FOLDED */
-static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range,
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+int follow_pte(struct mm_struct *mm, unsigned long address,
+ struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp,
+ spinlock_t **ptlp)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -4774,32 +4774,6 @@ out:
return -EINVAL;
}
-static inline int follow_pte(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp)
-{
- int res;
-
- /* (void) is needed to make gcc happy */
- (void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, NULL,
- ptepp, NULL, ptlp)));
- return res;
-}
-
-int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range,
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
-{
- int res;
-
- /* (void) is needed to make gcc happy */
- (void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, range,
- ptepp, pmdpp, ptlp)));
- return res;
-}
-EXPORT_SYMBOL(follow_pte_pmd);
-
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
@@ -4820,7 +4794,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
return ret;
- ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
+ ret = follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl);
if (ret)
return ret;
*pfn = pte_pfn(*ptep);
@@ -4841,7 +4815,7 @@ int follow_phys(struct vm_area_struct *vma,
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
goto out;
- if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
+ if (follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl))
goto out;
pte = *ptep;
diff --git a/mm/mempool.c b/mm/mempool.c
index f473cdddaff0..624ed51b060f 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -104,7 +104,7 @@ static inline void poison_element(mempool_t *pool, void *element)
static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
- kasan_poison_kfree(element, _RET_IP_);
+ kasan_slab_free_mempool(element, _RET_IP_);
else if (pool->alloc == mempool_alloc_pages)
kasan_free_pages(element, (unsigned long)pool->pool_data);
}
@@ -112,7 +112,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
static void kasan_unpoison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
- kasan_unpoison_slab(element);
+ kasan_unpoison_range(element, __ksize(element));
else if (pool->alloc == mempool_alloc_pages)
kasan_alloc_pages(element, (unsigned long)pool->pool_data);
}
diff --git a/mm/migrate.c b/mm/migrate.c
index ee802cb509a3..ee5e612b4cd8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2594,7 +2594,7 @@ static bool migrate_vma_check_page(struct page *page)
* will bump the page reference count. Sadly there is no way to
* differentiate a regular pin from migration wait. Hence to
* avoid 2 racing thread trying to migrate back to CPU to enter
- * infinite loop (one stoping migration because the other is
+ * infinite loop (one stopping migration because the other is
* waiting on pte migration entry). We always return true here.
*
* FIXME proper solution is to rework migration_entry_wait() so
diff --git a/mm/mmap.c b/mm/mmap.c
index bb62a59ed92a..8144fc3c5a78 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1348,9 +1348,8 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
-static inline int mlock_future_check(struct mm_struct *mm,
- unsigned long flags,
- unsigned long len)
+int mlock_future_check(struct mm_struct *mm, unsigned long flags,
+ unsigned long len)
{
unsigned long locked, lock_limit;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 139ab8fe5c72..774542e1483e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1204,8 +1204,10 @@ static void kernel_init_free_pages(struct page *page, int numpages)
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
- for (i = 0; i < numpages; i++)
+ for (i = 0; i < numpages; i++) {
+ page_kasan_tag_reset(page + i);
clear_highpage(page + i);
+ }
kasan_enable_current();
}
@@ -7656,6 +7658,11 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
* alias for the memset().
*/
direct_map_addr = page_address(page);
+ /*
+ * Perform a kasan-unchecked memset() since this memory
+ * has not been initialized.
+ */
+ direct_map_addr = kasan_reset_tag(direct_map_addr);
if ((unsigned int)poison <= 0xFF)
memset(direct_map_addr, poison, PAGE_SIZE);
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 16b161f28a31..df6f74aac8e1 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -34,7 +34,7 @@
*
* The need callback is used to decide whether extended memory allocation is
* needed or not. Sometimes users want to deactivate some features in this
- * boot and extra memory would be unneccessary. In this case, to avoid
+ * boot and extra memory would be unnecessary. In this case, to avoid
* allocating huge chunk of memory, each clients represent their need of
* extra memory through the need callback. If one of the need callbacks
* returns true, it means that someone needs extra memory so that
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 06ec518b2089..65cdf844c8ad 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -25,7 +25,7 @@ static void poison_page(struct page *page)
/* KASAN still think the page is in-use, so skip it. */
kasan_disable_current();
- memset(addr, PAGE_POISON, PAGE_SIZE);
+ memset(kasan_reset_tag(addr), PAGE_POISON, PAGE_SIZE);
kasan_enable_current();
kunmap_atomic(addr);
}
diff --git a/mm/ptdump.c b/mm/ptdump.c
index ba88ec43ff21..4354c1422d57 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -4,7 +4,7 @@
#include <linux/ptdump.h>
#include <linux/kasan.h>
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
/*
* This is an optimization for KASAN=y case. Since all kasan page tables
* eventually point to the kasan_early_shadow_page we could call note_page()
@@ -31,7 +31,8 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
struct ptdump_state *st = walk->private;
pgd_t val = READ_ONCE(*pgd);
-#if CONFIG_PGTABLE_LEVELS > 4 && defined(CONFIG_KASAN)
+#if CONFIG_PGTABLE_LEVELS > 4 && \
+ (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
if (pgd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_p4d)))
return note_kasan_page_table(walk, addr);
#endif
@@ -51,7 +52,8 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
struct ptdump_state *st = walk->private;
p4d_t val = READ_ONCE(*p4d);
-#if CONFIG_PGTABLE_LEVELS > 3 && defined(CONFIG_KASAN)
+#if CONFIG_PGTABLE_LEVELS > 3 && \
+ (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
if (p4d_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pud)))
return note_kasan_page_table(walk, addr);
#endif
@@ -71,7 +73,8 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
struct ptdump_state *st = walk->private;
pud_t val = READ_ONCE(*pud);
-#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_KASAN)
+#if CONFIG_PGTABLE_LEVELS > 2 && \
+ (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd)))
return note_kasan_page_table(walk, addr);
#endif
@@ -91,7 +94,7 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
struct ptdump_state *st = walk->private;
pmd_t val = READ_ONCE(*pmd);
-#if defined(CONFIG_KASAN)
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
if (pmd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pte)))
return note_kasan_page_table(walk, addr);
#endif
diff --git a/mm/secretmem.c b/mm/secretmem.c
new file mode 100644
index 000000000000..b8a32954ac68
--- /dev/null
+++ b/mm/secretmem.c
@@ -0,0 +1,439 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corporation, 2020
+ *
+ * Author: Mike Rapoport <rppt@linux.ibm.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/cma.h>
+#include <linux/mount.h>
+#include <linux/memfd.h>
+#include <linux/bitops.h>
+#include <linux/printk.h>
+#include <linux/pagemap.h>
+#include <linux/genalloc.h>
+#include <linux/syscalls.h>
+#include <linux/memblock.h>
+#include <linux/pseudo_fs.h>
+#include <linux/secretmem.h>
+#include <linux/memcontrol.h>
+#include <linux/set_memory.h>
+#include <linux/sched/signal.h>
+
+#include <uapi/linux/magic.h>
+
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) "secretmem: " fmt
+
+/*
+ * Define mode and flag masks to allow validation of the system call
+ * parameters.
+ */
+#define SECRETMEM_MODE_MASK (0x0)
+#define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK
+
+struct secretmem_ctx {
+ struct gen_pool *pool;
+ unsigned int mode;
+};
+
+static struct cma *secretmem_cma;
+
+static atomic_t secretmem_users;
+
+bool secretmem_active(void)
+{
+ return !!atomic_read(&secretmem_users);
+}
+
+static int secretmem_account_pages(struct page *page, gfp_t gfp, int order)
+{
+ int err;
+
+ err = memcg_kmem_charge_page(page, gfp, order);
+ if (err)
+ return err;
+
+ /*
+	 * secretmem caches are unreclaimable kernel allocations, so treat
+	 * them as unreclaimable slab memory for VM statistics purposes.
+ */
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
+
+ return 0;
+}
+
+static void secretmem_unaccount_pages(struct page *page, int order)
+{
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
+ -PAGE_SIZE << order);
+ memcg_kmem_uncharge_page(page, order);
+}
+
+static int secretmem_pool_increase(struct secretmem_ctx *ctx, gfp_t gfp)
+{
+ unsigned long nr_pages = (1 << PMD_PAGE_ORDER);
+ struct gen_pool *pool = ctx->pool;
+ unsigned long addr;
+ struct page *page;
+ int err;
+
+ page = cma_alloc(secretmem_cma, nr_pages, PMD_SIZE, gfp & __GFP_NOWARN);
+ if (!page)
+ return -ENOMEM;
+
+ err = secretmem_account_pages(page, gfp, PMD_PAGE_ORDER);
+ if (err)
+ goto err_cma_release;
+
+ err = set_direct_map_invalid_noflush(page, nr_pages);
+ if (err)
+ goto err_memcg_uncharge;
+
+ addr = (unsigned long)page_address(page);
+ err = gen_pool_add(pool, addr, PMD_SIZE, NUMA_NO_NODE);
+ if (err)
+ goto err_set_direct_map;
+
+ flush_tlb_kernel_range(addr, addr + PMD_SIZE);
+
+ return 0;
+
+err_set_direct_map:
+ /*
+	 * If a split of a PUD-sized page was required, it already happened
+	 * when we marked the pages invalid, which guarantees that this
+	 * call won't fail.
+ */
+ set_direct_map_default_noflush(page, nr_pages);
+err_memcg_uncharge:
+ secretmem_unaccount_pages(page, PMD_PAGE_ORDER);
+err_cma_release:
+ cma_release(secretmem_cma, page, nr_pages);
+ return err;
+}
+
+static struct page *secretmem_alloc_page(struct secretmem_ctx *ctx,
+ gfp_t gfp)
+{
+ struct gen_pool *pool = ctx->pool;
+ unsigned long addr;
+ struct page *page;
+ int err;
+
+ if (gen_pool_avail(pool) < PAGE_SIZE) {
+ err = secretmem_pool_increase(ctx, gfp);
+ if (err)
+ return NULL;
+ }
+
+ addr = gen_pool_alloc(pool, PAGE_SIZE);
+ if (!addr)
+ return NULL;
+
+ page = virt_to_page(addr);
+ get_page(page);
+
+ return page;
+}
+
+static vm_fault_t secretmem_fault(struct vm_fault *vmf)
+{
+ struct secretmem_ctx *ctx = vmf->vma->vm_file->private_data;
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ pgoff_t offset = vmf->pgoff;
+ vm_fault_t ret = 0;
+ struct page *page;
+ int err;
+
+ if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
+ return vmf_error(-EINVAL);
+
+ page = find_get_page(mapping, offset);
+ if (!page) {
+ page = secretmem_alloc_page(ctx, vmf->gfp_mask);
+ if (!page)
+ return vmf_error(-ENOMEM);
+
+ err = add_to_page_cache(page, mapping, offset, vmf->gfp_mask);
+ if (unlikely(err))
+ goto err_put_page;
+
+ __SetPageUptodate(page);
+ set_page_private(page, (unsigned long)ctx);
+
+ ret = VM_FAULT_LOCKED;
+ }
+
+ vmf->page = page;
+ return ret;
+
+err_put_page:
+ put_page(page);
+ return vmf_error(err);
+}
+
+static const struct vm_operations_struct secretmem_vm_ops = {
+ .fault = secretmem_fault,
+};
+
+static int secretmem_release(struct inode *inode, struct file *file)
+{
+ atomic_dec(&secretmem_users);
+ return 0;
+}
+
+static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ unsigned long len = vma->vm_end - vma->vm_start;
+
+ if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
+ return -EINVAL;
+
+ if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
+ return -EAGAIN;
+
+ vma->vm_ops = &secretmem_vm_ops;
+ vma->vm_flags |= VM_LOCKED;
+
+ return 0;
+}
+
+bool vma_is_secretmem(struct vm_area_struct *vma)
+{
+ return vma->vm_ops == &secretmem_vm_ops;
+}
+
+static const struct file_operations secretmem_fops = {
+ .release = secretmem_release,
+ .mmap = secretmem_mmap,
+};
+
+static bool secretmem_isolate_page(struct page *page, isolate_mode_t mode)
+{
+ return false;
+}
+
+static int secretmem_migratepage(struct address_space *mapping,
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
+{
+ return -EBUSY;
+}
+
+static void secretmem_freepage(struct page *page)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+ struct secretmem_ctx *ctx = (struct secretmem_ctx *)page_private(page);
+ struct gen_pool *pool = ctx->pool;
+
+ gen_pool_free(pool, addr, PAGE_SIZE);
+}
+
+static const struct address_space_operations secretmem_aops = {
+ .freepage = secretmem_freepage,
+ .migratepage = secretmem_migratepage,
+ .isolate_page = secretmem_isolate_page,
+};
+
+bool page_is_secretmem(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+
+ if (!mapping)
+ return false;
+
+ return mapping->a_ops == &secretmem_aops;
+}
+
+static struct vfsmount *secretmem_mnt;
+
+static struct file *secretmem_file_create(unsigned long flags)
+{
+ struct file *file = ERR_PTR(-ENOMEM);
+ struct secretmem_ctx *ctx;
+ struct inode *inode;
+
+ inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ goto err_free_inode;
+
+ ctx->pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE);
+ if (!ctx->pool)
+ goto err_free_ctx;
+
+ file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
+ O_RDWR, &secretmem_fops);
+ if (IS_ERR(file))
+ goto err_free_pool;
+
+ mapping_set_unevictable(inode->i_mapping);
+
+ inode->i_private = ctx;
+ inode->i_mapping->private_data = ctx;
+ inode->i_mapping->a_ops = &secretmem_aops;
+
+ /* pretend we are a normal file with zero size */
+ inode->i_mode |= S_IFREG;
+ inode->i_size = 0;
+
+ file->private_data = ctx;
+
+ ctx->mode = flags & SECRETMEM_MODE_MASK;
+
+ return file;
+
+err_free_pool:
+ gen_pool_destroy(ctx->pool);
+err_free_ctx:
+ kfree(ctx);
+err_free_inode:
+ iput(inode);
+ return file;
+}
+
+SYSCALL_DEFINE1(memfd_secret, unsigned long, flags)
+{
+ struct file *file;
+ int fd, err;
+
+	/* make sure local flags do not conflict with global fcntl.h */
+ BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC);
+
+ if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC))
+ return -EINVAL;
+
+ if (!secretmem_cma)
+ return -ENOMEM;
+
+ fd = get_unused_fd_flags(flags & O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ file = secretmem_file_create(flags);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto err_put_fd;
+ }
+
+ file->f_flags |= O_LARGEFILE;
+
+ fd_install(fd, file);
+ atomic_inc(&secretmem_users);
+ return fd;
+
+err_put_fd:
+ put_unused_fd(fd);
+ return err;
+}
+
+static void secretmem_cleanup_chunk(struct gen_pool *pool,
+ struct gen_pool_chunk *chunk, void *data)
+{
+ unsigned long start = chunk->start_addr;
+ unsigned long end = chunk->end_addr;
+ struct page *page = virt_to_page(start);
+ unsigned long nr_pages = (end - start + 1) / PAGE_SIZE;
+ int i;
+
+ set_direct_map_default_noflush(page, nr_pages);
+ secretmem_unaccount_pages(page, PMD_PAGE_ORDER);
+
+ for (i = 0; i < nr_pages; i++)
+ clear_highpage(page + i);
+
+ cma_release(secretmem_cma, page, nr_pages);
+}
+
+static void secretmem_cleanup_pool(struct secretmem_ctx *ctx)
+{
+ struct gen_pool *pool = ctx->pool;
+
+ gen_pool_for_each_chunk(pool, secretmem_cleanup_chunk, ctx);
+ gen_pool_destroy(pool);
+}
+
+static void secretmem_evict_inode(struct inode *inode)
+{
+ struct secretmem_ctx *ctx = inode->i_private;
+
+ truncate_inode_pages_final(&inode->i_data);
+ secretmem_cleanup_pool(ctx);
+ clear_inode(inode);
+ kfree(ctx);
+}
+
+static const struct super_operations secretmem_super_ops = {
+ .evict_inode = secretmem_evict_inode,
+};
+
+static int secretmem_init_fs_context(struct fs_context *fc)
+{
+ struct pseudo_fs_context *ctx = init_pseudo(fc, SECRETMEM_MAGIC);
+
+ if (!ctx)
+ return -ENOMEM;
+ ctx->ops = &secretmem_super_ops;
+
+ return 0;
+}
+
+static struct file_system_type secretmem_fs = {
+ .name = "secretmem",
+ .init_fs_context = secretmem_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+static int secretmem_init(void)
+{
+ int ret = 0;
+
+ secretmem_mnt = kern_mount(&secretmem_fs);
+ if (IS_ERR(secretmem_mnt))
+ ret = PTR_ERR(secretmem_mnt);
+
+ return ret;
+}
+fs_initcall(secretmem_init);
+
+static int __init secretmem_setup(char *str)
+{
+ phys_addr_t align = PMD_SIZE;
+ unsigned long reserved_size;
+ int err;
+
+ if (!can_set_direct_map())
+ return 0;
+
+ reserved_size = memparse(str, NULL);
+ if (!reserved_size)
+ return 0;
+
+ if (reserved_size * 2 > PUD_SIZE)
+ align = PUD_SIZE;
+
+ err = cma_declare_contiguous(0, reserved_size, 0, align, 0, false,
+ "secretmem", &secretmem_cma);
+ if (err) {
+ pr_err("failed to create CMA: %d\n", err);
+ return err;
+ }
+
+ pr_info("reserved %luM\n", reserved_size >> 20);
+
+ return 0;
+}
+__setup("secretmem=", secretmem_setup);
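mm/secretmem.c above adds the memfd_secret() system call (wired into the syscall tables elsewhere in this merge), which hands userspace a file descriptor whose pages are dropped from the kernel direct map and locked in memory. The program below is a hypothetical usage sketch; the fallback syscall number is a placeholder assumption and the real value must come from matching uapi headers.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_memfd_secret
#define __NR_memfd_secret 447	/* placeholder number; check your headers */
#endif

int main(void)
{
	int fd = syscall(__NR_memfd_secret, O_CLOEXEC);

	if (fd < 0) {
		perror("memfd_secret");	/* e.g. no secretmem= reservation */
		return 1;
	}
	/* Size the file before faulting pages in. */
	if (ftruncate(fd, 4096) < 0) {
		perror("ftruncate");
		return 1;
	}
	/* The mapping must be shared; it is also mlocked by the kernel. */
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	strcpy(p, "not visible via the kernel direct map");
	munmap(p, 4096);
	close(fd);
	return 0;
}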
diff --git a/mm/slab.c b/mm/slab.c
index 6a0bf3178f8e..e8298ae6acf7 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1380,7 +1380,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
return NULL;
}
- account_slab_page(page, cachep->gfporder, cachep);
+ account_slab_page(page, cachep->gfporder, cachep, flags);
__SetPageSlab(page);
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
if (sk_memalloc_socks() && page_is_pfmemalloc(page))
diff --git a/mm/slab.h b/mm/slab.h
index 1a756a359fa8..1e10bfd75659 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -240,7 +240,7 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
#ifdef CONFIG_MEMCG_KMEM
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
- gfp_t gfp);
+ gfp_t gfp, bool new_page);
static inline void memcg_free_page_obj_cgroups(struct page *page)
{
@@ -317,7 +317,8 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
page = virt_to_head_page(p[i]);
if (!page_objcgs(page) &&
- memcg_alloc_page_obj_cgroups(page, s, flags)) {
+ memcg_alloc_page_obj_cgroups(page, s, flags,
+ false)) {
obj_cgroup_uncharge(objcg, obj_full_size(s));
continue;
}
@@ -381,7 +382,8 @@ static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
}
static inline int memcg_alloc_page_obj_cgroups(struct page *page,
- struct kmem_cache *s, gfp_t gfp)
+ struct kmem_cache *s, gfp_t gfp,
+ bool new_page)
{
return 0;
}
@@ -422,8 +424,12 @@ static inline struct kmem_cache *virt_to_cache(const void *obj)
}
static __always_inline void account_slab_page(struct page *page, int order,
- struct kmem_cache *s)
+ struct kmem_cache *s,
+ gfp_t gfp)
{
+ if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT))
+ memcg_alloc_page_obj_cgroups(page, s, gfp, true);
+
mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
PAGE_SIZE << order);
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 479d17b90155..075b23ce94ec 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -19,6 +19,7 @@
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/debugfs.h>
+#include <linux/kasan.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
@@ -54,7 +55,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
*/
#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
- SLAB_FAILSLAB | SLAB_KASAN)
+ SLAB_FAILSLAB | kasan_never_merge())
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
@@ -1179,7 +1180,7 @@ size_t ksize(const void *objp)
* We assume that ksize callers could use whole allocated area,
* so we need to unpoison this area.
*/
- kasan_unpoison_shadow(objp, size);
+ kasan_unpoison_range(objp, size);
return size;
}
EXPORT_SYMBOL(ksize);
diff --git a/mm/slub.c b/mm/slub.c
index cecfda96c53b..7abc5971ccac 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -250,7 +250,7 @@ static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
{
#ifdef CONFIG_SLAB_FREELIST_HARDENED
/*
- * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged.
+ * When CONFIG_KASAN_SW/HW_TAGS is enabled, ptr_addr might be tagged.
* Normally, this doesn't cause any issues, as both set_freepointer()
* and get_freepointer() are called with a pointer with the same tag.
* However, there are some issues with CONFIG_SLUB_DEBUG code. For
@@ -276,6 +276,7 @@ static inline void *freelist_dereference(const struct kmem_cache *s,
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
+ object = kasan_reset_tag(object);
return freelist_dereference(s, object + s->offset);
}
@@ -305,6 +306,7 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
BUG_ON(object == fp); /* naive detection of double free or corruption */
#endif
+ freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
*(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
}
@@ -539,8 +541,8 @@ static void print_section(char *level, char *text, u8 *addr,
unsigned int length)
{
metadata_access_enable();
- print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
- length, 1);
+ print_hex_dump(level, kasan_reset_tag(text), DUMP_PREFIX_ADDRESS,
+ 16, 1, addr, length, 1);
metadata_access_disable();
}
@@ -571,7 +573,7 @@ static struct track *get_track(struct kmem_cache *s, void *object,
p = object + get_info_end(s);
- return p + alloc;
+ return kasan_reset_tag(p + alloc);
}
static void set_track(struct kmem_cache *s, void *object,
@@ -584,7 +586,8 @@ static void set_track(struct kmem_cache *s, void *object,
unsigned int nr_entries;
metadata_access_enable();
- nr_entries = stack_trace_save(p->addrs, TRACK_ADDRS_COUNT, 3);
+ nr_entries = stack_trace_save(kasan_reset_tag(p->addrs),
+ TRACK_ADDRS_COUNT, 3);
metadata_access_disable();
if (nr_entries < TRACK_ADDRS_COUNT)
@@ -748,7 +751,7 @@ static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
static void init_object(struct kmem_cache *s, void *object, u8 val)
{
- u8 *p = object;
+ u8 *p = kasan_reset_tag(object);
if (s->flags & SLAB_RED_ZONE)
memset(p - s->red_left_pad, val, s->red_left_pad);
@@ -778,7 +781,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
u8 *addr = page_address(page);
metadata_access_enable();
- fault = memchr_inv(start, value, bytes);
+ fault = memchr_inv(kasan_reset_tag(start), value, bytes);
metadata_access_disable();
if (!fault)
return 1;
@@ -874,7 +877,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
pad = end - remainder;
metadata_access_enable();
- fault = memchr_inv(pad, POISON_INUSE, remainder);
+ fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder);
metadata_access_disable();
if (!fault)
return 1;
@@ -1119,7 +1122,7 @@ void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
return;
metadata_access_enable();
- memset(addr, POISON_INUSE, page_size(page));
+ memset(kasan_reset_tag(addr), POISON_INUSE, page_size(page));
metadata_access_disable();
}
@@ -1572,10 +1575,10 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
* Clear the object and the metadata, but don't touch
* the redzone.
*/
- memset(object, 0, s->object_size);
+ memset(kasan_reset_tag(object), 0, s->object_size);
rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad
: 0;
- memset((char *)object + s->inuse, 0,
+ memset((char *)kasan_reset_tag(object) + s->inuse, 0,
s->size - s->inuse - rsize);
}
@@ -1622,9 +1625,6 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
else
page = __alloc_pages_node(node, flags, order);
- if (page)
- account_slab_page(page, order, s);
-
return page;
}
@@ -1777,6 +1777,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
page->objects = oo_objects(oo);
+ account_slab_page(page, oo_order(oo), s, flags);
+
page->slab_cache = s;
__SetPageSlab(page);
if (page_is_pfmemalloc(page))
@@ -2892,10 +2894,10 @@ redo:
stat(s, ALLOC_FASTPATH);
}
- maybe_wipe_obj_freeptr(s, object);
+ maybe_wipe_obj_freeptr(s, kasan_reset_tag(object));
if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
- memset(object, 0, s->object_size);
+ memset(kasan_reset_tag(object), 0, s->object_size);
out:
slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 4d88fe5a277a..d5b262d53622 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2195,13 +2195,14 @@ struct vm_struct *remove_vm_area(const void *addr)
}
static inline void set_area_direct_map(const struct vm_struct *area,
- int (*set_direct_map)(struct page *page))
+ int (*set_direct_map)(struct page *page,
+ int numpages))
{
int i;
for (i = 0; i < area->nr_pages; i++)
if (page_address(area->pages[i]))
- set_direct_map(area->pages[i]);
+ set_direct_map(area->pages[i], 1);
}
/* Handle removing and resetting vm mappings related to the vm_struct. */
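In the hunk above, the set_direct_map callback type grows a numpages argument (the arch-level set_direct_map_*_noflush() helpers in this series gain the same parameter), and the page-by-page caller in vmalloc simply passes 1. A small standalone sketch of the pattern, with illustrative names rather than the kernel API:

#include <stdio.h>

struct fake_page { unsigned long pfn; };

/* the callback now takes a page count, so a whole range can be changed at once */
static int set_range(struct fake_page *page, int numpages)
{
        printf("remapping %d page(s) starting at pfn %lu\n", numpages, page->pfn);
        return 0;
}

static void set_area(struct fake_page *pages, int nr,
                     int (*set_direct_map)(struct fake_page *page, int numpages))
{
        for (int i = 0; i < nr; i++)
                set_direct_map(&pages[i], 1);   /* existing callers keep passing 1 */
}

int main(void)
{
        struct fake_page pages[] = { {10}, {11}, {12} };

        set_area(pages, 3, set_range);
        return 0;
}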
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 94133708889d..213677a5ed33 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -148,10 +148,12 @@ endif
# we don't want to check (depends on variables KASAN_SANITIZE_obj.o, KASAN_SANITIZE)
#
ifeq ($(CONFIG_KASAN),y)
+ifneq ($(CONFIG_KASAN_HW_TAGS),y)
_c_flags += $(if $(patsubst n%,, \
$(KASAN_SANITIZE_$(basetarget).o)$(KASAN_SANITIZE)y), \
$(CFLAGS_KASAN), $(CFLAGS_KASAN_NOSANITIZE))
endif
+endif
ifeq ($(CONFIG_UBSAN),y)
_c_flags += $(if $(patsubst n%,, \
diff --git a/scripts/checksyscalls.sh b/scripts/checksyscalls.sh
index a18b47695f55..b7609958ee36 100755
--- a/scripts/checksyscalls.sh
+++ b/scripts/checksyscalls.sh
@@ -40,6 +40,10 @@ cat << EOF
#define __IGNORE_setrlimit /* setrlimit */
#endif
+#ifndef __ARCH_WANT_MEMFD_SECRET
+#define __IGNORE_memfd_secret
+#endif
+
/* Missing flags argument */
#define __IGNORE_renameat /* renameat2 */
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c
index 5fd4a64e431f..f95c6bfa8b8e 100644
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -2046,9 +2046,6 @@ fail2:
return error;
}
-
-#define list_entry_is_head(pos, head, member) (&pos->member == (head))
-
/**
* __next_ns - find the next namespace to list
* @root: root namespace to stop search at (NOT NULL)
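The private list_entry_is_head() copy is dropped here now that include/linux/list.h provides the same macro, so per-file definitions become redundant. For reference, a standalone illustration of what the helper tests, with minimal scaffolding standing in for the kernel's list.h:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member) container_of(ptr, type, member)
#define list_entry_is_head(pos, head, member) (&pos->member == (head))

struct item { int val; struct list_head node; };

int main(void)
{
        struct list_head head;
        struct item a = { .val = 1 }, b = { .val = 2 };
        struct item *pos;

        /* head <-> a <-> b <-> head */
        head.next = &a.node;   a.node.prev = &head;
        a.node.next = &b.node; b.node.prev = &a.node;
        b.node.next = &head;   head.prev = &b.node;

        /* stop once the iterator points back at the list head, as the kernel's
         * list_for_each_entry() termination check does */
        for (pos = list_entry(head.next, struct item, node);
             !list_entry_is_head(pos, &head, node);
             pos = list_entry(pos->node.next, struct item, node))
                printf("%d\n", pos->val);

        return 0;
}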
diff --git a/tools/testing/selftests/arm64/mte/Makefile b/tools/testing/selftests/arm64/mte/Makefile
index 2480226dfe57..0b3af552632a 100644
--- a/tools/testing/selftests/arm64/mte/Makefile
+++ b/tools/testing/selftests/arm64/mte/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
# Copyright (C) 2020 ARM Limited
-CFLAGS += -std=gnu99 -I.
+CFLAGS += -std=gnu99 -I. -lpthread
SRCS := $(filter-out mte_common_util.c,$(wildcard *.c))
PROGS := $(patsubst %.c,%,$(SRCS))
diff --git a/tools/testing/selftests/arm64/mte/check_gcr_el1_cswitch.c b/tools/testing/selftests/arm64/mte/check_gcr_el1_cswitch.c
new file mode 100644
index 000000000000..de5066aca097
--- /dev/null
+++ b/tools/testing/selftests/arm64/mte/check_gcr_el1_cswitch.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2020 ARM Limited
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/auxv.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include "kselftest.h"
+#include "mte_common_util.h"
+
+#define PR_SET_TAGGED_ADDR_CTRL 55
+#define PR_GET_TAGGED_ADDR_CTRL 56
+# define PR_TAGGED_ADDR_ENABLE (1UL << 0)
+# define PR_MTE_TCF_SHIFT 1
+# define PR_MTE_TCF_NONE (0UL << PR_MTE_TCF_SHIFT)
+# define PR_MTE_TCF_SYNC (1UL << PR_MTE_TCF_SHIFT)
+# define PR_MTE_TCF_ASYNC (2UL << PR_MTE_TCF_SHIFT)
+# define PR_MTE_TCF_MASK (3UL << PR_MTE_TCF_SHIFT)
+# define PR_MTE_TAG_SHIFT 3
+# define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT)
+
+#include "mte_def.h"
+
+#define NUM_ITERATIONS 1024
+#define MAX_THREADS 5
+#define THREAD_ITERATIONS 1000
+
+void *execute_thread(void *x)
+{
+ pid_t pid = *((pid_t *)x);
+ pid_t tid = gettid();
+ uint64_t prctl_tag_mask;
+ uint64_t prctl_set;
+ uint64_t prctl_get;
+ uint64_t prctl_tcf;
+
+ srand(time(NULL) ^ (pid << 16) ^ (tid << 16));
+
+ prctl_tag_mask = rand() & 0xffff;
+
+ if (prctl_tag_mask % 2)
+ prctl_tcf = PR_MTE_TCF_SYNC;
+ else
+ prctl_tcf = PR_MTE_TCF_ASYNC;
+
+ prctl_set = PR_TAGGED_ADDR_ENABLE | prctl_tcf | (prctl_tag_mask << PR_MTE_TAG_SHIFT);
+
+ for (int j = 0; j < THREAD_ITERATIONS; j++) {
+ if (prctl(PR_SET_TAGGED_ADDR_CTRL, prctl_set, 0, 0, 0)) {
+ perror("prctl() failed");
+ goto fail;
+ }
+
+ prctl_get = prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0);
+
+ if (prctl_set != prctl_get) {
+ ksft_print_msg("Error: prctl_set: 0x%lx != prctl_get: 0x%lx\n",
+ prctl_set, prctl_get);
+ goto fail;
+ }
+ }
+
+ return (void *)KSFT_PASS;
+
+fail:
+ return (void *)KSFT_FAIL;
+}
+
+int execute_test(pid_t pid)
+{
+ pthread_t thread_id[MAX_THREADS];
+ int thread_data[MAX_THREADS];
+
+ for (int i = 0; i < MAX_THREADS; i++)
+ pthread_create(&thread_id[i], NULL,
+ execute_thread, (void *)&pid);
+
+ for (int i = 0; i < MAX_THREADS; i++)
+ pthread_join(thread_id[i], (void *)&thread_data[i]);
+
+ for (int i = 0; i < MAX_THREADS; i++)
+ if (thread_data[i] == KSFT_FAIL)
+ return KSFT_FAIL;
+
+ return KSFT_PASS;
+}
+
+int mte_gcr_fork_test(void)
+{
+ pid_t pid;
+ int results[NUM_ITERATIONS];
+ pid_t cpid;
+ int res;
+
+ for (int i = 0; i < NUM_ITERATIONS; i++) {
+ pid = fork();
+
+ if (pid < 0)
+ return KSFT_FAIL;
+
+ if (pid == 0) {
+ cpid = getpid();
+
+ res = execute_test(cpid);
+
+ exit(res);
+ }
+ }
+
+ for (int i = 0; i < NUM_ITERATIONS; i++) {
+ wait(&res);
+
+ if (WIFEXITED(res))
+ results[i] = WEXITSTATUS(res);
+ else
+ --i;
+ }
+
+ for (int i = 0; i < NUM_ITERATIONS; i++)
+ if (results[i] == KSFT_FAIL)
+ return KSFT_FAIL;
+
+ return KSFT_PASS;
+}
+
+int main(int argc, char *argv[])
+{
+ int err;
+
+ err = mte_default_setup();
+ if (err)
+ return err;
+
+ ksft_set_plan(1);
+
+ evaluate_test(mte_gcr_fork_test(),
+ "Verify that GCR_EL1 is set correctly on context switch\n");
+
+ mte_restore_setup();
+ ksft_print_cnts();
+
+ return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL;
+}
+
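The test above sets PR_SET_TAGGED_ADDR_CTRL repeatedly from several threads and reads the value back, checking that each thread's tag-check mode and tag mask (from which the kernel programs GCR_EL1 for that thread) survive context switches. A tiny helper for composing that control word, reusing the prctl bits defined in the test; illustrative only, not part of the selftest API:

static unsigned long mte_ctrl_word(unsigned long tcf, unsigned int tag_mask)
{
        /* tcf is one of PR_MTE_TCF_{NONE,SYNC,ASYNC}; tag_mask is the 16-bit
         * value placed in the PR_MTE_TAG_MASK field */
        return PR_TAGGED_ADDR_ENABLE | tcf |
               (((unsigned long)tag_mask & 0xffff) << PR_MTE_TAG_SHIFT);
}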
diff --git a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
index 8f82f99f7748..ad7fabd575f9 100644
--- a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
+++ b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
+#include <asm/unistd.h>
+#include <linux/time_types.h>
#include <poll.h>
#include <unistd.h>
#include <assert.h>
@@ -21,6 +23,19 @@ struct epoll_mtcontext
pthread_t waiter;
};
+#ifndef __NR_epoll_pwait2
+#define __NR_epoll_pwait2 -1
+#endif
+
+static inline int sys_epoll_pwait2(int fd, struct epoll_event *events,
+ int maxevents,
+ const struct __kernel_timespec *timeout,
+ const sigset_t *sigset, size_t sigsetsize)
+{
+ return syscall(__NR_epoll_pwait2, fd, events, maxevents, timeout,
+ sigset, sigsetsize);
+}
+
static void signal_handler(int signum)
{
}
@@ -3377,4 +3392,61 @@ TEST(epoll61)
close(ctx.evfd);
}
+/* Equivalent to basic test epoll1, but exercising epoll_pwait2. */
+TEST(epoll62)
+{
+ int efd;
+ int sfd[2];
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ EXPECT_EQ(sys_epoll_pwait2(efd, &e, 1, NULL, NULL, 0), 1);
+ EXPECT_EQ(sys_epoll_pwait2(efd, &e, 1, NULL, NULL, 0), 1);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/* Epoll_pwait2 basic timeout test. */
+TEST(epoll63)
+{
+ const int cfg_delay_ms = 10;
+ unsigned long long tdiff;
+ struct __kernel_timespec ts;
+ int efd;
+ int sfd[2];
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ ts.tv_sec = 0;
+ ts.tv_nsec = cfg_delay_ms * 1000 * 1000;
+
+ tdiff = msecs();
+ EXPECT_EQ(sys_epoll_pwait2(efd, &e, 1, &ts, NULL, 0), 0);
+ tdiff = msecs() - tdiff;
+
+ EXPECT_GE(tdiff, cfg_delay_ms);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
TEST_HARNESS_MAIN
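The new epoll62/epoll63 cases exercise epoll_pwait2(), which differs from epoll_pwait() only in taking a struct __kernel_timespec timeout instead of an int count of milliseconds, so sub-millisecond waits become expressible. A hedged helper for building such a timeout (illustrative name, relying on the linux/time_types.h include added above):

static inline struct __kernel_timespec ns_to_kts(long long ns)
{
        struct __kernel_timespec ts;

        ts.tv_sec  = ns / 1000000000LL;
        ts.tv_nsec = ns % 1000000000LL;
        return ts;
}

For example, struct __kernel_timespec ts = ns_to_kts(250 * 1000); followed by sys_epoll_pwait2(efd, &e, 1, &ts, NULL, 0); waits for at most 250 microseconds.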
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index 9a35c3f6a557..c8deddc81e7a 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -21,4 +21,5 @@ va_128TBswitch
map_fixed_noreplace
write_to_hugetlbfs
hmm-tests
+memfd_secret
local_config.*
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 9a25307f6115..9af4a8dbe45c 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -34,6 +34,7 @@ TEST_GEN_FILES += khugepaged
TEST_GEN_FILES += map_fixed_noreplace
TEST_GEN_FILES += map_hugetlb
TEST_GEN_FILES += map_populate
+TEST_GEN_FILES += memfd_secret
TEST_GEN_FILES += mlock-random-test
TEST_GEN_FILES += mlock2-tests
TEST_GEN_FILES += mremap_dontunmap
@@ -133,7 +134,7 @@ warn_32bit_failure:
endif
endif
-$(OUTPUT)/mlock-random-test: LDLIBS += -lcap
+$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap
$(OUTPUT)/gup_test: ../../../../mm/gup_test.h
diff --git a/tools/testing/selftests/vm/memfd_secret.c b/tools/testing/selftests/vm/memfd_secret.c
new file mode 100644
index 000000000000..c878c2b841fc
--- /dev/null
+++ b/tools/testing/selftests/vm/memfd_secret.c
@@ -0,0 +1,296 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corporation, 2020
+ *
+ * Author: Mike Rapoport <rppt@linux.ibm.com>
+ */
+
+#define _GNU_SOURCE
+#include <sys/uio.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/ptrace.h>
+#include <sys/syscall.h>
+#include <sys/resource.h>
+#include <sys/capability.h>
+
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdio.h>
+
+#include "../kselftest.h"
+
+#define fail(fmt, ...) ksft_test_result_fail(fmt, ##__VA_ARGS__)
+#define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__)
+#define skip(fmt, ...) ksft_test_result_skip(fmt, ##__VA_ARGS__)
+
+#ifdef __NR_memfd_secret
+
+#define PATTERN 0x55
+
+static const int prot = PROT_READ | PROT_WRITE;
+static const int mode = MAP_SHARED;
+
+static unsigned long page_size;
+static unsigned long mlock_limit_cur;
+static unsigned long mlock_limit_max;
+
+static int memfd_secret(unsigned long flags)
+{
+ return syscall(__NR_memfd_secret, flags);
+}
+
+static void test_file_apis(int fd)
+{
+ char buf[64];
+
+ if ((read(fd, buf, sizeof(buf)) >= 0) ||
+ (write(fd, buf, sizeof(buf)) >= 0) ||
+ (pread(fd, buf, sizeof(buf), 0) >= 0) ||
+ (pwrite(fd, buf, sizeof(buf), 0) >= 0))
+ fail("unexpected file IO\n");
+ else
+ pass("file IO is blocked as expected\n");
+}
+
+static void test_mlock_limit(int fd)
+{
+ size_t len;
+ char *mem;
+
+ len = mlock_limit_cur;
+ mem = mmap(NULL, len, prot, mode, fd, 0);
+ if (mem == MAP_FAILED) {
+ fail("unable to mmap secret memory\n");
+ return;
+ }
+ munmap(mem, len);
+
+ len = mlock_limit_max * 2;
+ mem = mmap(NULL, len, prot, mode, fd, 0);
+ if (mem != MAP_FAILED) {
+ fail("unexpected mlock limit violation\n");
+ munmap(mem, len);
+ return;
+ }
+
+ pass("mlock limit is respected\n");
+}
+
+static void try_process_vm_read(int fd, int pipefd[2])
+{
+ struct iovec liov, riov;
+ char buf[64];
+ char *mem;
+
+ if (read(pipefd[0], &mem, sizeof(mem)) < 0) {
+ fail("pipe write: %s\n", strerror(errno));
+ exit(KSFT_FAIL);
+ }
+
+ liov.iov_len = riov.iov_len = sizeof(buf);
+ liov.iov_base = buf;
+ riov.iov_base = mem;
+
+ if (process_vm_readv(getppid(), &liov, 1, &riov, 1, 0) < 0) {
+ if (errno == ENOSYS)
+ exit(KSFT_SKIP);
+ exit(KSFT_PASS);
+ }
+
+ exit(KSFT_FAIL);
+}
+
+static void try_ptrace(int fd, int pipefd[2])
+{
+ pid_t ppid = getppid();
+ int status;
+ char *mem;
+ long ret;
+
+ if (read(pipefd[0], &mem, sizeof(mem)) < 0) {
+ perror("pipe write");
+ exit(KSFT_FAIL);
+ }
+
+ ret = ptrace(PTRACE_ATTACH, ppid, 0, 0);
+ if (ret) {
+ perror("ptrace_attach");
+ exit(KSFT_FAIL);
+ }
+
+ ret = waitpid(ppid, &status, WUNTRACED);
+ if ((ret != ppid) || !(WIFSTOPPED(status))) {
+ fprintf(stderr, "weird waitppid result %ld stat %x\n",
+ ret, status);
+ exit(KSFT_FAIL);
+ }
+
+ if (ptrace(PTRACE_PEEKDATA, ppid, mem, 0))
+ exit(KSFT_PASS);
+
+ exit(KSFT_FAIL);
+}
+
+static void check_child_status(pid_t pid, const char *name)
+{
+ int status;
+
+ waitpid(pid, &status, 0);
+
+ if (WIFEXITED(status) && WEXITSTATUS(status) == KSFT_SKIP) {
+ skip("%s is not supported\n", name);
+ return;
+ }
+
+ if ((WIFEXITED(status) && WEXITSTATUS(status) == KSFT_PASS) ||
+ WIFSIGNALED(status)) {
+ pass("%s is blocked as expected\n", name);
+ return;
+ }
+
+ fail("%s: unexpected memory access\n", name);
+}
+
+static void test_remote_access(int fd, const char *name,
+ void (*func)(int fd, int pipefd[2]))
+{
+ int pipefd[2];
+ pid_t pid;
+ char *mem;
+
+ if (pipe(pipefd)) {
+ fail("pipe failed: %s\n", strerror(errno));
+ return;
+ }
+
+ pid = fork();
+ if (pid < 0) {
+ fail("fork failed: %s\n", strerror(errno));
+ return;
+ }
+
+ if (pid == 0) {
+ func(fd, pipefd);
+ return;
+ }
+
+ mem = mmap(NULL, page_size, prot, mode, fd, 0);
+ if (mem == MAP_FAILED) {
+ fail("Unable to mmap secret memory\n");
+ return;
+ }
+
+ ftruncate(fd, page_size);
+ memset(mem, PATTERN, page_size);
+
+ if (write(pipefd[1], &mem, sizeof(mem)) < 0) {
+ fail("pipe write: %s\n", strerror(errno));
+ return;
+ }
+
+ check_child_status(pid, name);
+}
+
+static void test_process_vm_read(int fd)
+{
+ test_remote_access(fd, "process_vm_read", try_process_vm_read);
+}
+
+static void test_ptrace(int fd)
+{
+ test_remote_access(fd, "ptrace", try_ptrace);
+}
+
+static int set_cap_limits(rlim_t max)
+{
+ struct rlimit new;
+ cap_t cap = cap_init();
+
+ new.rlim_cur = max;
+ new.rlim_max = max;
+ if (setrlimit(RLIMIT_MEMLOCK, &new)) {
+ perror("setrlimit() returns error");
+ return -1;
+ }
+
+ /* drop capabilities including CAP_IPC_LOCK */
+ if (cap_set_proc(cap)) {
+ perror("cap_set_proc() returns error");
+ return -2;
+ }
+
+ return 0;
+}
+
+static void prepare(void)
+{
+ struct rlimit rlim;
+
+ page_size = sysconf(_SC_PAGE_SIZE);
+ if (!page_size)
+ ksft_exit_fail_msg("Failed to get page size %s\n",
+ strerror(errno));
+
+ if (getrlimit(RLIMIT_MEMLOCK, &rlim))
+ ksft_exit_fail_msg("Unable to detect mlock limit: %s\n",
+ strerror(errno));
+
+ mlock_limit_cur = rlim.rlim_cur;
+ mlock_limit_max = rlim.rlim_max;
+
+ printf("page_size: %ld, mlock.soft: %ld, mlock.hard: %ld\n",
+ page_size, mlock_limit_cur, mlock_limit_max);
+
+ if (page_size > mlock_limit_cur)
+ mlock_limit_cur = page_size;
+ if (page_size > mlock_limit_max)
+ mlock_limit_max = page_size;
+
+ if (set_cap_limits(mlock_limit_max))
+ ksft_exit_fail_msg("Unable to set mlock limit: %s\n",
+ strerror(errno));
+}
+
+#define NUM_TESTS 4
+
+int main(int argc, char *argv[])
+{
+ int fd;
+
+ prepare();
+
+ ksft_print_header();
+ ksft_set_plan(NUM_TESTS);
+
+ fd = memfd_secret(0);
+ if (fd < 0) {
+ if (errno == ENOSYS)
+ ksft_exit_skip("memfd_secret is not supported\n");
+ else
+ ksft_exit_fail_msg("memfd_secret failed: %s\n",
+ strerror(errno));
+ }
+
+ test_mlock_limit(fd);
+ test_file_apis(fd);
+ test_process_vm_read(fd);
+ test_ptrace(fd);
+
+ close(fd);
+
+ ksft_exit(!ksft_get_fail_cnt());
+}
+
+#else /* __NR_memfd_secret */
+
+int main(int argc, char *argv[])
+{
+ printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n");
+ return KSFT_SKIP;
+}
+
+#endif /* __NR_memfd_secret */
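For contrast with the negative tests above, a minimal happy-path sketch of the new syscall (assuming __NR_memfd_secret is defined and a single page fits within the caller's mlock limit): create the descriptor, size it with ftruncate(), map it, and use the mapping like ordinary anonymous memory. The backing pages are removed from the kernel's direct map, so the contents are intended to be reachable only through this mapping.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

static int secret_demo(void)
{
        long page = sysconf(_SC_PAGE_SIZE);
        int fd = syscall(__NR_memfd_secret, 0);
        char *p;

        if (fd < 0)
                return -1;              /* e.g. ENOSYS if not enabled */
        if (ftruncate(fd, page) < 0)
                goto err;
        p = mmap(NULL, page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                goto err;

        strcpy(p, "visible only through this mapping");
        printf("%s\n", p);

        munmap(p, page);
        close(fd);
        return 0;
err:
        close(fd);
        return -1;
}

int main(void)
{
        return secret_demo();
}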
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index e953f3cd9664..95a67382f132 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -346,4 +346,21 @@ else
exitcode=1
fi
+echo "running memfd_secret test"
+echo "------------------------------------"
+./memfd_secret
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
+exit $exitcode
+
exit $exitcode
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index e2c197fd4f9d..62bd908ecd58 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -111,7 +111,7 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
{
struct page *page;
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!page)
return -ENOMEM;
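Both this hunk and the kvm_main.c change below switch a per-VM page allocation from GFP_KERNEL to GFP_KERNEL_ACCOUNT, so the memory is charged to the allocating task's memory cgroup. For reference (unchanged by this patch), the flag is simply GFP_KERNEL with the accounting bit set:

/* include/linux/gfp.h */
#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)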
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3abcb2ce5b7d..5f260488e999 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3156,7 +3156,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
}
BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!page) {
r = -ENOMEM;
goto vcpu_free;