From f3a53a3a1e5b3b9bc114b5b7930fbe89d9ffed5d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Jun 2018 17:05:35 -0700 Subject: mm, memcontrol: implement memory.swap.events Add swap max and fail events so that userland can monitor and respond to running out of swap. I'm not too sure about the fail event. Right now, it's a bit confusing which stats / events are recursive and which aren't and also which ones reflect events which originate from a given cgroup and which targets the cgroup. No idea what the right long term solution is and it could just be that growing them organically is actually the only right thing to do. Link: http://lkml.kernel.org/r/20180416231151.GI1911913@devbig577.frc2.facebook.com Signed-off-by: Tejun Heo Reviewed-by: Andrew Morton Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Rik van Riel Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'Documentation') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 74cdeaed9f7a..b0dda10b9382 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1199,6 +1199,22 @@ PAGE_SIZE multiple when read back. Swap usage hard limit. If a cgroup's swap usage reaches this limit, anonymous memory of the cgroup will not be swapped out. + memory.swap.events + A read-only flat-keyed file which exists on non-root cgroups. + The following entries are defined. Unless specified + otherwise, a value change in this file generates a file + modified event. + + max + The number of times the cgroup's swap usage was about + to go over the max boundary and swap allocation + failed. + + fail + The number of times swap allocation failed either + because of running out of swap system-wide or max + limit. + Usage Guidelines ~~~~~~~~~~~~~~~~ -- cgit v1.2.3 From 89e85bce4b02edb7408aebf69d5d1a6692a05f4f Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 7 Jun 2018 17:05:42 -0700 Subject: zram: mark incompressible page as ZRAM_HUGE Mark incompressible pages so that we could investigate who is the owner of the incompressible pages once the page is swapped out via using upcoming zram memory tracker feature. With it, we could prevent such pages to be swapped out by using mlock. Otherwise we might remove them. This patch exposes new stat for huge pages via mm_stat. Link: http://lkml.kernel.org/r/20180416090946.63057-3-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/blockdev/zram.txt | 1 + drivers/block/zram/zram_drv.c | 17 ++++++++++++++--- drivers/block/zram/zram_drv.h | 2 ++ 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 257e65714c6a..78db38d02bc9 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -218,6 +218,7 @@ line of text and contains the following stats separated by whitespace: same_pages the number of same element filled pages written to this disk. No memory is allocated for such pages. pages_compacted the number of pages freed during compaction + huge_pages the number of incompressible pages 9) Deactivate: swapoff /dev/zram0 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 18dadeab775b..777fb3339f59 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -729,14 +729,15 @@ static ssize_t mm_stat_show(struct device *dev, max_used = atomic_long_read(&zram->stats.max_used_pages); ret = scnprintf(buf, PAGE_SIZE, - "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n", + "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu\n", orig_size << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.compr_data_size), mem_used << PAGE_SHIFT, zram->limit_pages << PAGE_SHIFT, max_used << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.same_pages), - pool_stats.pages_compacted); + pool_stats.pages_compacted, + (u64)atomic64_read(&zram->stats.huge_pages)); up_read(&zram->init_lock); return ret; @@ -805,6 +806,11 @@ static void zram_free_page(struct zram *zram, size_t index) { unsigned long handle; + if (zram_test_flag(zram, index, ZRAM_HUGE)) { + zram_clear_flag(zram, index, ZRAM_HUGE); + atomic64_dec(&zram->stats.huge_pages); + } + if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) { zram_wb_clear(zram, index); atomic64_dec(&zram->stats.pages_stored); @@ -973,6 +979,7 @@ compress_again: } if (unlikely(comp_len >= huge_class_size)) { + comp_len = PAGE_SIZE; if (zram_wb_enabled(zram) && allow_wb) { zcomp_stream_put(zram->comp); ret = write_to_bdev(zram, bvec, index, bio, &element); @@ -984,7 +991,6 @@ compress_again: allow_wb = false; goto compress_again; } - comp_len = PAGE_SIZE; } /* @@ -1046,6 +1052,11 @@ out: zram_slot_lock(zram, index); zram_free_page(zram, index); + if (comp_len == PAGE_SIZE) { + zram_set_flag(zram, index, ZRAM_HUGE); + atomic64_inc(&zram->stats.huge_pages); + } + if (flags) { zram_set_flag(zram, index, flags); zram_set_element(zram, index, element); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 8d8959ceabd1..ff0547bdb586 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -47,6 +47,7 @@ enum zram_pageflags { ZRAM_LOCK = ZRAM_FLAG_SHIFT, ZRAM_SAME, /* Page consists the same element */ ZRAM_WB, /* page is stored on backing_device */ + ZRAM_HUGE, /* Incompressible page */ __NR_ZRAM_PAGEFLAGS, }; @@ -71,6 +72,7 @@ struct zram_stats { atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ atomic64_t same_pages; /* no. of same element filled pages */ + atomic64_t huge_pages; /* no. of huge pages */ atomic64_t pages_stored; /* no. of pages currently stored */ atomic_long_t max_used_pages; /* no. of maximum pages stored */ atomic64_t writestall; /* no. of write slow paths */ -- cgit v1.2.3 From c0265342bff4fcaa2cdf13f4596244c18d4a7ae5 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 7 Jun 2018 17:05:49 -0700 Subject: zram: introduce zram memory tracking zRam as swap is useful for small memory device. However, swap means those pages on zram are mostly cold pages due to VM's LRU algorithm. Especially, once init data for application are touched for launching, they tend to be not accessed any more and finally swapped out. zRAM can store such cold pages as compressed form but it's pointless to keep in memory. Better idea is app developers free them directly rather than remaining them on heap. This patch tell us last access time of each block of zram via "cat /sys/kernel/debug/zram/zram0/block_state". The output is as follows, 300 75.033841 .wh 301 63.806904 s.. 302 63.806919 ..h First column is zram's block index and 3rh one represents symbol (s: same page w: written page to backing store h: huge page) of the block state. Second column represents usec time unit of the block was last accessed. So above example means the 300th block is accessed at 75.033851 second and it was huge so it was written to the backing store. Admin can leverage this information to catch cold|incompressible pages of process with *pagemap* once part of heaps are swapped out. I used the feature a few years ago to find memory hoggers in userspace to notify them what memory they have wasted without touch for a long time. With it, they could reduce unnecessary memory space. However, at that time, I hacked up zram for the feature but now I need the feature again so I decided it would be better to upstream rather than keeping it alone. I hope I submit the userspace tool to use the feature soon. [akpm@linux-foundation.org: fix i386 printk warning] [minchan@kernel.org: use ktime_get_boottime() instead of sched_clock()] Link: http://lkml.kernel.org/r/20180420063525.GA253739@rodete-desktop-imager.corp.google.com [akpm@linux-foundation.org: documentation tweak] [akpm@linux-foundation.org: fix i386 printk warning] [minchan@kernel.org: fix compile warning] Link: http://lkml.kernel.org/r/20180508104849.GA8209@rodete-desktop-imager.corp.google.com [rdunlap@infradead.org: fix printk formats] Link: http://lkml.kernel.org/r/3652ccb1-96ef-0b0b-05d1-f661d7733dcc@infradead.org Link: http://lkml.kernel.org/r/20180416090946.63057-5-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Randy Dunlap Reviewed-by: Sergey Senozhatsky Acked-by: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/blockdev/zram.txt | 24 ++++++++ drivers/block/zram/Kconfig | 14 ++++- drivers/block/zram/zram_drv.c | 132 ++++++++++++++++++++++++++++++++++++---- drivers/block/zram/zram_drv.h | 7 ++- 4 files changed, 163 insertions(+), 14 deletions(-) (limited to 'Documentation') diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 78db38d02bc9..875b2b56b87f 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -243,5 +243,29 @@ to backing storage rather than keeping it in memory. User should set up backing device via /sys/block/zramX/backing_dev before disksize setting. += memory tracking + +With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the +zram block. It could be useful to catch cold or incompressible +pages of the process with*pagemap. +If you enable the feature, you could see block state via +/sys/kernel/debug/zram/zram0/block_state". The output is as follows, + + 300 75.033841 .wh + 301 63.806904 s.. + 302 63.806919 ..h + +First column is zram's block index. +Second column is access time since the system was booted +Third column is state of the block. +(s: same page +w: written page to backing store +h: huge page) + +First line of above example says 300th block is accessed at 75.033841sec +and the block's state is huge so it is written back to the backing +storage. It's a debugging feature so anyone shouldn't rely on it to work +properly. + Nitin Gupta ngupta@vflare.org diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index ac3a31d433b2..635235759a0a 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -13,7 +13,7 @@ config ZRAM It has several use cases, for example: /tmp storage, use as swap disks and maybe many more. - See zram.txt for more information. + See Documentation/blockdev/zram.txt for more information. config ZRAM_WRITEBACK bool "Write back incompressible page to backing device" @@ -25,4 +25,14 @@ config ZRAM_WRITEBACK For this feature, admin should set up backing device via /sys/block/zramX/backing_dev. - See zram.txt for more infomration. + See Documentation/blockdev/zram.txt for more information. + +config ZRAM_MEMORY_TRACKING + bool "Track zRam block status" + depends on ZRAM && DEBUG_FS + help + With this feature, admin can track the state of allocated blocks + of zRAM. Admin could see the information via + /sys/kernel/debug/zram/zramX/block_state. + + See Documentation/blockdev/zram.txt for more information. diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 7fc10e2ad734..da51293e7c03 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "zram_drv.h" @@ -67,6 +68,13 @@ static inline bool init_done(struct zram *zram) return zram->disksize; } +static inline bool zram_allocated(struct zram *zram, u32 index) +{ + + return (zram->table[index].value >> (ZRAM_FLAG_SHIFT + 1)) || + zram->table[index].handle; +} + static inline struct zram *dev_to_zram(struct device *dev) { return (struct zram *)dev_to_disk(dev)->private_data; @@ -83,7 +91,7 @@ static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) } /* flag operations require table entry bit_spin_lock() being held */ -static int zram_test_flag(struct zram *zram, u32 index, +static bool zram_test_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { return zram->table[index].value & BIT(flag); @@ -107,16 +115,6 @@ static inline void zram_set_element(struct zram *zram, u32 index, zram->table[index].element = element; } -static void zram_accessed(struct zram *zram, u32 index) -{ - zram->table[index].ac_time = sched_clock(); -} - -static void zram_reset_access(struct zram *zram, u32 index) -{ - zram->table[index].ac_time = 0; -} - static unsigned long zram_get_element(struct zram *zram, u32 index) { return zram->table[index].element; @@ -620,6 +618,114 @@ static int read_from_bdev(struct zram *zram, struct bio_vec *bvec, static void zram_wb_clear(struct zram *zram, u32 index) {} #endif +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + +static struct dentry *zram_debugfs_root; + +static void zram_debugfs_create(void) +{ + zram_debugfs_root = debugfs_create_dir("zram", NULL); +} + +static void zram_debugfs_destroy(void) +{ + debugfs_remove_recursive(zram_debugfs_root); +} + +static void zram_accessed(struct zram *zram, u32 index) +{ + zram->table[index].ac_time = ktime_get_boottime(); +} + +static void zram_reset_access(struct zram *zram, u32 index) +{ + zram->table[index].ac_time = 0; +} + +static ssize_t read_block_state(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *kbuf; + ssize_t index, written = 0; + struct zram *zram = file->private_data; + unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; + struct timespec64 ts; + + kbuf = kvmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + kvfree(kbuf); + return -EINVAL; + } + + for (index = *ppos; index < nr_pages; index++) { + int copied; + + zram_slot_lock(zram, index); + if (!zram_allocated(zram, index)) + goto next; + + ts = ktime_to_timespec64(zram->table[index].ac_time); + copied = snprintf(kbuf + written, count, + "%12zd %12lld.%06lu %c%c%c\n", + index, (s64)ts.tv_sec, + ts.tv_nsec / NSEC_PER_USEC, + zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.', + zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.', + zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.'); + + if (count < copied) { + zram_slot_unlock(zram, index); + break; + } + written += copied; + count -= copied; +next: + zram_slot_unlock(zram, index); + *ppos += 1; + } + + up_read(&zram->init_lock); + if (copy_to_user(buf, kbuf, written)) + written = -EFAULT; + kvfree(kbuf); + + return written; +} + +static const struct file_operations proc_zram_block_state_op = { + .open = simple_open, + .read = read_block_state, + .llseek = default_llseek, +}; + +static void zram_debugfs_register(struct zram *zram) +{ + if (!zram_debugfs_root) + return; + + zram->debugfs_dir = debugfs_create_dir(zram->disk->disk_name, + zram_debugfs_root); + debugfs_create_file("block_state", 0400, zram->debugfs_dir, + zram, &proc_zram_block_state_op); +} + +static void zram_debugfs_unregister(struct zram *zram) +{ + debugfs_remove_recursive(zram->debugfs_dir); +} +#else +static void zram_debugfs_create(void) {}; +static void zram_debugfs_destroy(void) {}; +static void zram_accessed(struct zram *zram, u32 index) {}; +static void zram_reset_access(struct zram *zram, u32 index) {}; +static void zram_debugfs_register(struct zram *zram) {}; +static void zram_debugfs_unregister(struct zram *zram) {}; +#endif /* * We switched to per-cpu streams and this attr is not needed anymore. @@ -1604,6 +1710,7 @@ static int zram_add(void) } strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); + zram_debugfs_register(zram); pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; @@ -1637,6 +1744,7 @@ static int zram_remove(struct zram *zram) zram->claim = true; mutex_unlock(&bdev->bd_mutex); + zram_debugfs_unregister(zram); /* * Remove sysfs first, so no one will perform a disksize * store while we destroy the devices. This also helps during @@ -1739,6 +1847,7 @@ static void destroy_devices(void) { class_unregister(&zram_control_class); idr_for_each(&zram_index_idr, &zram_remove_cb, NULL); + zram_debugfs_destroy(); idr_destroy(&zram_index_idr); unregister_blkdev(zram_major, "zram"); cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE); @@ -1760,6 +1869,7 @@ static int __init zram_init(void) return ret; } + zram_debugfs_create(); zram_major = register_blkdev(0, "zram"); if (zram_major <= 0) { pr_err("Unable to get major number\n"); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 1075218e88b2..72c8584b6dff 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -61,7 +61,9 @@ struct zram_table_entry { unsigned long element; }; unsigned long value; - u64 ac_time; +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + ktime_t ac_time; +#endif }; struct zram_stats { @@ -110,5 +112,8 @@ struct zram { unsigned long nr_pages; spinlock_t bitmap_lock; #endif +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + struct dentry *debugfs_dir; +#endif }; #endif -- cgit v1.2.3 From 3010a5ea665a089361e435093bd737399123fcc4 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Thu, 7 Jun 2018 17:06:08 -0700 Subject: mm: introduce ARCH_HAS_PTE_SPECIAL Currently the PTE special supports is turned on in per architecture header files. Most of the time, it is defined in arch/*/include/asm/pgtable.h depending or not on some other per architecture static definition. This patch introduce a new configuration variable to manage this directly in the Kconfig files. It would later replace __HAVE_ARCH_PTE_SPECIAL. Here notes for some architecture where the definition of __HAVE_ARCH_PTE_SPECIAL is not obvious: arm __HAVE_ARCH_PTE_SPECIAL which is currently defined in arch/arm/include/asm/pgtable-3level.h which is included by arch/arm/include/asm/pgtable.h when CONFIG_ARM_LPAE is set. So select ARCH_HAS_PTE_SPECIAL if ARM_LPAE. powerpc __HAVE_ARCH_PTE_SPECIAL is defined in 2 files: - arch/powerpc/include/asm/book3s/64/pgtable.h - arch/powerpc/include/asm/pte-common.h The first one is included if (PPC_BOOK3S & PPC64) while the second is included in all the other cases. So select ARCH_HAS_PTE_SPECIAL all the time. sparc: __HAVE_ARCH_PTE_SPECIAL is defined if defined(__sparc__) && defined(__arch64__) which are defined through the compiler in sparc/Makefile if !SPARC32 which I assume to be if SPARC64. So select ARCH_HAS_PTE_SPECIAL if SPARC64 There is no functional change introduced by this patch. Link: http://lkml.kernel.org/r/1523433816-14460-2-git-send-email-ldufour@linux.vnet.ibm.com Signed-off-by: Laurent Dufour Suggested-by: Jerome Glisse Reviewed-by: Jerome Glisse Acked-by: David Rientjes Cc: Michal Hocko Cc: "Aneesh Kumar K . V" Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Jonathan Corbet Cc: Catalin Marinas Cc: Will Deacon Cc: Yoshinori Sato Cc: Rich Felker Cc: David S. Miller Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Vineet Gupta Cc: Palmer Dabbelt Cc: Albert Ou Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David Rientjes Cc: Robin Murphy Cc: Christophe LEROY Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/features/vm/pte_special/arch-support.txt | 2 +- arch/arc/Kconfig | 1 + arch/arc/include/asm/pgtable.h | 2 -- arch/arm/Kconfig | 1 + arch/arm/include/asm/pgtable-3level.h | 1 - arch/arm64/Kconfig | 1 + arch/arm64/include/asm/pgtable.h | 2 -- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/book3s/64/pgtable.h | 3 --- arch/powerpc/include/asm/pte-common.h | 3 --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/pgtable-bits.h | 3 --- arch/s390/Kconfig | 1 + arch/s390/include/asm/pgtable.h | 1 - arch/sh/Kconfig | 1 + arch/sh/include/asm/pgtable.h | 2 -- arch/sparc/Kconfig | 1 + arch/sparc/include/asm/pgtable_64.h | 3 --- arch/x86/Kconfig | 1 + arch/x86/include/asm/pgtable_types.h | 1 - include/linux/pfn_t.h | 4 ++-- mm/Kconfig | 3 +++ mm/gup.c | 4 ++-- mm/memory.c | 2 +- 24 files changed, 18 insertions(+), 27 deletions(-) (limited to 'Documentation') diff --git a/Documentation/features/vm/pte_special/arch-support.txt b/Documentation/features/vm/pte_special/arch-support.txt index 6a608a6dcf71..a8378424bc98 100644 --- a/Documentation/features/vm/pte_special/arch-support.txt +++ b/Documentation/features/vm/pte_special/arch-support.txt @@ -1,6 +1,6 @@ # # Feature name: pte_special -# Kconfig: __HAVE_ARCH_PTE_SPECIAL +# Kconfig: ARCH_HAS_PTE_SPECIAL # description: arch supports the pte_special()/pte_mkspecial() VM APIs # ----------------------- diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 89d47eac18b2..e81bcd271be7 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -48,6 +48,7 @@ config ARC select HAVE_GENERIC_DMA_COHERENT select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZMA + select ARCH_HAS_PTE_SPECIAL config MIGHT_HAVE_PCI bool diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 08fe33830d4b..8ec5599a0957 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -320,8 +320,6 @@ PTE_BIT_FUNC(mkexec, |= (_PAGE_EXECUTE)); PTE_BIT_FUNC(mkspecial, |= (_PAGE_SPECIAL)); PTE_BIT_FUNC(mkhuge, |= (_PAGE_HW_SZ)); -#define __HAVE_ARCH_PTE_SPECIAL - static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 8f460bdd4be1..534563ac7f5f 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -8,6 +8,7 @@ config ARM select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE + select ARCH_HAS_PTE_SPECIAL if ARM_LPAE select ARCH_HAS_SET_MEMORY select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 2a4836087358..6d50a11d7793 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -219,7 +219,6 @@ static inline pte_t pte_mkspecial(pte_t pte) pte_val(pte) |= L_PTE_SPECIAL; return pte; } -#define __HAVE_ARCH_PTE_SPECIAL #define pmd_write(pmd) (pmd_isclear((pmd), L_PMD_SECT_RDONLY)) #define pmd_dirty(pmd) (pmd_isset((pmd), L_PMD_SECT_DIRTY)) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b25ed7834f6c..4759566a78cb 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -17,6 +17,7 @@ config ARM64 select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 7c4c8f318ba9..9f82d6b53851 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -306,8 +306,6 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) -#define __HAVE_ARCH_PTE_SPECIAL - static inline pte_t pgd_pte(pgd_t pgd) { return __pte(pgd_val(pgd)); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 076fe3094856..8f959df2de7a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -135,6 +135,7 @@ config PPC select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PMEM_API if PPC64 + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE select ARCH_HAS_SG_CHAIN diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 42fe7c2ff2df..63cee159022b 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -335,9 +335,6 @@ extern unsigned long pci_io_base; /* Advertise special mapping type for AGP */ #define HAVE_PAGE_AGP -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - #ifndef __ASSEMBLY__ /* diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 050b0d775324..bef56141a549 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -208,9 +208,6 @@ static inline bool pte_user(pte_t pte) #define PAGE_AGP (PAGE_KERNEL_NC) #define HAVE_PAGE_AGP -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - #ifndef _PAGE_READ /* if not defined, we should not find _PAGE_WRITE too */ #define _PAGE_READ 0 diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 274bc064c41f..17f19e67993b 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -42,6 +42,7 @@ config RISCV select THREAD_INFO_IN_TASK select RISCV_TIMER select GENERIC_IRQ_MULTI_HANDLER + select ARCH_HAS_PTE_SPECIAL config MMU def_bool y diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h index 997ddbb1d370..2fa2942be221 100644 --- a/arch/riscv/include/asm/pgtable-bits.h +++ b/arch/riscv/include/asm/pgtable-bits.h @@ -42,7 +42,4 @@ _PAGE_WRITE | _PAGE_EXEC | \ _PAGE_USER | _PAGE_GLOBAL)) -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - #endif /* _ASM_RISCV_PGTABLE_BITS_H */ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b7deee7e738f..baed39772c84 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -65,6 +65,7 @@ config S390 select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 2d24d33bf188..9809694e1389 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -171,7 +171,6 @@ static inline int is_module_addr(void *addr) #define _PAGE_WRITE 0x020 /* SW pte write bit */ #define _PAGE_SPECIAL 0x040 /* SW associated with special page */ #define _PAGE_UNUSED 0x080 /* SW bit for pgste usage state */ -#define __HAVE_ARCH_PTE_SPECIAL #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SOFT_DIRTY 0x002 /* SW pte soft dirty bit */ diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index ae619d54018c..4d61a085982b 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 config SUPERH def_bool y + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_NO_COHERENT_DMA_MMAP if !MMU diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 89c513a982fc..f6abfe2bca93 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -156,8 +156,6 @@ extern void page_table_range_init(unsigned long start, unsigned long end, #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#define __HAVE_ARCH_PTE_SPECIAL - #include #endif /* __ASM_SH_PGTABLE_H */ diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index b42ba888217d..9a2b8877f174 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -88,6 +88,7 @@ config SPARC64 select ARCH_USE_QUEUED_SPINLOCKS select GENERIC_TIME_VSYSCALL select ARCH_CLOCKSOURCE_DATA + select ARCH_HAS_PTE_SPECIAL config ARCH_DEFCONFIG string diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 44d6ac47e035..1393a8ac596b 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -117,9 +117,6 @@ bool kern_addr_valid(unsigned long addr); #define _PAGE_PMD_HUGE _AC(0x0100000000000000,UL) /* Huge page */ #define _PAGE_PUD_HUGE _PAGE_PMD_HUGE -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - /* SUN4U pte bits... */ #define _PAGE_SZ4MB_4U _AC(0x6000000000000000,UL) /* 4MB Page */ #define _PAGE_SZ512K_4U _AC(0x4000000000000000,UL) /* 512K Page */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cb6e3a219294..f182a4e8e5bd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -60,6 +60,7 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_UACCESS_MCSAFE if X86_64 diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 1e5a40673953..99fff853c944 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -65,7 +65,6 @@ #define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0)) #define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0)) #endif -#define __HAVE_ARCH_PTE_SPECIAL #define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \ _PAGE_PKEY_BIT1 | \ diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index a03c2642a87c..21713dc14ce2 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -122,7 +122,7 @@ pud_t pud_mkdevmap(pud_t pud); #endif #endif /* __HAVE_ARCH_PTE_DEVMAP */ -#ifdef __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static inline bool pfn_t_special(pfn_t pfn) { return (pfn.val & PFN_SPECIAL) == PFN_SPECIAL; @@ -132,5 +132,5 @@ static inline bool pfn_t_special(pfn_t pfn) { return false; } -#endif /* __HAVE_ARCH_PTE_SPECIAL */ +#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #endif /* _LINUX_PFN_T_H_ */ diff --git a/mm/Kconfig b/mm/Kconfig index 3e0b6e87f65d..00bffa7a5112 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -754,3 +754,6 @@ config GUP_BENCHMARK performance of get_user_pages_fast(). See tools/testing/selftests/vm/gup_benchmark.c + +config ARCH_HAS_PTE_SPECIAL + bool diff --git a/mm/gup.c b/mm/gup.c index 541904a7c60f..010153989b9b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1354,7 +1354,7 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) } } -#ifdef __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -1430,7 +1430,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, { return 0; } -#endif /* __HAVE_ARCH_PTE_SPECIAL */ +#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index 6a97893a8de7..42d2d4366c11 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -817,7 +817,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * PFNMAP mappings in order to support COWable mappings. * */ -#ifdef __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL # define HAVE_PTE_SPECIAL 1 #else # define HAVE_PTE_SPECIAL 0 -- cgit v1.2.3 From 7854207fe9545181b048df4e684def36306a86ec Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 7 Jun 2018 17:06:29 -0700 Subject: mm/docs: describe memory.low refinements Refine cgroup v2 docs after latest memory.low changes. Link: http://lkml.kernel.org/r/20180405185921.4942-4-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index b0dda10b9382..7b56ca80e37a 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1005,10 +1005,17 @@ PAGE_SIZE multiple when read back. A read-write single value file which exists on non-root cgroups. The default is "0". - Best-effort memory protection. If the memory usages of a - cgroup and all its ancestors are below their low boundaries, - the cgroup's memory won't be reclaimed unless memory can be - reclaimed from unprotected cgroups. + Best-effort memory protection. If the memory usage of a + cgroup is within its effective low boundary, the cgroup's + memory won't be reclaimed unless memory can be reclaimed + from unprotected cgroups. + + Effective low boundary is limited by memory.low values of + all ancestor cgroups. If there is memory.low overcommitment + (child cgroup or cgroups are requiring more protected memory, + than parent will allow), then each child cgroup will get + the part of parent's protection proportional to the its + actual memory usage below memory.low. Putting more memory than generally available under this protection is discouraged. @@ -1950,17 +1957,8 @@ system performance due to overreclaim, to the point where the feature becomes self-defeating. The memory.low boundary on the other hand is a top-down allocated -reserve. A cgroup enjoys reclaim protection when it and all its -ancestors are below their low boundaries, which makes delegation of -subtrees possible. Secondly, new cgroups have no reserve per default -and in the common case most cgroups are eligible for the preferred -reclaim pass. This allows the new low boundary to be efficiently -implemented with just a minor addition to the generic reclaim code, -without the need for out-of-band data structures and reclaim passes. -Because the generic reclaim code considers all cgroups except for the -ones running low in the preferred first reclaim pass, overreclaim of -individual groups is eliminated as well, resulting in much better -overall workload performance. +reserve. A cgroup enjoys reclaim protection when it's within its low, +which makes delegation of subtrees possible. The original high boundary, the hard limit, is defined as a strict limit that can not budge, even if the OOM killer has to be called. -- cgit v1.2.3 From bf8d5d52ffe89aac5b46ddb39dd1a4351fae5df4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 7 Jun 2018 17:07:46 -0700 Subject: memcg: introduce memory.min Memory controller implements the memory.low best-effort memory protection mechanism, which works perfectly in many cases and allows protecting working sets of important workloads from sudden reclaim. But its semantics has a significant limitation: it works only as long as there is a supply of reclaimable memory. This makes it pretty useless against any sort of slow memory leaks or memory usage increases. This is especially true for swapless systems. If swap is enabled, memory soft protection effectively postpones problems, allowing a leaking application to fill all swap area, which makes no sense. The only effective way to guarantee the memory protection in this case is to invoke the OOM killer. It's possible to handle this case in userspace by reacting on MEMCG_LOW events; but there is still a place for a fail-safe in-kernel mechanism to provide stronger guarantees. This patch introduces the memory.min interface for cgroup v2 memory controller. It works very similarly to memory.low (sharing the same hierarchical behavior), except that it's not disabled if there is no more reclaimable memory in the system. If cgroup is not populated, its memory.min is ignored, because otherwise even the OOM killer wouldn't be able to reclaim the protected memory, and the system can stall. [guro@fb.com: s/low/min/ in docs] Link: http://lkml.kernel.org/r/20180510130758.GA9129@castle.DHCP.thefacebook.com Link: http://lkml.kernel.org/r/20180509180734.GA4856@castle.DHCP.thefacebook.com Signed-off-by: Roman Gushchin Reviewed-by: Randy Dunlap Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 27 +++++++- include/linux/memcontrol.h | 15 ++-- include/linux/page_counter.h | 11 ++- mm/memcontrol.c | 118 +++++++++++++++++++++++++------- mm/page_counter.c | 63 ++++++++++++----- mm/vmscan.c | 18 ++++- 6 files changed, 202 insertions(+), 50 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 7b56ca80e37a..e34d3c938729 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1001,6 +1001,29 @@ PAGE_SIZE multiple when read back. The total amount of memory currently being used by the cgroup and its descendants. + memory.min + A read-write single value file which exists on non-root + cgroups. The default is "0". + + Hard memory protection. If the memory usage of a cgroup + is within its effective min boundary, the cgroup's memory + won't be reclaimed under any conditions. If there is no + unprotected reclaimable memory available, OOM killer + is invoked. + + Effective min boundary is limited by memory.min values of + all ancestor cgroups. If there is memory.min overcommitment + (child cgroup or cgroups are requiring more protected memory + than parent will allow), then each child cgroup will get + the part of parent's protection proportional to its + actual memory usage below memory.min. + + Putting more memory than generally available under this + protection is discouraged and may lead to constant OOMs. + + If a memory cgroup is not populated with processes, + its memory.min is ignored. + memory.low A read-write single value file which exists on non-root cgroups. The default is "0". @@ -1012,9 +1035,9 @@ PAGE_SIZE multiple when read back. Effective low boundary is limited by memory.low values of all ancestor cgroups. If there is memory.low overcommitment - (child cgroup or cgroups are requiring more protected memory, + (child cgroup or cgroups are requiring more protected memory than parent will allow), then each child cgroup will get - the part of parent's protection proportional to the its + the part of parent's protection proportional to its actual memory usage below memory.low. Putting more memory than generally available under this diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 10d741e8fe51..9c04cf8e6487 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -58,6 +58,12 @@ enum memcg_memory_event { MEMCG_NR_MEMORY_EVENTS, }; +enum mem_cgroup_protection { + MEMCG_PROT_NONE, + MEMCG_PROT_LOW, + MEMCG_PROT_MIN, +}; + struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; int priority; @@ -289,7 +295,8 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); +enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, + struct mem_cgroup *memcg); int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcgp, @@ -734,10 +741,10 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg, { } -static inline bool mem_cgroup_low(struct mem_cgroup *root, - struct mem_cgroup *memcg) +static inline enum mem_cgroup_protection mem_cgroup_protected( + struct mem_cgroup *root, struct mem_cgroup *memcg) { - return false; + return MEMCG_PROT_NONE; } static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 7902a727d3b6..bab7e57f659b 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -8,10 +8,16 @@ struct page_counter { atomic_long_t usage; - unsigned long max; + unsigned long min; unsigned long low; + unsigned long max; struct page_counter *parent; + /* effective memory.min and memory.min usage tracking */ + unsigned long emin; + atomic_long_t min_usage; + atomic_long_t children_min_usage; + /* effective memory.low and memory.low usage tracking */ unsigned long elow; atomic_long_t low_usage; @@ -47,8 +53,9 @@ bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); -int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); +void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages); void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages); +int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e6de0d6a3a8d..e3d56927a724 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4275,6 +4275,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); + page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); memcg_offline_kmem(memcg); @@ -4329,6 +4330,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); + page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; @@ -5066,6 +5068,36 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; } +static int memory_min_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long min = READ_ONCE(memcg->memory.min); + + if (min == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_min_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long min; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &min); + if (err) + return err; + + page_counter_set_min(&memcg->memory, min); + + return nbytes; +} + static int memory_low_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); @@ -5300,6 +5332,12 @@ static struct cftype memory_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = memory_current_read, }, + { + .name = "min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_min_show, + .write = memory_min_write, + }, { .name = "low", .flags = CFTYPE_NOT_ON_ROOT, @@ -5349,19 +5387,24 @@ struct cgroup_subsys memory_cgrp_subsys = { }; /** - * mem_cgroup_low - check if memory consumption is in the normal range + * mem_cgroup_protected - check if memory consumption is in the normal range * @root: the top ancestor of the sub-tree being checked * @memcg: the memory cgroup to check * * WARNING: This function is not stateless! It can only be used as part * of a top-down tree iteration, not for isolated queries. * - * Returns %true if memory consumption of @memcg is in the normal range. + * Returns one of the following: + * MEMCG_PROT_NONE: cgroup memory is not protected + * MEMCG_PROT_LOW: cgroup memory is protected as long there is + * an unprotected supply of reclaimable memory from other cgroups. + * MEMCG_PROT_MIN: cgroup memory is protected * - * @root is exclusive; it is never low when looked at directly + * @root is exclusive; it is never protected when looked at directly * - * To provide a proper hierarchical behavior, effective memory.low value - * is used. + * To provide a proper hierarchical behavior, effective memory.min/low values + * are used. Below is the description of how effective memory.low is calculated. + * Effective memory.min values is calculated in the same way. * * Effective memory.low is always equal or less than the original memory.low. * If there is no memory.low overcommittment (which is always true for @@ -5406,51 +5449,78 @@ struct cgroup_subsys memory_cgrp_subsys = { * E/memory.current = 0 * * These calculations require constant tracking of the actual low usages - * (see propagate_low_usage()), as well as recursive calculation of - * effective memory.low values. But as we do call mem_cgroup_low() + * (see propagate_protected_usage()), as well as recursive calculation of + * effective memory.low values. But as we do call mem_cgroup_protected() * path for each memory cgroup top-down from the reclaim, * it's possible to optimize this part, and save calculated elow * for next usage. This part is intentionally racy, but it's ok, * as memory.low is a best-effort mechanism. */ -bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) +enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, + struct mem_cgroup *memcg) { - unsigned long usage, low_usage, siblings_low_usage; - unsigned long elow, parent_elow; struct mem_cgroup *parent; + unsigned long emin, parent_emin; + unsigned long elow, parent_elow; + unsigned long usage; if (mem_cgroup_disabled()) - return false; + return MEMCG_PROT_NONE; if (!root) root = root_mem_cgroup; if (memcg == root) - return false; + return MEMCG_PROT_NONE; - elow = memcg->memory.low; usage = page_counter_read(&memcg->memory); - parent = parent_mem_cgroup(memcg); + if (!usage) + return MEMCG_PROT_NONE; + + emin = memcg->memory.min; + elow = memcg->memory.low; + parent = parent_mem_cgroup(memcg); if (parent == root) goto exit; + parent_emin = READ_ONCE(parent->memory.emin); + emin = min(emin, parent_emin); + if (emin && parent_emin) { + unsigned long min_usage, siblings_min_usage; + + min_usage = min(usage, memcg->memory.min); + siblings_min_usage = atomic_long_read( + &parent->memory.children_min_usage); + + if (min_usage && siblings_min_usage) + emin = min(emin, parent_emin * min_usage / + siblings_min_usage); + } + parent_elow = READ_ONCE(parent->memory.elow); elow = min(elow, parent_elow); + if (elow && parent_elow) { + unsigned long low_usage, siblings_low_usage; - if (!elow || !parent_elow) - goto exit; + low_usage = min(usage, memcg->memory.low); + siblings_low_usage = atomic_long_read( + &parent->memory.children_low_usage); - low_usage = min(usage, memcg->memory.low); - siblings_low_usage = atomic_long_read( - &parent->memory.children_low_usage); - - if (!low_usage || !siblings_low_usage) - goto exit; + if (low_usage && siblings_low_usage) + elow = min(elow, parent_elow * low_usage / + siblings_low_usage); + } - elow = min(elow, parent_elow * low_usage / siblings_low_usage); exit: + memcg->memory.emin = emin; memcg->memory.elow = elow; - return usage && usage <= elow; + + if (usage <= emin) + return MEMCG_PROT_MIN; + else if (usage <= elow) + return MEMCG_PROT_LOW; + else + return MEMCG_PROT_NONE; } /** diff --git a/mm/page_counter.c b/mm/page_counter.c index a5ff4cbc355a..de31470655f6 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -13,26 +13,38 @@ #include #include -static void propagate_low_usage(struct page_counter *c, unsigned long usage) +static void propagate_protected_usage(struct page_counter *c, + unsigned long usage) { - unsigned long low_usage, old; + unsigned long protected, old_protected; long delta; if (!c->parent) return; - if (!c->low && !atomic_long_read(&c->low_usage)) - return; + if (c->min || atomic_long_read(&c->min_usage)) { + if (usage <= c->min) + protected = usage; + else + protected = 0; + + old_protected = atomic_long_xchg(&c->min_usage, protected); + delta = protected - old_protected; + if (delta) + atomic_long_add(delta, &c->parent->children_min_usage); + } - if (usage <= c->low) - low_usage = usage; - else - low_usage = 0; + if (c->low || atomic_long_read(&c->low_usage)) { + if (usage <= c->low) + protected = usage; + else + protected = 0; - old = atomic_long_xchg(&c->low_usage, low_usage); - delta = low_usage - old; - if (delta) - atomic_long_add(delta, &c->parent->children_low_usage); + old_protected = atomic_long_xchg(&c->low_usage, protected); + delta = protected - old_protected; + if (delta) + atomic_long_add(delta, &c->parent->children_low_usage); + } } /** @@ -45,7 +57,7 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) long new; new = atomic_long_sub_return(nr_pages, &counter->usage); - propagate_low_usage(counter, new); + propagate_protected_usage(counter, new); /* More uncharges than charges? */ WARN_ON_ONCE(new < 0); } @@ -65,7 +77,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) long new; new = atomic_long_add_return(nr_pages, &c->usage); - propagate_low_usage(counter, new); + propagate_protected_usage(counter, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. @@ -109,7 +121,7 @@ bool page_counter_try_charge(struct page_counter *counter, new = atomic_long_add_return(nr_pages, &c->usage); if (new > c->max) { atomic_long_sub(nr_pages, &c->usage); - propagate_low_usage(counter, new); + propagate_protected_usage(counter, new); /* * This is racy, but we can live with some * inaccuracy in the failcnt. @@ -118,7 +130,7 @@ bool page_counter_try_charge(struct page_counter *counter, *fail = c; goto failed; } - propagate_low_usage(counter, new); + propagate_protected_usage(counter, new); /* * Just like with failcnt, we can live with some * inaccuracy in the watermark. @@ -190,6 +202,23 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) } } +/** + * page_counter_set_min - set the amount of protected memory + * @counter: counter + * @nr_pages: value to set + * + * The caller must serialize invocations on the same counter. + */ +void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) +{ + struct page_counter *c; + + counter->min = nr_pages; + + for (c = counter; c; c = c->parent) + propagate_protected_usage(c, atomic_long_read(&c->usage)); +} + /** * page_counter_set_low - set the amount of protected memory * @counter: counter @@ -204,7 +233,7 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) counter->low = nr_pages; for (c = counter; c; c = c->parent) - propagate_low_usage(c, atomic_long_read(&c->usage)); + propagate_protected_usage(c, atomic_long_read(&c->usage)); } /** diff --git a/mm/vmscan.c b/mm/vmscan.c index 0e67b477ecef..03822f86f288 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2544,12 +2544,28 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long reclaimed; unsigned long scanned; - if (mem_cgroup_low(root, memcg)) { + switch (mem_cgroup_protected(root, memcg)) { + case MEMCG_PROT_MIN: + /* + * Hard protection. + * If there is no reclaimable memory, OOM. + */ + continue; + case MEMCG_PROT_LOW: + /* + * Soft protection. + * Respect the protection only as long as + * there is an unprotected supply + * of reclaimable memory from other cgroups. + */ if (!sc->memcg_low_reclaim) { sc->memcg_low_skipped = 1; continue; } memcg_memory_event(memcg, MEMCG_LOW); + break; + case MEMCG_PROT_NONE: + break; } reclaimed = sc->nr_reclaimed; -- cgit v1.2.3 From be09102b4190561b67e3809b07a7fd29c9774152 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Jun 2018 17:09:21 -0700 Subject: mm: memcg: allow lowering memory.swap.max below the current usage Currently an attempt to set swap.max into a value lower than the actual swap usage fails, which causes configuration problems as there's no way of lowering the configuration below the current usage short of turning off swap entirely. This makes swap.max difficult to use and allows delegatees to lock the delegator out of reducing swap allocation. This patch updates swap_max_write() so that the limit can be lowered below the current usage. It doesn't implement active reclaiming of swap entries for the following reasons. * mem_cgroup_swap_full() already tells the swap machinary to aggressively reclaim swap entries if the usage is above 50% of limit, so simply lowering the limit automatically triggers gradual reclaim. * Forcing back swapped out pages is likely to heavily impact the workload and mess up the working set. Given that swap usually is a lot less valuable and less scarce, letting the existing usage dissipate over time through the above gradual reclaim and as they're falted back in is likely the better behavior. Link: http://lkml.kernel.org/r/20180523185041.GR1718769@devbig577.frc2.facebook.com Signed-off-by: Tejun Heo Acked-by: Roman Gushchin Acked-by: Rik van Riel Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 5 +++++ mm/memcontrol.c | 6 +----- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index e34d3c938729..8a2c52d5c53b 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1245,6 +1245,11 @@ PAGE_SIZE multiple when read back. because of running out of swap system-wide or max limit. + When reduced under the current usage, the existing swap + entries are reclaimed gradually and the swap usage may stay + higher than the limit for an extended period of time. This + reduces the impact on the workload and memory management. + Usage Guidelines ~~~~~~~~~~~~~~~~ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e3d56927a724..c1e64d60ed02 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6280,11 +6280,7 @@ static ssize_t swap_max_write(struct kernfs_open_file *of, if (err) return err; - mutex_lock(&memcg_max_mutex); - err = page_counter_set_max(&memcg->swap, max); - mutex_unlock(&memcg_max_mutex); - if (err) - return err; + xchg(&memcg->swap.max, max); return nbytes; } -- cgit v1.2.3 From 9005d833385508d647680bfac90f9b2cf2ac5838 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 7 Jun 2018 17:11:35 -0700 Subject: autofs: rename autofs documentation files There are two files in Documentation/filsystems that should now use autofs rather than autofs4 in their names. Link: http://lkml.kernel.org/r/152626707957.28589.3325300375892913999.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/autofs-mount-control.txt | 406 ++++++++++++++++ Documentation/filesystems/autofs.txt | 529 +++++++++++++++++++++ .../filesystems/autofs4-mount-control.txt | 407 ---------------- Documentation/filesystems/autofs4.txt | 529 --------------------- 4 files changed, 935 insertions(+), 936 deletions(-) create mode 100644 Documentation/filesystems/autofs-mount-control.txt create mode 100644 Documentation/filesystems/autofs.txt delete mode 100644 Documentation/filesystems/autofs4-mount-control.txt delete mode 100644 Documentation/filesystems/autofs4.txt (limited to 'Documentation') diff --git a/Documentation/filesystems/autofs-mount-control.txt b/Documentation/filesystems/autofs-mount-control.txt new file mode 100644 index 000000000000..52c1b6cdfc91 --- /dev/null +++ b/Documentation/filesystems/autofs-mount-control.txt @@ -0,0 +1,406 @@ + +Miscellaneous Device control operations for the autofs4 kernel module +==================================================================== + +The problem +=========== + +There is a problem with active restarts in autofs (that is to say +restarting autofs when there are busy mounts). + +During normal operation autofs uses a file descriptor opened on the +directory that is being managed in order to be able to issue control +operations. Using a file descriptor gives ioctl operations access to +autofs specific information stored in the super block. The operations +are things such as setting an autofs mount catatonic, setting the +expire timeout and requesting expire checks. As is explained below, +certain types of autofs triggered mounts can end up covering an autofs +mount itself which prevents us being able to use open(2) to obtain a +file descriptor for these operations if we don't already have one open. + +Currently autofs uses "umount -l" (lazy umount) to clear active mounts +at restart. While using lazy umount works for most cases, anything that +needs to walk back up the mount tree to construct a path, such as +getcwd(2) and the proc file system /proc//cwd, no longer works +because the point from which the path is constructed has been detached +from the mount tree. + +The actual problem with autofs is that it can't reconnect to existing +mounts. Immediately one thinks of just adding the ability to remount +autofs file systems would solve it, but alas, that can't work. This is +because autofs direct mounts and the implementation of "on demand mount +and expire" of nested mount trees have the file system mounted directly +on top of the mount trigger directory dentry. + +For example, there are two types of automount maps, direct (in the kernel +module source you will see a third type called an offset, which is just +a direct mount in disguise) and indirect. + +Here is a master map with direct and indirect map entries: + +/- /etc/auto.direct +/test /etc/auto.indirect + +and the corresponding map files: + +/etc/auto.direct: + +/automount/dparse/g6 budgie:/autofs/export1 +/automount/dparse/g1 shark:/autofs/export1 +and so on. + +/etc/auto.indirect: + +g1 shark:/autofs/export1 +g6 budgie:/autofs/export1 +and so on. + +For the above indirect map an autofs file system is mounted on /test and +mounts are triggered for each sub-directory key by the inode lookup +operation. So we see a mount of shark:/autofs/export1 on /test/g1, for +example. + +The way that direct mounts are handled is by making an autofs mount on +each full path, such as /automount/dparse/g1, and using it as a mount +trigger. So when we walk on the path we mount shark:/autofs/export1 "on +top of this mount point". Since these are always directories we can +use the follow_link inode operation to trigger the mount. + +But, each entry in direct and indirect maps can have offsets (making +them multi-mount map entries). + +For example, an indirect mount map entry could also be: + +g1 \ + / shark:/autofs/export5/testing/test \ + /s1 shark:/autofs/export/testing/test/s1 \ + /s2 shark:/autofs/export5/testing/test/s2 \ + /s1/ss1 shark:/autofs/export1 \ + /s2/ss2 shark:/autofs/export2 + +and a similarly a direct mount map entry could also be: + +/automount/dparse/g1 \ + / shark:/autofs/export5/testing/test \ + /s1 shark:/autofs/export/testing/test/s1 \ + /s2 shark:/autofs/export5/testing/test/s2 \ + /s1/ss1 shark:/autofs/export2 \ + /s2/ss2 shark:/autofs/export2 + +One of the issues with version 4 of autofs was that, when mounting an +entry with a large number of offsets, possibly with nesting, we needed +to mount and umount all of the offsets as a single unit. Not really a +problem, except for people with a large number of offsets in map entries. +This mechanism is used for the well known "hosts" map and we have seen +cases (in 2.4) where the available number of mounts are exhausted or +where the number of privileged ports available is exhausted. + +In version 5 we mount only as we go down the tree of offsets and +similarly for expiring them which resolves the above problem. There is +somewhat more detail to the implementation but it isn't needed for the +sake of the problem explanation. The one important detail is that these +offsets are implemented using the same mechanism as the direct mounts +above and so the mount points can be covered by a mount. + +The current autofs implementation uses an ioctl file descriptor opened +on the mount point for control operations. The references held by the +descriptor are accounted for in checks made to determine if a mount is +in use and is also used to access autofs file system information held +in the mount super block. So the use of a file handle needs to be +retained. + + +The Solution +============ + +To be able to restart autofs leaving existing direct, indirect and +offset mounts in place we need to be able to obtain a file handle +for these potentially covered autofs mount points. Rather than just +implement an isolated operation it was decided to re-implement the +existing ioctl interface and add new operations to provide this +functionality. + +In addition, to be able to reconstruct a mount tree that has busy mounts, +the uid and gid of the last user that triggered the mount needs to be +available because these can be used as macro substitution variables in +autofs maps. They are recorded at mount request time and an operation +has been added to retrieve them. + +Since we're re-implementing the control interface, a couple of other +problems with the existing interface have been addressed. First, when +a mount or expire operation completes a status is returned to the +kernel by either a "send ready" or a "send fail" operation. The +"send fail" operation of the ioctl interface could only ever send +ENOENT so the re-implementation allows user space to send an actual +status. Another expensive operation in user space, for those using +very large maps, is discovering if a mount is present. Usually this +involves scanning /proc/mounts and since it needs to be done quite +often it can introduce significant overhead when there are many entries +in the mount table. An operation to lookup the mount status of a mount +point dentry (covered or not) has also been added. + +Current kernel development policy recommends avoiding the use of the +ioctl mechanism in favor of systems such as Netlink. An implementation +using this system was attempted to evaluate its suitability and it was +found to be inadequate, in this case. The Generic Netlink system was +used for this as raw Netlink would lead to a significant increase in +complexity. There's no question that the Generic Netlink system is an +elegant solution for common case ioctl functions but it's not a complete +replacement probably because its primary purpose in life is to be a +message bus implementation rather than specifically an ioctl replacement. +While it would be possible to work around this there is one concern +that lead to the decision to not use it. This is that the autofs +expire in the daemon has become far to complex because umount +candidates are enumerated, almost for no other reason than to "count" +the number of times to call the expire ioctl. This involves scanning +the mount table which has proved to be a big overhead for users with +large maps. The best way to improve this is try and get back to the +way the expire was done long ago. That is, when an expire request is +issued for a mount (file handle) we should continually call back to +the daemon until we can't umount any more mounts, then return the +appropriate status to the daemon. At the moment we just expire one +mount at a time. A Generic Netlink implementation would exclude this +possibility for future development due to the requirements of the +message bus architecture. + + +autofs4 Miscellaneous Device mount control interface +==================================================== + +The control interface is opening a device node, typically /dev/autofs. + +All the ioctls use a common structure to pass the needed parameter +information and return operation results: + +struct autofs_dev_ioctl { + __u32 ver_major; + __u32 ver_minor; + __u32 size; /* total size of data passed in + * including this struct */ + __s32 ioctlfd; /* automount command fd */ + + /* Command parameters */ + union { + struct args_protover protover; + struct args_protosubver protosubver; + struct args_openmount openmount; + struct args_ready ready; + struct args_fail fail; + struct args_setpipefd setpipefd; + struct args_timeout timeout; + struct args_requester requester; + struct args_expire expire; + struct args_askumount askumount; + struct args_ismountpoint ismountpoint; + }; + + char path[0]; +}; + +The ioctlfd field is a mount point file descriptor of an autofs mount +point. It is returned by the open call and is used by all calls except +the check for whether a given path is a mount point, where it may +optionally be used to check a specific mount corresponding to a given +mount point file descriptor, and when requesting the uid and gid of the +last successful mount on a directory within the autofs file system. + +The union is used to communicate parameters and results of calls made +as described below. + +The path field is used to pass a path where it is needed and the size field +is used account for the increased structure length when translating the +structure sent from user space. + +This structure can be initialized before setting specific fields by using +the void function call init_autofs_dev_ioctl(struct autofs_dev_ioctl *). + +All of the ioctls perform a copy of this structure from user space to +kernel space and return -EINVAL if the size parameter is smaller than +the structure size itself, -ENOMEM if the kernel memory allocation fails +or -EFAULT if the copy itself fails. Other checks include a version check +of the compiled in user space version against the module version and a +mismatch results in a -EINVAL return. If the size field is greater than +the structure size then a path is assumed to be present and is checked to +ensure it begins with a "/" and is NULL terminated, otherwise -EINVAL is +returned. Following these checks, for all ioctl commands except +AUTOFS_DEV_IOCTL_VERSION_CMD, AUTOFS_DEV_IOCTL_OPENMOUNT_CMD and +AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD the ioctlfd is validated and if it is +not a valid descriptor or doesn't correspond to an autofs mount point +an error of -EBADF, -ENOTTY or -EINVAL (not an autofs descriptor) is +returned. + + +The ioctls +========== + +An example of an implementation which uses this interface can be seen +in autofs version 5.0.4 and later in file lib/dev-ioctl-lib.c of the +distribution tar available for download from kernel.org in directory +/pub/linux/daemons/autofs/v5. + +The device node ioctl operations implemented by this interface are: + + +AUTOFS_DEV_IOCTL_VERSION +------------------------ + +Get the major and minor version of the autofs4 device ioctl kernel module +implementation. It requires an initialized struct autofs_dev_ioctl as an +input parameter and sets the version information in the passed in structure. +It returns 0 on success or the error -EINVAL if a version mismatch is +detected. + + +AUTOFS_DEV_IOCTL_PROTOVER_CMD and AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD +------------------------------------------------------------------ + +Get the major and minor version of the autofs4 protocol version understood +by loaded module. This call requires an initialized struct autofs_dev_ioctl +with the ioctlfd field set to a valid autofs mount point descriptor +and sets the requested version number in version field of struct args_protover +or sub_version field of struct args_protosubver. These commands return +0 on success or one of the negative error codes if validation fails. + + +AUTOFS_DEV_IOCTL_OPENMOUNT and AUTOFS_DEV_IOCTL_CLOSEMOUNT +---------------------------------------------------------- + +Obtain and release a file descriptor for an autofs managed mount point +path. The open call requires an initialized struct autofs_dev_ioctl with +the path field set and the size field adjusted appropriately as well +as the devid field of struct args_openmount set to the device number of +the autofs mount. The device number can be obtained from the mount options +shown in /proc/mounts. The close call requires an initialized struct +autofs_dev_ioct with the ioctlfd field set to the descriptor obtained +from the open call. The release of the file descriptor can also be done +with close(2) so any open descriptors will also be closed at process exit. +The close call is included in the implemented operations largely for +completeness and to provide for a consistent user space implementation. + + +AUTOFS_DEV_IOCTL_READY_CMD and AUTOFS_DEV_IOCTL_FAIL_CMD +-------------------------------------------------------- + +Return mount and expire result status from user space to the kernel. +Both of these calls require an initialized struct autofs_dev_ioctl +with the ioctlfd field set to the descriptor obtained from the open +call and the token field of struct args_ready or struct args_fail set +to the wait queue token number, received by user space in the foregoing +mount or expire request. The status field of struct args_fail is set to +the errno of the operation. It is set to 0 on success. + + +AUTOFS_DEV_IOCTL_SETPIPEFD_CMD +------------------------------ + +Set the pipe file descriptor used for kernel communication to the daemon. +Normally this is set at mount time using an option but when reconnecting +to a existing mount we need to use this to tell the autofs mount about +the new kernel pipe descriptor. In order to protect mounts against +incorrectly setting the pipe descriptor we also require that the autofs +mount be catatonic (see next call). + +The call requires an initialized struct autofs_dev_ioctl with the +ioctlfd field set to the descriptor obtained from the open call and +the pipefd field of struct args_setpipefd set to descriptor of the pipe. +On success the call also sets the process group id used to identify the +controlling process (eg. the owning automount(8) daemon) to the process +group of the caller. + + +AUTOFS_DEV_IOCTL_CATATONIC_CMD +------------------------------ + +Make the autofs mount point catatonic. The autofs mount will no longer +issue mount requests, the kernel communication pipe descriptor is released +and any remaining waits in the queue released. + +The call requires an initialized struct autofs_dev_ioctl with the +ioctlfd field set to the descriptor obtained from the open call. + + +AUTOFS_DEV_IOCTL_TIMEOUT_CMD +---------------------------- + +Set the expire timeout for mounts within an autofs mount point. + +The call requires an initialized struct autofs_dev_ioctl with the +ioctlfd field set to the descriptor obtained from the open call. + + +AUTOFS_DEV_IOCTL_REQUESTER_CMD +------------------------------ + +Return the uid and gid of the last process to successfully trigger a the +mount on the given path dentry. + +The call requires an initialized struct autofs_dev_ioctl with the path +field set to the mount point in question and the size field adjusted +appropriately. Upon return the uid field of struct args_requester contains +the uid and gid field the gid. + +When reconstructing an autofs mount tree with active mounts we need to +re-connect to mounts that may have used the original process uid and +gid (or string variations of them) for mount lookups within the map entry. +This call provides the ability to obtain this uid and gid so they may be +used by user space for the mount map lookups. + + +AUTOFS_DEV_IOCTL_EXPIRE_CMD +--------------------------- + +Issue an expire request to the kernel for an autofs mount. Typically +this ioctl is called until no further expire candidates are found. + +The call requires an initialized struct autofs_dev_ioctl with the +ioctlfd field set to the descriptor obtained from the open call. In +addition an immediate expire, independent of the mount timeout, can be +requested by setting the how field of struct args_expire to 1. If no +expire candidates can be found the ioctl returns -1 with errno set to +EAGAIN. + +This call causes the kernel module to check the mount corresponding +to the given ioctlfd for mounts that can be expired, issues an expire +request back to the daemon and waits for completion. + +AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD +------------------------------ + +Checks if an autofs mount point is in use. + +The call requires an initialized struct autofs_dev_ioctl with the +ioctlfd field set to the descriptor obtained from the open call and +it returns the result in the may_umount field of struct args_askumount, +1 for busy and 0 otherwise. + + +AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD +--------------------------------- + +Check if the given path is a mountpoint. + +The call requires an initialized struct autofs_dev_ioctl. There are two +possible variations. Both use the path field set to the path of the mount +point to check and the size field adjusted appropriately. One uses the +ioctlfd field to identify a specific mount point to check while the other +variation uses the path and optionally in.type field of struct args_ismountpoint +set to an autofs mount type. The call returns 1 if this is a mount point +and sets out.devid field to the device number of the mount and out.magic +field to the relevant super block magic number (described below) or 0 if +it isn't a mountpoint. In both cases the the device number (as returned +by new_encode_dev()) is returned in out.devid field. + +If supplied with a file descriptor we're looking for a specific mount, +not necessarily at the top of the mounted stack. In this case the path +the descriptor corresponds to is considered a mountpoint if it is itself +a mountpoint or contains a mount, such as a multi-mount without a root +mount. In this case we return 1 if the descriptor corresponds to a mount +point and and also returns the super magic of the covering mount if there +is one or 0 if it isn't a mountpoint. + +If a path is supplied (and the ioctlfd field is set to -1) then the path +is looked up and is checked to see if it is the root of a mount. If a +type is also given we are looking for a particular autofs mount and if +a match isn't found a fail is returned. If the the located path is the +root of a mount 1 is returned along with the super magic of the mount +or 0 otherwise. diff --git a/Documentation/filesystems/autofs.txt b/Documentation/filesystems/autofs.txt new file mode 100644 index 000000000000..f10dd590f69f --- /dev/null +++ b/Documentation/filesystems/autofs.txt @@ -0,0 +1,529 @@ + + + + +autofs - how it works +===================== + +Purpose +------- + +The goal of autofs is to provide on-demand mounting and race free +automatic unmounting of various other filesystems. This provides two +key advantages: + +1. There is no need to delay boot until all filesystems that + might be needed are mounted. Processes that try to access those + slow filesystems might be delayed but other processes can + continue freely. This is particularly important for + network filesystems (e.g. NFS) or filesystems stored on + media with a media-changing robot. + +2. The names and locations of filesystems can be stored in + a remote database and can change at any time. The content + in that data base at the time of access will be used to provide + a target for the access. The interpretation of names in the + filesystem can even be programmatic rather than database-backed, + allowing wildcards for example, and can vary based on the user who + first accessed a name. + +Context +------- + +The "autofs4" filesystem module is only one part of an autofs system. +There also needs to be a user-space program which looks up names +and mounts filesystems. This will often be the "automount" program, +though other tools including "systemd" can make use of "autofs4". +This document describes only the kernel module and the interactions +required with any user-space program. Subsequent text refers to this +as the "automount daemon" or simply "the daemon". + +"autofs4" is a Linux kernel module with provides the "autofs" +filesystem type. Several "autofs" filesystems can be mounted and they +can each be managed separately, or all managed by the same daemon. + +Content +------- + +An autofs filesystem can contain 3 sorts of objects: directories, +symbolic links and mount traps. Mount traps are directories with +extra properties as described in the next section. + +Objects can only be created by the automount daemon: symlinks are +created with a regular `symlink` system call, while directories and +mount traps are created with `mkdir`. The determination of whether a +directory should be a mount trap or not is quite _ad hoc_, largely for +historical reasons, and is determined in part by the +*direct*/*indirect*/*offset* mount options, and the *maxproto* mount option. + +If neither the *direct* or *offset* mount options are given (so the +mount is considered to be *indirect*), then the root directory is +always a regular directory, otherwise it is a mount trap when it is +empty and a regular directory when not empty. Note that *direct* and +*offset* are treated identically so a concise summary is that the root +directory is a mount trap only if the filesystem is mounted *direct* +and the root is empty. + +Directories created in the root directory are mount traps only if the +filesystem is mounted *indirect* and they are empty. + +Directories further down the tree depend on the *maxproto* mount +option and particularly whether it is less than five or not. +When *maxproto* is five, no directories further down the +tree are ever mount traps, they are always regular directories. When +the *maxproto* is four (or three), these directories are mount traps +precisely when they are empty. + +So: non-empty (i.e. non-leaf) directories are never mount traps. Empty +directories are sometimes mount traps, and sometimes not depending on +where in the tree they are (root, top level, or lower), the *maxproto*, +and whether the mount was *indirect* or not. + +Mount Traps +--------------- + +A core element of the implementation of autofs is the Mount Traps +which are provided by the Linux VFS. Any directory provided by a +filesystem can be designated as a trap. This involves two separate +features that work together to allow autofs to do its job. + +**DCACHE_NEED_AUTOMOUNT** + +If a dentry has the DCACHE_NEED_AUTOMOUNT flag set (which gets set if +the inode has S_AUTOMOUNT set, or can be set directly) then it is +(potentially) a mount trap. Any access to this directory beyond a +"`stat`" will (normally) cause the `d_op->d_automount()` dentry operation +to be called. The task of this method is to find the filesystem that +should be mounted on the directory and to return it. The VFS is +responsible for actually mounting the root of this filesystem on the +directory. + +autofs doesn't find the filesystem itself but sends a message to the +automount daemon asking it to find and mount the filesystem. The +autofs `d_automount` method then waits for the daemon to report that +everything is ready. It will then return "`NULL`" indicating that the +mount has already happened. The VFS doesn't try to mount anything but +follows down the mount that is already there. + +This functionality is sufficient for some users of mount traps such +as NFS which creates traps so that mountpoints on the server can be +reflected on the client. However it is not sufficient for autofs. As +mounting onto a directory is considered to be "beyond a `stat`", the +automount daemon would not be able to mount a filesystem on the 'trap' +directory without some way to avoid getting caught in the trap. For +that purpose there is another flag. + +**DCACHE_MANAGE_TRANSIT** + +If a dentry has DCACHE_MANAGE_TRANSIT set then two very different but +related behaviors are invoked, both using the `d_op->d_manage()` +dentry operation. + +Firstly, before checking to see if any filesystem is mounted on the +directory, d_manage() will be called with the `rcu_walk` parameter set +to `false`. It may return one of three things: + +- A return value of zero indicates that there is nothing special + about this dentry and normal checks for mounts and automounts + should proceed. + + autofs normally returns zero, but first waits for any + expiry (automatic unmounting of the mounted filesystem) to + complete. This avoids races. + +- A return value of `-EISDIR` tells the VFS to ignore any mounts + on the directory and to not consider calling `->d_automount()`. + This effectively disables the **DCACHE_NEED_AUTOMOUNT** flag + causing the directory not be a mount trap after all. + + autofs returns this if it detects that the process performing the + lookup is the automount daemon and that the mount has been + requested but has not yet completed. How it determines this is + discussed later. This allows the automount daemon not to get + caught in the mount trap. + + There is a subtlety here. It is possible that a second autofs + filesystem can be mounted below the first and for both of them to + be managed by the same daemon. For the daemon to be able to mount + something on the second it must be able to "walk" down past the + first. This means that d_manage cannot *always* return -EISDIR for + the automount daemon. It must only return it when a mount has + been requested, but has not yet completed. + + `d_manage` also returns `-EISDIR` if the dentry shouldn't be a + mount trap, either because it is a symbolic link or because it is + not empty. + +- Any other negative value is treated as an error and returned + to the caller. + + autofs can return + + - -ENOENT if the automount daemon failed to mount anything, + - -ENOMEM if it ran out of memory, + - -EINTR if a signal arrived while waiting for expiry to + complete + - or any other error sent down by the automount daemon. + + +The second use case only occurs during an "RCU-walk" and so `rcu_walk` +will be set. + +An RCU-walk is a fast and lightweight process for walking down a +filename path (i.e. it is like running on tip-toes). RCU-walk cannot +cope with all situations so when it finds a difficulty it falls back +to "REF-walk", which is slower but more robust. + +RCU-walk will never call `->d_automount`; the filesystems must already +be mounted or RCU-walk cannot handle the path. +To determine if a mount-trap is safe for RCU-walk mode it calls +`->d_manage()` with `rcu_walk` set to `true`. + +In this case `d_manage()` must avoid blocking and should avoid taking +spinlocks if at all possible. Its sole purpose is to determine if it +would be safe to follow down into any mounted directory and the only +reason that it might not be is if an expiry of the mount is +underway. + +In the `rcu_walk` case, `d_manage()` cannot return -EISDIR to tell the +VFS that this is a directory that doesn't require d_automount. If +`rcu_walk` sees a dentry with DCACHE_NEED_AUTOMOUNT set but nothing +mounted, it *will* fall back to REF-walk. `d_manage()` cannot make the +VFS remain in RCU-walk mode, but can only tell it to get out of +RCU-walk mode by returning `-ECHILD`. + +So `d_manage()`, when called with `rcu_walk` set, should either return +-ECHILD if there is any reason to believe it is unsafe to end the +mounted filesystem, and otherwise should return 0. + +autofs will return `-ECHILD` if an expiry of the filesystem has been +initiated or is being considered, otherwise it returns 0. + + +Mountpoint expiry +----------------- + +The VFS has a mechanism for automatically expiring unused mounts, +much as it can expire any unused dentry information from the dcache. +This is guided by the MNT_SHRINKABLE flag. This only applies to +mounts that were created by `d_automount()` returning a filesystem to be +mounted. As autofs doesn't return such a filesystem but leaves the +mounting to the automount daemon, it must involve the automount daemon +in unmounting as well. This also means that autofs has more control +of expiry. + +The VFS also supports "expiry" of mounts using the MNT_EXPIRE flag to +the `umount` system call. Unmounting with MNT_EXPIRE will fail unless +a previous attempt had been made, and the filesystem has been inactive +and untouched since that previous attempt. autofs4 does not depend on +this but has its own internal tracking of whether filesystems were +recently used. This allows individual names in the autofs directory +to expire separately. + +With version 4 of the protocol, the automount daemon can try to +unmount any filesystems mounted on the autofs filesystem or remove any +symbolic links or empty directories any time it likes. If the unmount +or removal is successful the filesystem will be returned to the state +it was before the mount or creation, so that any access of the name +will trigger normal auto-mount processing. In particlar, `rmdir` and +`unlink` do not leave negative entries in the dcache as a normal +filesystem would, so an attempt to access a recently-removed object is +passed to autofs for handling. + +With version 5, this is not safe except for unmounting from top-level +directories. As lower-level directories are never mount traps, other +processes will see an empty directory as soon as the filesystem is +unmounted. So it is generally safest to use the autofs expiry +protocol described below. + +Normally the daemon only wants to remove entries which haven't been +used for a while. For this purpose autofs maintains a "`last_used`" +time stamp on each directory or symlink. For symlinks it genuinely +does record the last time the symlink was "used" or followed to find +out where it points to. For directories the field is a slight +misnomer. It actually records the last time that autofs checked if +the directory or one of its descendents was busy and found that it +was. This is just as useful and doesn't require updating the field so +often. + +The daemon is able to ask autofs if anything is due to be expired, +using an `ioctl` as discussed later. For a *direct* mount, autofs +considers if the entire mount-tree can be unmounted or not. For an +*indirect* mount, autofs considers each of the names in the top level +directory to determine if any of those can be unmounted and cleaned +up. + +There is an option with indirect mounts to consider each of the leaves +that has been mounted on instead of considering the top-level names. +This is intended for compatability with version 4 of autofs and should +be considered as deprecated. + +When autofs considers a directory it checks the `last_used` time and +compares it with the "timeout" value set when the filesystem was +mounted, though this check is ignored in some cases. It also checks if +the directory or anything below it is in use. For symbolic links, +only the `last_used` time is ever considered. + +If both appear to support expiring the directory or symlink, an action +is taken. + +There are two ways to ask autofs to consider expiry. The first is to +use the **AUTOFS_IOC_EXPIRE** ioctl. This only works for indirect +mounts. If it finds something in the root directory to expire it will +return the name of that thing. Once a name has been returned the +automount daemon needs to unmount any filesystems mounted below the +name normally. As described above, this is unsafe for non-toplevel +mounts in a version-5 autofs. For this reason the current `automountd` +does not use this ioctl. + +The second mechanism uses either the **AUTOFS_DEV_IOCTL_EXPIRE_CMD** or +the **AUTOFS_IOC_EXPIRE_MULTI** ioctl. This will work for both direct and +indirect mounts. If it selects an object to expire, it will notify +the daemon using the notification mechanism described below. This +will block until the daemon acknowledges the expiry notification. +This implies that the "`EXPIRE`" ioctl must be sent from a different +thread than the one which handles notification. + +While the ioctl is blocking, the entry is marked as "expiring" and +`d_manage` will block until the daemon affirms that the unmount has +completed (together with removing any directories that might have been +necessary), or has been aborted. + +Communicating with autofs: detecting the daemon +----------------------------------------------- + +There are several forms of communication between the automount daemon +and the filesystem. As we have already seen, the daemon can create and +remove directories and symlinks using normal filesystem operations. +autofs knows whether a process requesting some operation is the daemon +or not based on its process-group id number (see getpgid(1)). + +When an autofs filesystem is mounted the pgid of the mounting +processes is recorded unless the "pgrp=" option is given, in which +case that number is recorded instead. Any request arriving from a +process in that process group is considered to come from the daemon. +If the daemon ever has to be stopped and restarted a new pgid can be +provided through an ioctl as will be described below. + +Communicating with autofs: the event pipe +----------------------------------------- + +When an autofs filesystem is mounted, the 'write' end of a pipe must +be passed using the 'fd=' mount option. autofs will write +notification messages to this pipe for the daemon to respond to. +For version 5, the format of the message is: + + struct autofs_v5_packet { + int proto_version; /* Protocol version */ + int type; /* Type of packet */ + autofs_wqt_t wait_queue_token; + __u32 dev; + __u64 ino; + __u32 uid; + __u32 gid; + __u32 pid; + __u32 tgid; + __u32 len; + char name[NAME_MAX+1]; + }; + +where the type is one of + + autofs_ptype_missing_indirect + autofs_ptype_expire_indirect + autofs_ptype_missing_direct + autofs_ptype_expire_direct + +so messages can indicate that a name is missing (something tried to +access it but it isn't there) or that it has been selected for expiry. + +The pipe will be set to "packet mode" (equivalent to passing +`O_DIRECT`) to _pipe2(2)_ so that a read from the pipe will return at +most one packet, and any unread portion of a packet will be discarded. + +The `wait_queue_token` is a unique number which can identify a +particular request to be acknowledged. When a message is sent over +the pipe the affected dentry is marked as either "active" or +"expiring" and other accesses to it block until the message is +acknowledged using one of the ioctls below and the relevant +`wait_queue_token`. + +Communicating with autofs: root directory ioctls +------------------------------------------------ + +The root directory of an autofs filesystem will respond to a number of +ioctls. The process issuing the ioctl must have the CAP_SYS_ADMIN +capability, or must be the automount daemon. + +The available ioctl commands are: + +- **AUTOFS_IOC_READY**: a notification has been handled. The argument + to the ioctl command is the "wait_queue_token" number + corresponding to the notification being acknowledged. +- **AUTOFS_IOC_FAIL**: similar to above, but indicates failure with + the error code `ENOENT`. +- **AUTOFS_IOC_CATATONIC**: Causes the autofs to enter "catatonic" + mode meaning that it stops sending notifications to the daemon. + This mode is also entered if a write to the pipe fails. +- **AUTOFS_IOC_PROTOVER**: This returns the protocol version in use. +- **AUTOFS_IOC_PROTOSUBVER**: Returns the protocol sub-version which + is really a version number for the implementation. It is + currently 2. +- **AUTOFS_IOC_SETTIMEOUT**: This passes a pointer to an unsigned + long. The value is used to set the timeout for expiry, and + the current timeout value is stored back through the pointer. +- **AUTOFS_IOC_ASKUMOUNT**: Returns, in the pointed-to `int`, 1 if + the filesystem could be unmounted. This is only a hint as + the situation could change at any instant. This call can be + use to avoid a more expensive full unmount attempt. +- **AUTOFS_IOC_EXPIRE**: as described above, this asks if there is + anything suitable to expire. A pointer to a packet: + + struct autofs_packet_expire_multi { + int proto_version; /* Protocol version */ + int type; /* Type of packet */ + autofs_wqt_t wait_queue_token; + int len; + char name[NAME_MAX+1]; + }; + + is required. This is filled in with the name of something + that can be unmounted or removed. If nothing can be expired, + `errno` is set to `EAGAIN`. Even though a `wait_queue_token` + is present in the structure, no "wait queue" is established + and no acknowledgment is needed. +- **AUTOFS_IOC_EXPIRE_MULTI**: This is similar to + **AUTOFS_IOC_EXPIRE** except that it causes notification to be + sent to the daemon, and it blocks until the daemon acknowledges. + The argument is an integer which can contain two different flags. + + **AUTOFS_EXP_IMMEDIATE** causes `last_used` time to be ignored + and objects are expired if the are not in use. + + **AUTOFS_EXP_LEAVES** will select a leaf rather than a top-level + name to expire. This is only safe when *maxproto* is 4. + +Communicating with autofs: char-device ioctls +--------------------------------------------- + +It is not always possible to open the root of an autofs filesystem, +particularly a *direct* mounted filesystem. If the automount daemon +is restarted there is no way for it to regain control of existing +mounts using any of the above communication channels. To address this +need there is a "miscellaneous" character device (major 10, minor 235) +which can be used to communicate directly with the autofs filesystem. +It requires CAP_SYS_ADMIN for access. + +The `ioctl`s that can be used on this device are described in a separate +document `autofs4-mount-control.txt`, and are summarized briefly here. +Each ioctl is passed a pointer to an `autofs_dev_ioctl` structure: + + struct autofs_dev_ioctl { + __u32 ver_major; + __u32 ver_minor; + __u32 size; /* total size of data passed in + * including this struct */ + __s32 ioctlfd; /* automount command fd */ + + /* Command parameters */ + union { + struct args_protover protover; + struct args_protosubver protosubver; + struct args_openmount openmount; + struct args_ready ready; + struct args_fail fail; + struct args_setpipefd setpipefd; + struct args_timeout timeout; + struct args_requester requester; + struct args_expire expire; + struct args_askumount askumount; + struct args_ismountpoint ismountpoint; + }; + + char path[0]; + }; + +For the **OPEN_MOUNT** and **IS_MOUNTPOINT** commands, the target +filesystem is identified by the `path`. All other commands identify +the filesystem by the `ioctlfd` which is a file descriptor open on the +root, and which can be returned by **OPEN_MOUNT**. + +The `ver_major` and `ver_minor` are in/out parameters which check that +the requested version is supported, and report the maximum version +that the kernel module can support. + +Commands are: + +- **AUTOFS_DEV_IOCTL_VERSION_CMD**: does nothing, except validate and + set version numbers. +- **AUTOFS_DEV_IOCTL_OPENMOUNT_CMD**: return an open file descriptor + on the root of an autofs filesystem. The filesystem is identified + by name and device number, which is stored in `openmount.devid`. + Device numbers for existing filesystems can be found in + `/proc/self/mountinfo`. +- **AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD**: same as `close(ioctlfd)`. +- **AUTOFS_DEV_IOCTL_SETPIPEFD_CMD**: if the filesystem is in + catatonic mode, this can provide the write end of a new pipe + in `setpipefd.pipefd` to re-establish communication with a daemon. + The process group of the calling process is used to identify the + daemon. +- **AUTOFS_DEV_IOCTL_REQUESTER_CMD**: `path` should be a + name within the filesystem that has been auto-mounted on. + On successful return, `requester.uid` and `requester.gid` will be + the UID and GID of the process which triggered that mount. +- **AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD**: Check if path is a + mountpoint of a particular type - see separate documentation for + details. +- **AUTOFS_DEV_IOCTL_PROTOVER_CMD**: +- **AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD**: +- **AUTOFS_DEV_IOCTL_READY_CMD**: +- **AUTOFS_DEV_IOCTL_FAIL_CMD**: +- **AUTOFS_DEV_IOCTL_CATATONIC_CMD**: +- **AUTOFS_DEV_IOCTL_TIMEOUT_CMD**: +- **AUTOFS_DEV_IOCTL_EXPIRE_CMD**: +- **AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD**: These all have the same + function as the similarly named **AUTOFS_IOC** ioctls, except + that **FAIL** can be given an explicit error number in `fail.status` + instead of assuming `ENOENT`, and this **EXPIRE** command + corresponds to **AUTOFS_IOC_EXPIRE_MULTI**. + +Catatonic mode +-------------- + +As mentioned, an autofs mount can enter "catatonic" mode. This +happens if a write to the notification pipe fails, or if it is +explicitly requested by an `ioctl`. + +When entering catatonic mode, the pipe is closed and any pending +notifications are acknowledged with the error `ENOENT`. + +Once in catatonic mode attempts to access non-existing names will +result in `ENOENT` while attempts to access existing directories will +be treated in the same way as if they came from the daemon, so mount +traps will not fire. + +When the filesystem is mounted a _uid_ and _gid_ can be given which +set the ownership of directories and symbolic links. When the +filesystem is in catatonic mode, any process with a matching UID can +create directories or symlinks in the root directory, but not in other +directories. + +Catatonic mode can only be left via the +**AUTOFS_DEV_IOCTL_OPENMOUNT_CMD** ioctl on the `/dev/autofs`. + +autofs, name spaces, and shared mounts +-------------------------------------- + +With bind mounts and name spaces it is possible for an autofs +filesystem to appear at multiple places in one or more filesystem +name spaces. For this to work sensibly, the autofs filesystem should +always be mounted "shared". e.g. + +> `mount --make-shared /autofs/mount/point` + +The automount daemon is only able to manage a single mount location for +an autofs filesystem and if mounts on that are not 'shared', other +locations will not behave as expected. In particular access to those +other locations will likely result in the `ELOOP` error + +> Too many levels of symbolic links diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt deleted file mode 100644 index e5177cb31a04..000000000000 --- a/Documentation/filesystems/autofs4-mount-control.txt +++ /dev/null @@ -1,407 +0,0 @@ - -Miscellaneous Device control operations for the autofs4 kernel module -==================================================================== - -The problem -=========== - -There is a problem with active restarts in autofs (that is to say -restarting autofs when there are busy mounts). - -During normal operation autofs uses a file descriptor opened on the -directory that is being managed in order to be able to issue control -operations. Using a file descriptor gives ioctl operations access to -autofs specific information stored in the super block. The operations -are things such as setting an autofs mount catatonic, setting the -expire timeout and requesting expire checks. As is explained below, -certain types of autofs triggered mounts can end up covering an autofs -mount itself which prevents us being able to use open(2) to obtain a -file descriptor for these operations if we don't already have one open. - -Currently autofs uses "umount -l" (lazy umount) to clear active mounts -at restart. While using lazy umount works for most cases, anything that -needs to walk back up the mount tree to construct a path, such as -getcwd(2) and the proc file system /proc//cwd, no longer works -because the point from which the path is constructed has been detached -from the mount tree. - -The actual problem with autofs is that it can't reconnect to existing -mounts. Immediately one thinks of just adding the ability to remount -autofs file systems would solve it, but alas, that can't work. This is -because autofs direct mounts and the implementation of "on demand mount -and expire" of nested mount trees have the file system mounted directly -on top of the mount trigger directory dentry. - -For example, there are two types of automount maps, direct (in the kernel -module source you will see a third type called an offset, which is just -a direct mount in disguise) and indirect. - -Here is a master map with direct and indirect map entries: - -/- /etc/auto.direct -/test /etc/auto.indirect - -and the corresponding map files: - -/etc/auto.direct: - -/automount/dparse/g6 budgie:/autofs/export1 -/automount/dparse/g1 shark:/autofs/export1 -and so on. - -/etc/auto.indirect: - -g1 shark:/autofs/export1 -g6 budgie:/autofs/export1 -and so on. - -For the above indirect map an autofs file system is mounted on /test and -mounts are triggered for each sub-directory key by the inode lookup -operation. So we see a mount of shark:/autofs/export1 on /test/g1, for -example. - -The way that direct mounts are handled is by making an autofs mount on -each full path, such as /automount/dparse/g1, and using it as a mount -trigger. So when we walk on the path we mount shark:/autofs/export1 "on -top of this mount point". Since these are always directories we can -use the follow_link inode operation to trigger the mount. - -But, each entry in direct and indirect maps can have offsets (making -them multi-mount map entries). - -For example, an indirect mount map entry could also be: - -g1 \ - / shark:/autofs/export5/testing/test \ - /s1 shark:/autofs/export/testing/test/s1 \ - /s2 shark:/autofs/export5/testing/test/s2 \ - /s1/ss1 shark:/autofs/export1 \ - /s2/ss2 shark:/autofs/export2 - -and a similarly a direct mount map entry could also be: - -/automount/dparse/g1 \ - / shark:/autofs/export5/testing/test \ - /s1 shark:/autofs/export/testing/test/s1 \ - /s2 shark:/autofs/export5/testing/test/s2 \ - /s1/ss1 shark:/autofs/export2 \ - /s2/ss2 shark:/autofs/export2 - -One of the issues with version 4 of autofs was that, when mounting an -entry with a large number of offsets, possibly with nesting, we needed -to mount and umount all of the offsets as a single unit. Not really a -problem, except for people with a large number of offsets in map entries. -This mechanism is used for the well known "hosts" map and we have seen -cases (in 2.4) where the available number of mounts are exhausted or -where the number of privileged ports available is exhausted. - -In version 5 we mount only as we go down the tree of offsets and -similarly for expiring them which resolves the above problem. There is -somewhat more detail to the implementation but it isn't needed for the -sake of the problem explanation. The one important detail is that these -offsets are implemented using the same mechanism as the direct mounts -above and so the mount points can be covered by a mount. - -The current autofs implementation uses an ioctl file descriptor opened -on the mount point for control operations. The references held by the -descriptor are accounted for in checks made to determine if a mount is -in use and is also used to access autofs file system information held -in the mount super block. So the use of a file handle needs to be -retained. - - -The Solution -============ - -To be able to restart autofs leaving existing direct, indirect and -offset mounts in place we need to be able to obtain a file handle -for these potentially covered autofs mount points. Rather than just -implement an isolated operation it was decided to re-implement the -existing ioctl interface and add new operations to provide this -functionality. - -In addition, to be able to reconstruct a mount tree that has busy mounts, -the uid and gid of the last user that triggered the mount needs to be -available because these can be used as macro substitution variables in -autofs maps. They are recorded at mount request time and an operation -has been added to retrieve them. - -Since we're re-implementing the control interface, a couple of other -problems with the existing interface have been addressed. First, when -a mount or expire operation completes a status is returned to the -kernel by either a "send ready" or a "send fail" operation. The -"send fail" operation of the ioctl interface could only ever send -ENOENT so the re-implementation allows user space to send an actual -status. Another expensive operation in user space, for those using -very large maps, is discovering if a mount is present. Usually this -involves scanning /proc/mounts and since it needs to be done quite -often it can introduce significant overhead when there are many entries -in the mount table. An operation to lookup the mount status of a mount -point dentry (covered or not) has also been added. - -Current kernel development policy recommends avoiding the use of the -ioctl mechanism in favor of systems such as Netlink. An implementation -using this system was attempted to evaluate its suitability and it was -found to be inadequate, in this case. The Generic Netlink system was -used for this as raw Netlink would lead to a significant increase in -complexity. There's no question that the Generic Netlink system is an -elegant solution for common case ioctl functions but it's not a complete -replacement probably because its primary purpose in life is to be a -message bus implementation rather than specifically an ioctl replacement. -While it would be possible to work around this there is one concern -that lead to the decision to not use it. This is that the autofs -expire in the daemon has become far to complex because umount -candidates are enumerated, almost for no other reason than to "count" -the number of times to call the expire ioctl. This involves scanning -the mount table which has proved to be a big overhead for users with -large maps. The best way to improve this is try and get back to the -way the expire was done long ago. That is, when an expire request is -issued for a mount (file handle) we should continually call back to -the daemon until we can't umount any more mounts, then return the -appropriate status to the daemon. At the moment we just expire one -mount at a time. A Generic Netlink implementation would exclude this -possibility for future development due to the requirements of the -message bus architecture. - - -autofs4 Miscellaneous Device mount control interface -==================================================== - -The control interface is opening a device node, typically /dev/autofs. - -All the ioctls use a common structure to pass the needed parameter -information and return operation results: - -struct autofs_dev_ioctl { - __u32 ver_major; - __u32 ver_minor; - __u32 size; /* total size of data passed in - * including this struct */ - __s32 ioctlfd; /* automount command fd */ - - /* Command parameters */ - union { - struct args_protover protover; - struct args_protosubver protosubver; - struct args_openmount openmount; - struct args_ready ready; - struct args_fail fail; - struct args_setpipefd setpipefd; - struct args_timeout timeout; - struct args_requester requester; - struct args_expire expire; - struct args_askumount askumount; - struct args_ismountpoint ismountpoint; - }; - - char path[0]; -}; - -The ioctlfd field is a mount point file descriptor of an autofs mount -point. It is returned by the open call and is used by all calls except -the check for whether a given path is a mount point, where it may -optionally be used to check a specific mount corresponding to a given -mount point file descriptor, and when requesting the uid and gid of the -last successful mount on a directory within the autofs file system. - -The union is used to communicate parameters and results of calls made -as described below. - -The path field is used to pass a path where it is needed and the size field -is used account for the increased structure length when translating the -structure sent from user space. - -This structure can be initialized before setting specific fields by using -the void function call init_autofs_dev_ioctl(struct autofs_dev_ioctl *). - -All of the ioctls perform a copy of this structure from user space to -kernel space and return -EINVAL if the size parameter is smaller than -the structure size itself, -ENOMEM if the kernel memory allocation fails -or -EFAULT if the copy itself fails. Other checks include a version check -of the compiled in user space version against the module version and a -mismatch results in a -EINVAL return. If the size field is greater than -the structure size then a path is assumed to be present and is checked to -ensure it begins with a "/" and is NULL terminated, otherwise -EINVAL is -returned. Following these checks, for all ioctl commands except -AUTOFS_DEV_IOCTL_VERSION_CMD, AUTOFS_DEV_IOCTL_OPENMOUNT_CMD and -AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD the ioctlfd is validated and if it is -not a valid descriptor or doesn't correspond to an autofs mount point -an error of -EBADF, -ENOTTY or -EINVAL (not an autofs descriptor) is -returned. - - -The ioctls -========== - -An example of an implementation which uses this interface can be seen -in autofs version 5.0.4 and later in file lib/dev-ioctl-lib.c of the -distribution tar available for download from kernel.org in directory -/pub/linux/daemons/autofs/v5. - -The device node ioctl operations implemented by this interface are: - - -AUTOFS_DEV_IOCTL_VERSION ------------------------- - -Get the major and minor version of the autofs4 device ioctl kernel module -implementation. It requires an initialized struct autofs_dev_ioctl as an -input parameter and sets the version information in the passed in structure. -It returns 0 on success or the error -EINVAL if a version mismatch is -detected. - - -AUTOFS_DEV_IOCTL_PROTOVER_CMD and AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD ------------------------------------------------------------------- - -Get the major and minor version of the autofs4 protocol version understood -by loaded module. This call requires an initialized struct autofs_dev_ioctl -with the ioctlfd field set to a valid autofs mount point descriptor -and sets the requested version number in version field of struct args_protover -or sub_version field of struct args_protosubver. These commands return -0 on success or one of the negative error codes if validation fails. - - -AUTOFS_DEV_IOCTL_OPENMOUNT and AUTOFS_DEV_IOCTL_CLOSEMOUNT ----------------------------------------------------------- - -Obtain and release a file descriptor for an autofs managed mount point -path. The open call requires an initialized struct autofs_dev_ioctl with -the path field set and the size field adjusted appropriately as well -as the devid field of struct args_openmount set to the device number of -the autofs mount. The device number can be obtained from the mount options -shown in /proc/mounts. The close call requires an initialized struct -autofs_dev_ioct with the ioctlfd field set to the descriptor obtained -from the open call. The release of the file descriptor can also be done -with close(2) so any open descriptors will also be closed at process exit. -The close call is included in the implemented operations largely for -completeness and to provide for a consistent user space implementation. - - -AUTOFS_DEV_IOCTL_READY_CMD and AUTOFS_DEV_IOCTL_FAIL_CMD --------------------------------------------------------- - -Return mount and expire result status from user space to the kernel. -Both of these calls require an initialized struct autofs_dev_ioctl -with the ioctlfd field set to the descriptor obtained from the open -call and the token field of struct args_ready or struct args_fail set -to the wait queue token number, received by user space in the foregoing -mount or expire request. The status field of struct args_fail is set to -the errno of the operation. It is set to 0 on success. - - -AUTOFS_DEV_IOCTL_SETPIPEFD_CMD ------------------------------- - -Set the pipe file descriptor used for kernel communication to the daemon. -Normally this is set at mount time using an option but when reconnecting -to a existing mount we need to use this to tell the autofs mount about -the new kernel pipe descriptor. In order to protect mounts against -incorrectly setting the pipe descriptor we also require that the autofs -mount be catatonic (see next call). - -The call requires an initialized struct autofs_dev_ioctl with the -ioctlfd field set to the descriptor obtained from the open call and -the pipefd field of struct args_setpipefd set to descriptor of the pipe. -On success the call also sets the process group id used to identify the -controlling process (eg. the owning automount(8) daemon) to the process -group of the caller. - - -AUTOFS_DEV_IOCTL_CATATONIC_CMD ------------------------------- - -Make the autofs mount point catatonic. The autofs mount will no longer -issue mount requests, the kernel communication pipe descriptor is released -and any remaining waits in the queue released. - -The call requires an initialized struct autofs_dev_ioctl with the -ioctlfd field set to the descriptor obtained from the open call. - - -AUTOFS_DEV_IOCTL_TIMEOUT_CMD ----------------------------- - -Set the expire timeout for mounts within an autofs mount point. - -The call requires an initialized struct autofs_dev_ioctl with the -ioctlfd field set to the descriptor obtained from the open call. - - -AUTOFS_DEV_IOCTL_REQUESTER_CMD ------------------------------- - -Return the uid and gid of the last process to successfully trigger a the -mount on the given path dentry. - -The call requires an initialized struct autofs_dev_ioctl with the path -field set to the mount point in question and the size field adjusted -appropriately. Upon return the uid field of struct args_requester contains -the uid and gid field the gid. - -When reconstructing an autofs mount tree with active mounts we need to -re-connect to mounts that may have used the original process uid and -gid (or string variations of them) for mount lookups within the map entry. -This call provides the ability to obtain this uid and gid so they may be -used by user space for the mount map lookups. - - -AUTOFS_DEV_IOCTL_EXPIRE_CMD ---------------------------- - -Issue an expire request to the kernel for an autofs mount. Typically -this ioctl is called until no further expire candidates are found. - -The call requires an initialized struct autofs_dev_ioctl with the -ioctlfd field set to the descriptor obtained from the open call. In -addition an immediate expire, independent of the mount timeout, can be -requested by setting the how field of struct args_expire to 1. If no -expire candidates can be found the ioctl returns -1 with errno set to -EAGAIN. - -This call causes the kernel module to check the mount corresponding -to the given ioctlfd for mounts that can be expired, issues an expire -request back to the daemon and waits for completion. - -AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD ------------------------------- - -Checks if an autofs mount point is in use. - -The call requires an initialized struct autofs_dev_ioctl with the -ioctlfd field set to the descriptor obtained from the open call and -it returns the result in the may_umount field of struct args_askumount, -1 for busy and 0 otherwise. - - -AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD ---------------------------------- - -Check if the given path is a mountpoint. - -The call requires an initialized struct autofs_dev_ioctl. There are two -possible variations. Both use the path field set to the path of the mount -point to check and the size field adjusted appropriately. One uses the -ioctlfd field to identify a specific mount point to check while the other -variation uses the path and optionally in.type field of struct args_ismountpoint -set to an autofs mount type. The call returns 1 if this is a mount point -and sets out.devid field to the device number of the mount and out.magic -field to the relevant super block magic number (described below) or 0 if -it isn't a mountpoint. In both cases the the device number (as returned -by new_encode_dev()) is returned in out.devid field. - -If supplied with a file descriptor we're looking for a specific mount, -not necessarily at the top of the mounted stack. In this case the path -the descriptor corresponds to is considered a mountpoint if it is itself -a mountpoint or contains a mount, such as a multi-mount without a root -mount. In this case we return 1 if the descriptor corresponds to a mount -point and and also returns the super magic of the covering mount if there -is one or 0 if it isn't a mountpoint. - -If a path is supplied (and the ioctlfd field is set to -1) then the path -is looked up and is checked to see if it is the root of a mount. If a -type is also given we are looking for a particular autofs mount and if -a match isn't found a fail is returned. If the the located path is the -root of a mount 1 is returned along with the super magic of the mount -or 0 otherwise. - diff --git a/Documentation/filesystems/autofs4.txt b/Documentation/filesystems/autofs4.txt deleted file mode 100644 index f10dd590f69f..000000000000 --- a/Documentation/filesystems/autofs4.txt +++ /dev/null @@ -1,529 +0,0 @@ - - - - -autofs - how it works -===================== - -Purpose -------- - -The goal of autofs is to provide on-demand mounting and race free -automatic unmounting of various other filesystems. This provides two -key advantages: - -1. There is no need to delay boot until all filesystems that - might be needed are mounted. Processes that try to access those - slow filesystems might be delayed but other processes can - continue freely. This is particularly important for - network filesystems (e.g. NFS) or filesystems stored on - media with a media-changing robot. - -2. The names and locations of filesystems can be stored in - a remote database and can change at any time. The content - in that data base at the time of access will be used to provide - a target for the access. The interpretation of names in the - filesystem can even be programmatic rather than database-backed, - allowing wildcards for example, and can vary based on the user who - first accessed a name. - -Context -------- - -The "autofs4" filesystem module is only one part of an autofs system. -There also needs to be a user-space program which looks up names -and mounts filesystems. This will often be the "automount" program, -though other tools including "systemd" can make use of "autofs4". -This document describes only the kernel module and the interactions -required with any user-space program. Subsequent text refers to this -as the "automount daemon" or simply "the daemon". - -"autofs4" is a Linux kernel module with provides the "autofs" -filesystem type. Several "autofs" filesystems can be mounted and they -can each be managed separately, or all managed by the same daemon. - -Content -------- - -An autofs filesystem can contain 3 sorts of objects: directories, -symbolic links and mount traps. Mount traps are directories with -extra properties as described in the next section. - -Objects can only be created by the automount daemon: symlinks are -created with a regular `symlink` system call, while directories and -mount traps are created with `mkdir`. The determination of whether a -directory should be a mount trap or not is quite _ad hoc_, largely for -historical reasons, and is determined in part by the -*direct*/*indirect*/*offset* mount options, and the *maxproto* mount option. - -If neither the *direct* or *offset* mount options are given (so the -mount is considered to be *indirect*), then the root directory is -always a regular directory, otherwise it is a mount trap when it is -empty and a regular directory when not empty. Note that *direct* and -*offset* are treated identically so a concise summary is that the root -directory is a mount trap only if the filesystem is mounted *direct* -and the root is empty. - -Directories created in the root directory are mount traps only if the -filesystem is mounted *indirect* and they are empty. - -Directories further down the tree depend on the *maxproto* mount -option and particularly whether it is less than five or not. -When *maxproto* is five, no directories further down the -tree are ever mount traps, they are always regular directories. When -the *maxproto* is four (or three), these directories are mount traps -precisely when they are empty. - -So: non-empty (i.e. non-leaf) directories are never mount traps. Empty -directories are sometimes mount traps, and sometimes not depending on -where in the tree they are (root, top level, or lower), the *maxproto*, -and whether the mount was *indirect* or not. - -Mount Traps ---------------- - -A core element of the implementation of autofs is the Mount Traps -which are provided by the Linux VFS. Any directory provided by a -filesystem can be designated as a trap. This involves two separate -features that work together to allow autofs to do its job. - -**DCACHE_NEED_AUTOMOUNT** - -If a dentry has the DCACHE_NEED_AUTOMOUNT flag set (which gets set if -the inode has S_AUTOMOUNT set, or can be set directly) then it is -(potentially) a mount trap. Any access to this directory beyond a -"`stat`" will (normally) cause the `d_op->d_automount()` dentry operation -to be called. The task of this method is to find the filesystem that -should be mounted on the directory and to return it. The VFS is -responsible for actually mounting the root of this filesystem on the -directory. - -autofs doesn't find the filesystem itself but sends a message to the -automount daemon asking it to find and mount the filesystem. The -autofs `d_automount` method then waits for the daemon to report that -everything is ready. It will then return "`NULL`" indicating that the -mount has already happened. The VFS doesn't try to mount anything but -follows down the mount that is already there. - -This functionality is sufficient for some users of mount traps such -as NFS which creates traps so that mountpoints on the server can be -reflected on the client. However it is not sufficient for autofs. As -mounting onto a directory is considered to be "beyond a `stat`", the -automount daemon would not be able to mount a filesystem on the 'trap' -directory without some way to avoid getting caught in the trap. For -that purpose there is another flag. - -**DCACHE_MANAGE_TRANSIT** - -If a dentry has DCACHE_MANAGE_TRANSIT set then two very different but -related behaviors are invoked, both using the `d_op->d_manage()` -dentry operation. - -Firstly, before checking to see if any filesystem is mounted on the -directory, d_manage() will be called with the `rcu_walk` parameter set -to `false`. It may return one of three things: - -- A return value of zero indicates that there is nothing special - about this dentry and normal checks for mounts and automounts - should proceed. - - autofs normally returns zero, but first waits for any - expiry (automatic unmounting of the mounted filesystem) to - complete. This avoids races. - -- A return value of `-EISDIR` tells the VFS to ignore any mounts - on the directory and to not consider calling `->d_automount()`. - This effectively disables the **DCACHE_NEED_AUTOMOUNT** flag - causing the directory not be a mount trap after all. - - autofs returns this if it detects that the process performing the - lookup is the automount daemon and that the mount has been - requested but has not yet completed. How it determines this is - discussed later. This allows the automount daemon not to get - caught in the mount trap. - - There is a subtlety here. It is possible that a second autofs - filesystem can be mounted below the first and for both of them to - be managed by the same daemon. For the daemon to be able to mount - something on the second it must be able to "walk" down past the - first. This means that d_manage cannot *always* return -EISDIR for - the automount daemon. It must only return it when a mount has - been requested, but has not yet completed. - - `d_manage` also returns `-EISDIR` if the dentry shouldn't be a - mount trap, either because it is a symbolic link or because it is - not empty. - -- Any other negative value is treated as an error and returned - to the caller. - - autofs can return - - - -ENOENT if the automount daemon failed to mount anything, - - -ENOMEM if it ran out of memory, - - -EINTR if a signal arrived while waiting for expiry to - complete - - or any other error sent down by the automount daemon. - - -The second use case only occurs during an "RCU-walk" and so `rcu_walk` -will be set. - -An RCU-walk is a fast and lightweight process for walking down a -filename path (i.e. it is like running on tip-toes). RCU-walk cannot -cope with all situations so when it finds a difficulty it falls back -to "REF-walk", which is slower but more robust. - -RCU-walk will never call `->d_automount`; the filesystems must already -be mounted or RCU-walk cannot handle the path. -To determine if a mount-trap is safe for RCU-walk mode it calls -`->d_manage()` with `rcu_walk` set to `true`. - -In this case `d_manage()` must avoid blocking and should avoid taking -spinlocks if at all possible. Its sole purpose is to determine if it -would be safe to follow down into any mounted directory and the only -reason that it might not be is if an expiry of the mount is -underway. - -In the `rcu_walk` case, `d_manage()` cannot return -EISDIR to tell the -VFS that this is a directory that doesn't require d_automount. If -`rcu_walk` sees a dentry with DCACHE_NEED_AUTOMOUNT set but nothing -mounted, it *will* fall back to REF-walk. `d_manage()` cannot make the -VFS remain in RCU-walk mode, but can only tell it to get out of -RCU-walk mode by returning `-ECHILD`. - -So `d_manage()`, when called with `rcu_walk` set, should either return --ECHILD if there is any reason to believe it is unsafe to end the -mounted filesystem, and otherwise should return 0. - -autofs will return `-ECHILD` if an expiry of the filesystem has been -initiated or is being considered, otherwise it returns 0. - - -Mountpoint expiry ------------------ - -The VFS has a mechanism for automatically expiring unused mounts, -much as it can expire any unused dentry information from the dcache. -This is guided by the MNT_SHRINKABLE flag. This only applies to -mounts that were created by `d_automount()` returning a filesystem to be -mounted. As autofs doesn't return such a filesystem but leaves the -mounting to the automount daemon, it must involve the automount daemon -in unmounting as well. This also means that autofs has more control -of expiry. - -The VFS also supports "expiry" of mounts using the MNT_EXPIRE flag to -the `umount` system call. Unmounting with MNT_EXPIRE will fail unless -a previous attempt had been made, and the filesystem has been inactive -and untouched since that previous attempt. autofs4 does not depend on -this but has its own internal tracking of whether filesystems were -recently used. This allows individual names in the autofs directory -to expire separately. - -With version 4 of the protocol, the automount daemon can try to -unmount any filesystems mounted on the autofs filesystem or remove any -symbolic links or empty directories any time it likes. If the unmount -or removal is successful the filesystem will be returned to the state -it was before the mount or creation, so that any access of the name -will trigger normal auto-mount processing. In particlar, `rmdir` and -`unlink` do not leave negative entries in the dcache as a normal -filesystem would, so an attempt to access a recently-removed object is -passed to autofs for handling. - -With version 5, this is not safe except for unmounting from top-level -directories. As lower-level directories are never mount traps, other -processes will see an empty directory as soon as the filesystem is -unmounted. So it is generally safest to use the autofs expiry -protocol described below. - -Normally the daemon only wants to remove entries which haven't been -used for a while. For this purpose autofs maintains a "`last_used`" -time stamp on each directory or symlink. For symlinks it genuinely -does record the last time the symlink was "used" or followed to find -out where it points to. For directories the field is a slight -misnomer. It actually records the last time that autofs checked if -the directory or one of its descendents was busy and found that it -was. This is just as useful and doesn't require updating the field so -often. - -The daemon is able to ask autofs if anything is due to be expired, -using an `ioctl` as discussed later. For a *direct* mount, autofs -considers if the entire mount-tree can be unmounted or not. For an -*indirect* mount, autofs considers each of the names in the top level -directory to determine if any of those can be unmounted and cleaned -up. - -There is an option with indirect mounts to consider each of the leaves -that has been mounted on instead of considering the top-level names. -This is intended for compatability with version 4 of autofs and should -be considered as deprecated. - -When autofs considers a directory it checks the `last_used` time and -compares it with the "timeout" value set when the filesystem was -mounted, though this check is ignored in some cases. It also checks if -the directory or anything below it is in use. For symbolic links, -only the `last_used` time is ever considered. - -If both appear to support expiring the directory or symlink, an action -is taken. - -There are two ways to ask autofs to consider expiry. The first is to -use the **AUTOFS_IOC_EXPIRE** ioctl. This only works for indirect -mounts. If it finds something in the root directory to expire it will -return the name of that thing. Once a name has been returned the -automount daemon needs to unmount any filesystems mounted below the -name normally. As described above, this is unsafe for non-toplevel -mounts in a version-5 autofs. For this reason the current `automountd` -does not use this ioctl. - -The second mechanism uses either the **AUTOFS_DEV_IOCTL_EXPIRE_CMD** or -the **AUTOFS_IOC_EXPIRE_MULTI** ioctl. This will work for both direct and -indirect mounts. If it selects an object to expire, it will notify -the daemon using the notification mechanism described below. This -will block until the daemon acknowledges the expiry notification. -This implies that the "`EXPIRE`" ioctl must be sent from a different -thread than the one which handles notification. - -While the ioctl is blocking, the entry is marked as "expiring" and -`d_manage` will block until the daemon affirms that the unmount has -completed (together with removing any directories that might have been -necessary), or has been aborted. - -Communicating with autofs: detecting the daemon ------------------------------------------------ - -There are several forms of communication between the automount daemon -and the filesystem. As we have already seen, the daemon can create and -remove directories and symlinks using normal filesystem operations. -autofs knows whether a process requesting some operation is the daemon -or not based on its process-group id number (see getpgid(1)). - -When an autofs filesystem is mounted the pgid of the mounting -processes is recorded unless the "pgrp=" option is given, in which -case that number is recorded instead. Any request arriving from a -process in that process group is considered to come from the daemon. -If the daemon ever has to be stopped and restarted a new pgid can be -provided through an ioctl as will be described below. - -Communicating with autofs: the event pipe ------------------------------------------ - -When an autofs filesystem is mounted, the 'write' end of a pipe must -be passed using the 'fd=' mount option. autofs will write -notification messages to this pipe for the daemon to respond to. -For version 5, the format of the message is: - - struct autofs_v5_packet { - int proto_version; /* Protocol version */ - int type; /* Type of packet */ - autofs_wqt_t wait_queue_token; - __u32 dev; - __u64 ino; - __u32 uid; - __u32 gid; - __u32 pid; - __u32 tgid; - __u32 len; - char name[NAME_MAX+1]; - }; - -where the type is one of - - autofs_ptype_missing_indirect - autofs_ptype_expire_indirect - autofs_ptype_missing_direct - autofs_ptype_expire_direct - -so messages can indicate that a name is missing (something tried to -access it but it isn't there) or that it has been selected for expiry. - -The pipe will be set to "packet mode" (equivalent to passing -`O_DIRECT`) to _pipe2(2)_ so that a read from the pipe will return at -most one packet, and any unread portion of a packet will be discarded. - -The `wait_queue_token` is a unique number which can identify a -particular request to be acknowledged. When a message is sent over -the pipe the affected dentry is marked as either "active" or -"expiring" and other accesses to it block until the message is -acknowledged using one of the ioctls below and the relevant -`wait_queue_token`. - -Communicating with autofs: root directory ioctls ------------------------------------------------- - -The root directory of an autofs filesystem will respond to a number of -ioctls. The process issuing the ioctl must have the CAP_SYS_ADMIN -capability, or must be the automount daemon. - -The available ioctl commands are: - -- **AUTOFS_IOC_READY**: a notification has been handled. The argument - to the ioctl command is the "wait_queue_token" number - corresponding to the notification being acknowledged. -- **AUTOFS_IOC_FAIL**: similar to above, but indicates failure with - the error code `ENOENT`. -- **AUTOFS_IOC_CATATONIC**: Causes the autofs to enter "catatonic" - mode meaning that it stops sending notifications to the daemon. - This mode is also entered if a write to the pipe fails. -- **AUTOFS_IOC_PROTOVER**: This returns the protocol version in use. -- **AUTOFS_IOC_PROTOSUBVER**: Returns the protocol sub-version which - is really a version number for the implementation. It is - currently 2. -- **AUTOFS_IOC_SETTIMEOUT**: This passes a pointer to an unsigned - long. The value is used to set the timeout for expiry, and - the current timeout value is stored back through the pointer. -- **AUTOFS_IOC_ASKUMOUNT**: Returns, in the pointed-to `int`, 1 if - the filesystem could be unmounted. This is only a hint as - the situation could change at any instant. This call can be - use to avoid a more expensive full unmount attempt. -- **AUTOFS_IOC_EXPIRE**: as described above, this asks if there is - anything suitable to expire. A pointer to a packet: - - struct autofs_packet_expire_multi { - int proto_version; /* Protocol version */ - int type; /* Type of packet */ - autofs_wqt_t wait_queue_token; - int len; - char name[NAME_MAX+1]; - }; - - is required. This is filled in with the name of something - that can be unmounted or removed. If nothing can be expired, - `errno` is set to `EAGAIN`. Even though a `wait_queue_token` - is present in the structure, no "wait queue" is established - and no acknowledgment is needed. -- **AUTOFS_IOC_EXPIRE_MULTI**: This is similar to - **AUTOFS_IOC_EXPIRE** except that it causes notification to be - sent to the daemon, and it blocks until the daemon acknowledges. - The argument is an integer which can contain two different flags. - - **AUTOFS_EXP_IMMEDIATE** causes `last_used` time to be ignored - and objects are expired if the are not in use. - - **AUTOFS_EXP_LEAVES** will select a leaf rather than a top-level - name to expire. This is only safe when *maxproto* is 4. - -Communicating with autofs: char-device ioctls ---------------------------------------------- - -It is not always possible to open the root of an autofs filesystem, -particularly a *direct* mounted filesystem. If the automount daemon -is restarted there is no way for it to regain control of existing -mounts using any of the above communication channels. To address this -need there is a "miscellaneous" character device (major 10, minor 235) -which can be used to communicate directly with the autofs filesystem. -It requires CAP_SYS_ADMIN for access. - -The `ioctl`s that can be used on this device are described in a separate -document `autofs4-mount-control.txt`, and are summarized briefly here. -Each ioctl is passed a pointer to an `autofs_dev_ioctl` structure: - - struct autofs_dev_ioctl { - __u32 ver_major; - __u32 ver_minor; - __u32 size; /* total size of data passed in - * including this struct */ - __s32 ioctlfd; /* automount command fd */ - - /* Command parameters */ - union { - struct args_protover protover; - struct args_protosubver protosubver; - struct args_openmount openmount; - struct args_ready ready; - struct args_fail fail; - struct args_setpipefd setpipefd; - struct args_timeout timeout; - struct args_requester requester; - struct args_expire expire; - struct args_askumount askumount; - struct args_ismountpoint ismountpoint; - }; - - char path[0]; - }; - -For the **OPEN_MOUNT** and **IS_MOUNTPOINT** commands, the target -filesystem is identified by the `path`. All other commands identify -the filesystem by the `ioctlfd` which is a file descriptor open on the -root, and which can be returned by **OPEN_MOUNT**. - -The `ver_major` and `ver_minor` are in/out parameters which check that -the requested version is supported, and report the maximum version -that the kernel module can support. - -Commands are: - -- **AUTOFS_DEV_IOCTL_VERSION_CMD**: does nothing, except validate and - set version numbers. -- **AUTOFS_DEV_IOCTL_OPENMOUNT_CMD**: return an open file descriptor - on the root of an autofs filesystem. The filesystem is identified - by name and device number, which is stored in `openmount.devid`. - Device numbers for existing filesystems can be found in - `/proc/self/mountinfo`. -- **AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD**: same as `close(ioctlfd)`. -- **AUTOFS_DEV_IOCTL_SETPIPEFD_CMD**: if the filesystem is in - catatonic mode, this can provide the write end of a new pipe - in `setpipefd.pipefd` to re-establish communication with a daemon. - The process group of the calling process is used to identify the - daemon. -- **AUTOFS_DEV_IOCTL_REQUESTER_CMD**: `path` should be a - name within the filesystem that has been auto-mounted on. - On successful return, `requester.uid` and `requester.gid` will be - the UID and GID of the process which triggered that mount. -- **AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD**: Check if path is a - mountpoint of a particular type - see separate documentation for - details. -- **AUTOFS_DEV_IOCTL_PROTOVER_CMD**: -- **AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD**: -- **AUTOFS_DEV_IOCTL_READY_CMD**: -- **AUTOFS_DEV_IOCTL_FAIL_CMD**: -- **AUTOFS_DEV_IOCTL_CATATONIC_CMD**: -- **AUTOFS_DEV_IOCTL_TIMEOUT_CMD**: -- **AUTOFS_DEV_IOCTL_EXPIRE_CMD**: -- **AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD**: These all have the same - function as the similarly named **AUTOFS_IOC** ioctls, except - that **FAIL** can be given an explicit error number in `fail.status` - instead of assuming `ENOENT`, and this **EXPIRE** command - corresponds to **AUTOFS_IOC_EXPIRE_MULTI**. - -Catatonic mode --------------- - -As mentioned, an autofs mount can enter "catatonic" mode. This -happens if a write to the notification pipe fails, or if it is -explicitly requested by an `ioctl`. - -When entering catatonic mode, the pipe is closed and any pending -notifications are acknowledged with the error `ENOENT`. - -Once in catatonic mode attempts to access non-existing names will -result in `ENOENT` while attempts to access existing directories will -be treated in the same way as if they came from the daemon, so mount -traps will not fire. - -When the filesystem is mounted a _uid_ and _gid_ can be given which -set the ownership of directories and symbolic links. When the -filesystem is in catatonic mode, any process with a matching UID can -create directories or symlinks in the root directory, but not in other -directories. - -Catatonic mode can only be left via the -**AUTOFS_DEV_IOCTL_OPENMOUNT_CMD** ioctl on the `/dev/autofs`. - -autofs, name spaces, and shared mounts --------------------------------------- - -With bind mounts and name spaces it is possible for an autofs -filesystem to appear at multiple places in one or more filesystem -name spaces. For this to work sensibly, the autofs filesystem should -always be mounted "shared". e.g. - -> `mount --make-shared /autofs/mount/point` - -The automount daemon is only able to manage a single mount location for -an autofs filesystem and if mounts on that are not 'shared', other -locations will not behave as expected. In particular access to those -other locations will likely result in the `ELOOP` error - -> Too many levels of symbolic links -- cgit v1.2.3 From b6bb226a72f0b95f0ce3edfc108776a274d21a98 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 7 Jun 2018 17:11:38 -0700 Subject: autofs: use autofs instead of autofs4 in documentation Finally remove autofs4 references in the filesystems documentation. Link: http://lkml.kernel.org/r/152626709055.28589.416082809460051475.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/00-INDEX | 4 ++-- Documentation/filesystems/autofs-mount-control.txt | 8 ++++---- Documentation/filesystems/autofs.txt | 10 +++++----- Documentation/filesystems/automount-support.txt | 2 +- Documentation/filesystems/path-lookup.md | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index b7bd6c9009cc..a8bd4af7fbce 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -10,8 +10,8 @@ afs.txt - info and examples for the distributed AFS (Andrew File System) fs. affs.txt - info and mount options for the Amiga Fast File System. -autofs4-mount-control.txt - - info on device control operations for autofs4 module. +autofs-mount-control.txt + - info on device control operations for autofs module. automount-support.txt - information about filesystem automount support. befs.txt diff --git a/Documentation/filesystems/autofs-mount-control.txt b/Documentation/filesystems/autofs-mount-control.txt index 52c1b6cdfc91..45edad6933cc 100644 --- a/Documentation/filesystems/autofs-mount-control.txt +++ b/Documentation/filesystems/autofs-mount-control.txt @@ -1,5 +1,5 @@ -Miscellaneous Device control operations for the autofs4 kernel module +Miscellaneous Device control operations for the autofs kernel module ==================================================================== The problem @@ -164,7 +164,7 @@ possibility for future development due to the requirements of the message bus architecture. -autofs4 Miscellaneous Device mount control interface +autofs Miscellaneous Device mount control interface ==================================================== The control interface is opening a device node, typically /dev/autofs. @@ -244,7 +244,7 @@ The device node ioctl operations implemented by this interface are: AUTOFS_DEV_IOCTL_VERSION ------------------------ -Get the major and minor version of the autofs4 device ioctl kernel module +Get the major and minor version of the autofs device ioctl kernel module implementation. It requires an initialized struct autofs_dev_ioctl as an input parameter and sets the version information in the passed in structure. It returns 0 on success or the error -EINVAL if a version mismatch is @@ -254,7 +254,7 @@ detected. AUTOFS_DEV_IOCTL_PROTOVER_CMD and AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD ------------------------------------------------------------------ -Get the major and minor version of the autofs4 protocol version understood +Get the major and minor version of the autofs protocol version understood by loaded module. This call requires an initialized struct autofs_dev_ioctl with the ioctlfd field set to a valid autofs mount point descriptor and sets the requested version number in version field of struct args_protover diff --git a/Documentation/filesystems/autofs.txt b/Documentation/filesystems/autofs.txt index f10dd590f69f..373ad25852d3 100644 --- a/Documentation/filesystems/autofs.txt +++ b/Documentation/filesystems/autofs.txt @@ -30,15 +30,15 @@ key advantages: Context ------- -The "autofs4" filesystem module is only one part of an autofs system. +The "autofs" filesystem module is only one part of an autofs system. There also needs to be a user-space program which looks up names and mounts filesystems. This will often be the "automount" program, -though other tools including "systemd" can make use of "autofs4". +though other tools including "systemd" can make use of "autofs". This document describes only the kernel module and the interactions required with any user-space program. Subsequent text refers to this as the "automount daemon" or simply "the daemon". -"autofs4" is a Linux kernel module with provides the "autofs" +"autofs" is a Linux kernel module with provides the "autofs" filesystem type. Several "autofs" filesystems can be mounted and they can each be managed separately, or all managed by the same daemon. @@ -215,7 +215,7 @@ of expiry. The VFS also supports "expiry" of mounts using the MNT_EXPIRE flag to the `umount` system call. Unmounting with MNT_EXPIRE will fail unless a previous attempt had been made, and the filesystem has been inactive -and untouched since that previous attempt. autofs4 does not depend on +and untouched since that previous attempt. autofs does not depend on this but has its own internal tracking of whether filesystems were recently used. This allows individual names in the autofs directory to expire separately. @@ -415,7 +415,7 @@ which can be used to communicate directly with the autofs filesystem. It requires CAP_SYS_ADMIN for access. The `ioctl`s that can be used on this device are described in a separate -document `autofs4-mount-control.txt`, and are summarized briefly here. +document `autofs-mount-control.txt`, and are summarized briefly here. Each ioctl is passed a pointer to an `autofs_dev_ioctl` structure: struct autofs_dev_ioctl { diff --git a/Documentation/filesystems/automount-support.txt b/Documentation/filesystems/automount-support.txt index 7eb762eb3136..b0afd3d55eaf 100644 --- a/Documentation/filesystems/automount-support.txt +++ b/Documentation/filesystems/automount-support.txt @@ -9,7 +9,7 @@ also be requested by userspace. IN-KERNEL AUTOMOUNTING ====================== -See section "Mount Traps" of Documentation/filesystems/autofs4.txt +See section "Mount Traps" of Documentation/filesystems/autofs.txt Then from userspace, you can just do something like: diff --git a/Documentation/filesystems/path-lookup.md b/Documentation/filesystems/path-lookup.md index 1933ef734e63..e2edd45c4bc0 100644 --- a/Documentation/filesystems/path-lookup.md +++ b/Documentation/filesystems/path-lookup.md @@ -460,7 +460,7 @@ this retry process in the next article. Automount points are locations in the filesystem where an attempt to lookup a name can trigger changes to how that lookup should be handled, in particular by mounting a filesystem there. These are -covered in greater detail in autofs4.txt in the Linux documentation +covered in greater detail in autofs.txt in the Linux documentation tree, but a few notes specifically related to path lookup are in order here. -- cgit v1.2.3