From 03dd4cb26d967f9588437b0fc9cc0e8353322bb7 Mon Sep 17 00:00:00 2001 From: André Fabian Silva Delgado Date: Fri, 25 Mar 2016 03:53:42 -0300 Subject: Linux-libre 4.5-gnu --- Documentation/ABI/testing/configfs-iio | 21 + Documentation/ABI/testing/configfs-rdma_cm | 22 + .../ABI/testing/configfs-usb-gadget-sourcesink | 2 + Documentation/ABI/testing/configfs-usb-gadget-tcm | 6 + Documentation/ABI/testing/debugfs-aufs | 50 - Documentation/ABI/testing/sysfs-aufs | 31 - Documentation/ABI/testing/sysfs-bus-iio-ina2xx-adc | 24 + Documentation/ABI/testing/sysfs-bus-usb | 11 + Documentation/ABI/testing/sysfs-class-infiniband | 16 + Documentation/ABI/testing/sysfs-class-net-cdc_ncm | 19 + Documentation/ABI/testing/sysfs-class-net-mesh | 4 +- Documentation/ABI/testing/sysfs-class-net-qmi | 23 + Documentation/ABI/testing/sysfs-class-watchdog | 51 + Documentation/ABI/testing/sysfs-fs-f2fs | 6 + Documentation/ABI/testing/sysfs-kernel-livepatch | 6 +- Documentation/ABI/testing/sysfs-ptp | 2 +- Documentation/CodingStyle | 2 +- Documentation/DMA-API-HOWTO.txt | 10 - Documentation/DMA-API.txt | 2 +- Documentation/DocBook/Makefile | 10 +- Documentation/DocBook/device-drivers.tmpl | 85 +- Documentation/DocBook/gpu.tmpl | 1003 +------ Documentation/DocBook/iio.tmpl | 2 +- Documentation/DocBook/media/Makefile | 6 +- Documentation/DocBook/media/dvb/dvbproperty.xml | 2 +- Documentation/DocBook/media/dvb/examples.xml | 2 +- Documentation/DocBook/media/dvb/intro.xml | 2 +- Documentation/DocBook/media/v4l/capture.c.xml | 2 +- Documentation/DocBook/media/v4l/compat.xml | 2 +- Documentation/DocBook/media/v4l/io.xml | 10 +- .../DocBook/media/v4l/media-controller.xml | 44 +- .../DocBook/media/v4l/media-ioc-enum-entities.xml | 104 +- .../DocBook/media/v4l/media-ioc-enum-links.xml | 56 - .../DocBook/media/v4l/media-ioc-g-topology.xml | 394 +++ Documentation/DocBook/media/v4l/media-types.xml | 236 ++ Documentation/DocBook/media/v4l/v4l2.xml | 10 + .../DocBook/media/v4l/vidioc-create-bufs.xml | 30 
+- .../DocBook/media/v4l/vidioc-dbg-g-chip-info.xml | 2 +- .../DocBook/media/v4l/vidioc-dbg-g-register.xml | 2 +- Documentation/DocBook/media/v4l/vidioc-enumstd.xml | 2 +- .../DocBook/media/v4l/vidioc-g-ext-ctrls.xml | 28 +- Documentation/DocBook/media_api.tmpl | 6 +- Documentation/DocBook/mtdnand.tmpl | 35 +- Documentation/HOWTO | 2 +- Documentation/Intel-IOMMU.txt | 2 +- Documentation/Makefile | 2 +- .../RCU/Design/Requirements/2013-08-is-it-dead.png | Bin 0 -> 100825 bytes .../Design/Requirements/GPpartitionReaders1.svg | 374 +++ .../RCU/Design/Requirements/RCUApplicability.svg | 237 ++ .../Design/Requirements/ReadersPartitionGP1.svg | 639 +++++ .../RCU/Design/Requirements/Requirements.html | 2897 ++++++++++++++++++++ .../RCU/Design/Requirements/Requirements.htmlx | 2741 ++++++++++++++++++ Documentation/RCU/Design/htmlqqz.sh | 108 + Documentation/accounting/getdelays.c | 3 +- Documentation/arm/Marvell/README | 19 +- Documentation/arm/pxa/mfp.txt | 26 +- Documentation/arm64/silicon-errata.txt | 58 + Documentation/block/cfq-iosched.txt | 15 +- Documentation/cgroup-v1/00-INDEX | 28 + Documentation/cgroup-v1/blkio-controller.txt | 375 +++ Documentation/cgroup-v1/cgroups.txt | 682 +++++ Documentation/cgroup-v1/cpuacct.txt | 49 + Documentation/cgroup-v1/cpusets.txt | 839 ++++++ Documentation/cgroup-v1/devices.txt | 116 + Documentation/cgroup-v1/freezer-subsystem.txt | 123 + Documentation/cgroup-v1/hugetlb.txt | 45 + Documentation/cgroup-v1/memcg_test.txt | 280 ++ Documentation/cgroup-v1/memory.txt | 876 ++++++ Documentation/cgroup-v1/net_cls.txt | 39 + Documentation/cgroup-v1/net_prio.txt | 55 + Documentation/cgroup-v1/pids.txt | 85 + Documentation/cgroup-v2.txt | 1386 ++++++++++ Documentation/cgroups/00-INDEX | 30 - Documentation/cgroups/blkio-controller.txt | 455 --- Documentation/cgroups/cgroups.txt | 682 ----- Documentation/cgroups/cpuacct.txt | 49 - Documentation/cgroups/cpusets.txt | 839 ------ Documentation/cgroups/devices.txt | 116 - 
Documentation/cgroups/freezer-subsystem.txt | 123 - Documentation/cgroups/hugetlb.txt | 45 - Documentation/cgroups/memcg_test.txt | 280 -- Documentation/cgroups/memory.txt | 876 ------ Documentation/cgroups/net_cls.txt | 39 - Documentation/cgroups/net_prio.txt | 55 - Documentation/cgroups/pids.txt | 85 - Documentation/cgroups/unified-hierarchy.txt | 647 ----- Documentation/cpu-freq/intel-pstate.txt | 241 +- Documentation/cpu-freq/pcc-cpufreq.txt | 4 +- Documentation/cpu-hotplug.txt | 2 +- Documentation/device-mapper/verity.txt | 40 +- Documentation/devicetree/bindings/arm/arm,scpi.txt | 2 +- .../devicetree/bindings/arm/bcm/brcm,bcm2835.txt | 4 + .../devicetree/bindings/arm/bcm/brcm,bcm4708.txt | 7 + .../bindings/arm/bcm/brcm,nsp-cpu-method.txt | 39 + .../devicetree/bindings/arm/compulab-boards.txt | 25 + Documentation/devicetree/bindings/arm/cpus.txt | 21 + .../devicetree/bindings/arm/exynos/smp-sysram.txt | 38 - Documentation/devicetree/bindings/arm/fsl.txt | 4 + .../bindings/arm/hisilicon/hisilicon.txt | 16 + Documentation/devicetree/bindings/arm/l2c2x0.txt | 105 + Documentation/devicetree/bindings/arm/l2cc.txt | 93 - .../devicetree/bindings/arm/marvell,kirkwood.txt | 2 + Documentation/devicetree/bindings/arm/mediatek.txt | 4 + .../bindings/arm/mediatek/mediatek,infracfg.txt | 2 +- .../bindings/arm/mediatek/mediatek,pericfg.txt | 2 +- .../devicetree/bindings/arm/omap/omap.txt | 19 + Documentation/devicetree/bindings/arm/pmu.txt | 5 +- Documentation/devicetree/bindings/arm/psci.txt | 25 +- Documentation/devicetree/bindings/arm/rockchip.txt | 26 + .../devicetree/bindings/arm/rockchip/pmu-sram.txt | 16 - .../devicetree/bindings/arm/rockchip/smp-sram.txt | 30 - .../devicetree/bindings/arm/samsung/exynos-adc.txt | 3 + Documentation/devicetree/bindings/arm/scu.txt | 3 + Documentation/devicetree/bindings/arm/secure.txt | 53 + Documentation/devicetree/bindings/arm/shmobile.txt | 4 + .../devicetree/bindings/arm/technologic.txt | 6 + 
.../devicetree/bindings/ata/brcm,sata-brcmstb.txt | 4 +- .../devicetree/bindings/ata/sata_rcar.txt | 1 + .../bindings/bus/uniphier-system-bus.txt | 66 + .../devicetree/bindings/clock/arm-syscon-icst.txt | 40 + .../bindings/clock/brcm,bcm2835-aux-clock.txt | 31 + .../bindings/clock/brcm,iproc-clocks.txt | 5 + .../devicetree/bindings/clock/cs2000-cp.txt | 22 + .../bindings/clock/dove-divider-clock.txt | 28 + .../bindings/clock/nvidia,tegra210-car.txt | 56 + .../devicetree/bindings/clock/nxp,lpc3220-clk.txt | 30 + .../bindings/clock/nxp,lpc3220-usb-clk.txt | 22 + .../devicetree/bindings/clock/qcom,gcc.txt | 1 + .../devicetree/bindings/clock/qcom,mmcc.txt | 1 + .../bindings/clock/renesas,cpg-div6-clocks.txt | 4 + .../bindings/clock/renesas,h8300-div-clock.txt | 2 +- .../bindings/clock/rockchip,rk3036-cru.txt | 56 + .../bindings/clock/rockchip,rk3228-cru.txt | 58 + .../devicetree/bindings/clock/samsung,s2mps11.txt | 49 + Documentation/devicetree/bindings/clock/sunxi.txt | 10 + .../devicetree/bindings/clock/tango4-clock.txt | 23 + .../bindings/cpufreq/arm_big_little_dt.txt | 2 +- .../devicetree/bindings/cpufreq/cpufreq-dt.txt | 2 +- .../devicetree/bindings/cpufreq/cpufreq-st.txt | 91 + .../devicetree/bindings/crypto/rockchip-crypto.txt | 29 + .../devicetree/bindings/display/bridge/tda998x.txt | 4 + .../bindings/display/etnaviv/etnaviv-drm.txt | 54 + .../bindings/display/exynos/exynos_dp.txt | 41 +- .../bindings/display/exynos/exynos_hdmi.txt | 7 +- .../devicetree/bindings/display/msm/dsi.txt | 12 +- .../devicetree/bindings/display/msm/mdp.txt | 26 +- .../bindings/display/panel/boe,tv080wum-nl0.txt | 7 + .../bindings/display/panel/innolux,g121x1-l03.txt | 7 + .../bindings/display/panel/kyo,tcg121xglp.txt | 7 + .../display/panel/panasonic,vvx10f034n00.txt | 20 + .../display/panel/qiaodian,qd43003c0-40.txt | 7 + .../bindings/display/panel/sharp,ls043t1le01.txt | 22 + .../display/panel/startek,startek-kd050c.txt | 4 + .../display/rockchip/dw_mipi_dsi_rockchip.txt | 60 + 
.../bindings/display/rockchip/rockchip-vop.txt | 1 + .../bindings/display/simple-framebuffer.txt | 13 +- .../devicetree/bindings/dma/renesas,rcar-dmac.txt | 13 +- .../devicetree/bindings/dma/renesas,usb-dmac.txt | 10 +- .../devicetree/bindings/dma/stm32-dma.txt | 82 + .../devicetree/bindings/dma/ti-dma-crossbar.txt | 6 + .../devicetree/bindings/eeprom/eeprom.txt | 21 +- .../devicetree/bindings/extcon/extcon-arizona.txt | 60 + .../devicetree/bindings/extcon/extcon-max3355.txt | 21 + .../devicetree/bindings/gpio/gpio-pca953x.txt | 1 + .../devicetree/bindings/gpio/gpio-sx150x.txt | 3 +- .../devicetree/bindings/gpio/gpio-tps65086.txt | 16 + .../devicetree/bindings/gpio/snps-dwapb-gpio.txt | 2 +- Documentation/devicetree/bindings/i2c/i2c-at91.txt | 5 +- .../devicetree/bindings/i2c/i2c-brcmstb.txt | 2 +- Documentation/devicetree/bindings/i2c/i2c-rcar.txt | 4 + Documentation/devicetree/bindings/i2c/i2c.txt | 36 +- .../devicetree/bindings/i2c/trivial-devices.txt | 16 +- .../devicetree/bindings/iio/accel/mma8452.txt | 6 + .../devicetree/bindings/iio/adc/imx7d-adc.txt | 22 + .../devicetree/bindings/iio/adc/mcp320x.txt | 30 +- .../devicetree/bindings/iio/adc/mcp3422.txt | 3 +- .../devicetree/bindings/iio/adc/palmas-gpadc.txt | 48 + .../devicetree/bindings/iio/adc/ti-adc128s052.txt | 4 +- .../devicetree/bindings/iio/adc/ti-ads8688.txt | 20 + .../devicetree/bindings/iio/health/max30100.txt | 21 + .../devicetree/bindings/iio/light/us5182d.txt | 11 + .../devicetree/bindings/iio/st-sensors.txt | 1 + .../devicetree/bindings/input/gpio-keys.txt | 1 + .../allwinner,sun67i-sc-nmi.txt | 27 - .../interrupt-controller/allwinner,sunxi-nmi.txt | 27 + .../bindings/interrupt-controller/arm,gic-v3.txt | 5 +- .../bindings/interrupt-controller/arm,gic.txt | 1 + .../interrupt-controller/hisilicon,mbigen-v2.txt | 74 + .../interrupt-controller/mediatek,sysirq.txt | 1 + .../interrupt-controller/microchip,pic32-evic.txt | 67 + .../interrupt-controller/qca,ath79-misc-intc.txt | 2 +- 
.../interrupt-controller/technologic,ts4800.txt | 16 + .../bindings/iommu/renesas,ipmmu-vmsa.txt | 12 +- .../devicetree/bindings/media/exynos5-gsc.txt | 4 + .../devicetree/bindings/media/i2c/adp1653.txt | 7 +- .../bindings/media/stih407-c8sectpfe.txt | 20 +- .../memory-controllers/ath79-ddr-controller.txt | 8 +- Documentation/devicetree/bindings/mfd/arizona.txt | 24 +- Documentation/devicetree/bindings/mfd/palmas.txt | 2 +- Documentation/devicetree/bindings/mfd/s2mpa01.txt | 90 - Documentation/devicetree/bindings/mfd/s2mps11.txt | 153 -- .../devicetree/bindings/mfd/samsung,sec-core.txt | 88 + Documentation/devicetree/bindings/mfd/syscon.txt | 4 + .../bindings/mips/pic32/microchip,pic32mzda.txt | 31 + Documentation/devicetree/bindings/misc/sram.txt | 67 - .../devicetree/bindings/mmc/renesas,mmcif.txt | 1 + .../devicetree/bindings/mtd/brcm,brcmnand.txt | 32 + .../devicetree/bindings/mtd/fsl-quadspi.txt | 3 +- .../bindings/mtd/ingenic,jz4780-nand.txt | 86 + .../devicetree/bindings/mtd/jedec,spi-nor.txt | 56 +- .../devicetree/bindings/mtd/mtk-quadspi.txt | 41 + .../devicetree/bindings/mtd/partition.txt | 2 + .../devicetree/bindings/net/brcm,bcmgenet.txt | 4 +- .../devicetree/bindings/net/cdns-emac.txt | 20 - Documentation/devicetree/bindings/net/dsa/dsa.txt | 3 + .../devicetree/bindings/net/hisilicon-hns-dsaf.txt | 5 +- .../devicetree/bindings/net/hisilicon-hns-mdio.txt | 7 +- .../devicetree/bindings/net/hisilicon-hns-nic.txt | 7 +- .../devicetree/bindings/net/ieee802154/adf7242.txt | 18 + Documentation/devicetree/bindings/net/macb.txt | 11 + .../bindings/net/marvell-armada-370-neta.txt | 7 +- .../devicetree/bindings/net/mdio-mux-gpio.txt | 8 - Documentation/devicetree/bindings/net/mdio-mux.txt | 8 - .../bindings/net/mediatek,mt7620-gsw.txt | 26 + .../devicetree/bindings/net/micrel-ksz90x1.txt | 17 +- .../devicetree/bindings/net/nfc/st95hf.txt | 50 + Documentation/devicetree/bindings/net/phy.txt | 6 +- .../devicetree/bindings/net/ralink,rt2880-net.txt | 61 + 
.../devicetree/bindings/net/ralink,rt3050-esw.txt | 32 + .../devicetree/bindings/net/renesas,ravb.txt | 16 +- .../devicetree/bindings/net/socfpga-dwmac.txt | 2 + Documentation/devicetree/bindings/net/stmmac.txt | 25 +- Documentation/devicetree/bindings/opp/opp.txt | 132 +- .../devicetree/bindings/pci/brcm,iproc-pcie.txt | 40 +- .../devicetree/bindings/pci/hisilicon-pcie.txt | 8 +- .../devicetree/bindings/pci/pci-rcar-gen2.txt | 18 +- .../devicetree/bindings/pci/qcom,pcie.txt | 233 ++ Documentation/devicetree/bindings/pci/rcar-pci.txt | 15 +- .../bindings/phy/brcm,brcmstb-sata-phy.txt | 1 + .../devicetree/bindings/phy/phy-ath79-usb.txt | 18 + .../devicetree/bindings/phy/phy-hi6220-usb.txt | 16 + .../devicetree/bindings/phy/rcar-gen3-phy-usb2.txt | 39 + .../devicetree/bindings/phy/rockchip-usb-phy.txt | 6 +- .../devicetree/bindings/phy/sun4i-usb-phy.txt | 1 + Documentation/devicetree/bindings/phy/ti-phy.txt | 20 +- .../bindings/pinctrl/allwinner,sunxi-pinctrl.txt | 3 + .../bindings/pinctrl/brcm,cygnus-gpio.txt | 104 - .../bindings/pinctrl/brcm,iproc-gpio.txt | 109 + .../devicetree/bindings/pinctrl/brcm,nsp-gpio.txt | 80 + .../bindings/pinctrl/lantiq,pinctrl-xway.txt | 110 +- .../devicetree/bindings/pinctrl/pinctrl-mt65xx.txt | 9 +- .../bindings/pinctrl/qcom,msm8996-pinctrl.txt | 199 ++ .../devicetree/bindings/pinctrl/qcom,pmic-gpio.txt | 2 + .../devicetree/bindings/pinctrl/qcom,pmic-mpp.txt | 1 + .../bindings/pinctrl/rockchip,pinctrl.txt | 3 +- .../bindings/pinctrl/samsung-pinctrl.txt | 1 + .../devicetree/bindings/pwm/lpc32xx-pwm.txt | 9 +- .../devicetree/bindings/pwm/pwm-omap-dmtimer.txt | 18 + .../bindings/regulator/lm363x-regulator.txt | 34 + .../devicetree/bindings/regulator/pv88060.txt | 124 + .../devicetree/bindings/regulator/pv88090.txt | 65 + .../bindings/regulator/qcom,smd-rpm-regulator.txt | 159 ++ .../bindings/regulator/s5m8767-regulator.txt | 163 -- .../bindings/regulator/samsung,s2mpa01.txt | 79 + .../bindings/regulator/samsung,s2mps11.txt | 102 + 
.../bindings/regulator/samsung,s5m8767.txt | 145 + .../bindings/reset/hisilicon,hi6220-reset.txt | 34 + Documentation/devicetree/bindings/rtc/s3c-rtc.txt | 6 + .../devicetree/bindings/scsi/hisilicon-sas.txt | 69 + Documentation/devicetree/bindings/serial/8250.txt | 1 - .../devicetree/bindings/serial/fsl-imx-uart.txt | 2 +- .../devicetree/bindings/serial/mtk-uart.txt | 14 +- .../bindings/serial/renesas,sci-serial.txt | 44 +- .../bindings/soc/bcm/raspberrypi,bcm2835-power.txt | 47 + Documentation/devicetree/bindings/soc/dove/pmu.txt | 56 + .../devicetree/bindings/soc/mediatek/scpsys.txt | 12 + .../devicetree/bindings/soc/qcom,smd-rpm.txt | 117 - .../devicetree/bindings/soc/qcom/qcom,smd-rpm.txt | 58 + .../devicetree/bindings/soc/qcom/qcom,smp2p.txt | 104 + .../devicetree/bindings/soc/qcom/qcom,smsm.txt | 104 + .../devicetree/bindings/soc/sunxi/sram.txt | 72 - .../devicetree/bindings/soc/ti/wkup_m3_ipc.txt | 57 + Documentation/devicetree/bindings/sound/ak4613.txt | 10 + .../devicetree/bindings/sound/atmel-classd.txt | 6 + .../devicetree/bindings/sound/atmel-pdmic.txt | 55 + Documentation/devicetree/bindings/sound/da7218.txt | 104 + Documentation/devicetree/bindings/sound/da7219.txt | 8 +- .../devicetree/bindings/sound/fsl,asrc.txt | 5 + .../devicetree/bindings/sound/fsl,esai.txt | 5 + .../devicetree/bindings/sound/fsl,spdif.txt | 5 + .../devicetree/bindings/sound/fsl-asoc-card.txt | 2 + .../devicetree/bindings/sound/img,i2s-in.txt | 47 + .../devicetree/bindings/sound/img,i2s-out.txt | 51 + .../devicetree/bindings/sound/img,parallel-out.txt | 44 + .../bindings/sound/img,pistachio-internal-dac.txt | 18 + .../devicetree/bindings/sound/img,spdif-in.txt | 41 + .../devicetree/bindings/sound/img,spdif-out.txt | 44 + .../devicetree/bindings/sound/inno-rk3036.txt | 20 + .../devicetree/bindings/sound/pcm1792a.txt | 18 - .../devicetree/bindings/sound/pcm179x.txt | 18 + .../devicetree/bindings/sound/renesas,rsnd.txt | 82 + .../bindings/sound/renesas,rsrc-card.txt | 4 +- 
.../devicetree/bindings/sound/rockchip-i2s.txt | 2 + Documentation/devicetree/bindings/sound/rt5616.txt | 26 + Documentation/devicetree/bindings/sound/rt5651.txt | 41 + Documentation/devicetree/bindings/sound/rt5659.txt | 75 + Documentation/devicetree/bindings/sound/rt5677.txt | 2 +- .../devicetree/bindings/sound/sun4i-codec.txt | 3 + .../devicetree/bindings/sound/ti,pcm3168a.txt | 48 + .../devicetree/bindings/sound/wlf,wm8974.txt | 15 + Documentation/devicetree/bindings/sound/wm8994.txt | 2 +- Documentation/devicetree/bindings/spi/sh-msiof.txt | 1 + .../devicetree/bindings/spi/spi-mt65xx.txt | 9 +- Documentation/devicetree/bindings/spi/ti_qspi.txt | 22 +- .../devicetree/bindings/sram/rockchip-pmu-sram.txt | 16 + .../devicetree/bindings/sram/rockchip-smp-sram.txt | 30 + .../devicetree/bindings/sram/samsung-sram.txt | 38 + Documentation/devicetree/bindings/sram/sram.txt | 67 + .../devicetree/bindings/sram/sunxi-sram.txt | 72 + .../devicetree/bindings/staging/ion/hi6220-ion.txt | 31 + .../devicetree/bindings/thermal/qoriq-thermal.txt | 63 + .../devicetree/bindings/thermal/rcar-thermal.txt | 37 +- .../bindings/thermal/rockchip-thermal.txt | 2 + .../bindings/timer/mediatek,mtk-timer.txt | 1 + Documentation/devicetree/bindings/usb/dwc2.txt | 1 + .../devicetree/bindings/usb/dwc3-xilinx.txt | 33 + .../devicetree/bindings/usb/mt8173-xhci.txt | 51 + .../devicetree/bindings/usb/octeon-usb.txt | 62 + .../devicetree/bindings/usb/renesas_usb3.txt | 23 + .../devicetree/bindings/usb/renesas_usbhs.txt | 22 +- Documentation/devicetree/bindings/usb/usb-xhci.txt | 4 +- Documentation/devicetree/bindings/usb/usb3503.txt | 5 +- .../devicetree/bindings/vendor-prefixes.txt | 8 + .../bindings/watchdog/alphascale-asm9260.txt | 35 + .../devicetree/bindings/watchdog/meson-wdt.txt | 13 + .../devicetree/bindings/watchdog/meson6-wdt.txt | 13 - .../devicetree/bindings/watchdog/mt7621-wdt.txt | 12 + .../devicetree/bindings/watchdog/mtk-wdt.txt | 6 +- .../bindings/watchdog/sigma,smp8642-wdt.txt | 
18 + .../devicetree/bindings/watchdog/sp805-wdt.txt | 31 + .../devicetree/bindings/watchdog/ts4800-wdt.txt | 25 + .../devicetree/bindings/watchdog/ziirave-wdt.txt | 19 + Documentation/dmaengine/client.txt | 59 +- Documentation/dmaengine/provider.txt | 20 +- Documentation/dvb/README.dvb-usb | 4 +- Documentation/dvb/faq.txt | 2 +- Documentation/dvb/readme.txt | 10 +- Documentation/edac.txt | 10 +- .../fault-injection/notifier-error-inject.txt | 25 + .../features/io/dma_map_attrs/arch-support.txt | 40 - .../seccomp/seccomp-filter/arch-support.txt | 2 +- .../features/time/irq-time-acct/arch-support.txt | 2 +- .../vm/pmdp_splitting_flush/arch-support.txt | 40 - Documentation/filesystems/Locking | 6 +- Documentation/filesystems/aufs/README | 391 --- Documentation/filesystems/aufs/design/01intro.txt | 157 -- Documentation/filesystems/aufs/design/02struct.txt | 245 -- .../filesystems/aufs/design/03atomic_open.txt | 72 - Documentation/filesystems/aufs/design/03lookup.txt | 100 - Documentation/filesystems/aufs/design/04branch.txt | 61 - .../filesystems/aufs/design/05wbr_policy.txt | 51 - Documentation/filesystems/aufs/design/06fhsm.txt | 105 - Documentation/filesystems/aufs/design/06mmap.txt | 59 - Documentation/filesystems/aufs/design/06xattr.txt | 81 - Documentation/filesystems/aufs/design/07export.txt | 45 - Documentation/filesystems/aufs/design/08shwh.txt | 39 - Documentation/filesystems/aufs/design/10dynop.txt | 34 - Documentation/filesystems/configfs/configfs.txt | 57 +- Documentation/filesystems/f2fs.txt | 10 +- Documentation/filesystems/porting | 21 + Documentation/filesystems/proc.txt | 38 +- Documentation/filesystems/sharedsubtree.txt | 2 +- Documentation/filesystems/tmpfs.txt | 8 +- Documentation/filesystems/vfat.txt | 10 + Documentation/filesystems/vfs.txt | 21 +- Documentation/gpio/consumer.txt | 2 +- Documentation/gpio/driver.txt | 6 +- Documentation/gpio/drivers-on-gpio.txt | 6 +- Documentation/hwmon/htu21 | 46 - Documentation/hwmon/ltc3815 | 61 + 
Documentation/iio/iio_configfs.txt | 93 + Documentation/infiniband/core_locking.txt | 2 - Documentation/ioctl/botching-up-ioctls.txt | 6 +- Documentation/ja_JP/HOWTO | 3 +- Documentation/kernel-docs.txt | 2 +- Documentation/kernel-parameters.txt | 156 +- Documentation/kernel-per-CPU-kthreads.txt | 2 +- Documentation/ko_KR/HOWTO | 29 +- Documentation/leds/leds-class.txt | 13 + Documentation/md-cluster.txt | 314 ++- Documentation/media-framework.txt | 372 --- Documentation/memory-barriers.txt | 40 +- Documentation/mtd/nand_ecc.txt | 58 +- Documentation/networking/batman-adv.txt | 9 +- Documentation/networking/can.txt | 9 + Documentation/networking/ip-sysctl.txt | 33 +- Documentation/networking/switchdev.txt | 8 +- Documentation/power/pci.txt | 2 +- Documentation/power/runtime_pm.txt | 6 + Documentation/power/tuxonice-internals.txt | 532 ---- Documentation/power/tuxonice.txt | 948 ------- Documentation/printk-formats.txt | 15 +- Documentation/s390/zfcpdump.txt | 22 +- .../scsi/link_power_management_policy.txt | 5 +- Documentation/security/keys-trusted-encrypted.txt | 31 +- Documentation/sound/alsa/img,spdif-in.txt | 49 + Documentation/spi/.gitignore | 2 - Documentation/spi/00-INDEX | 4 - Documentation/spi/Makefile | 8 - Documentation/spi/spidev_fdx.c | 158 -- Documentation/spi/spidev_test.c | 318 --- Documentation/stable_kernel_rules.txt | 2 +- Documentation/sysctl/fs.txt | 23 + Documentation/sysctl/kernel.txt | 38 +- Documentation/sysctl/vm.txt | 33 +- Documentation/thermal/sysfs-api.txt | 1 + Documentation/timers/hpet.txt | 4 +- Documentation/trace/events-msr.txt | 37 + Documentation/trace/postprocess/decode_msr.py | 37 + Documentation/ubsan.txt | 84 + Documentation/usb/chipidea.txt | 4 +- Documentation/usb/gadget-testing.txt | 4 +- Documentation/video4linux/API.html | 2 +- Documentation/video4linux/CARDLIST.em28xx | 4 +- Documentation/video4linux/fimc.txt | 6 +- Documentation/video4linux/omap4_camera.txt | 2 +- Documentation/video4linux/si4713.txt | 2 +- 
Documentation/video4linux/v4l2-framework.txt | 12 +- Documentation/video4linux/v4l2-pci-skeleton.c | 13 +- Documentation/virtual/kvm/api.txt | 43 +- Documentation/virtual/kvm/devices/vm.txt | 3 +- Documentation/virtual/kvm/mmu.txt | 7 +- Documentation/vm/slub.txt | 2 +- Documentation/vm/transhuge.txt | 151 +- Documentation/watchdog/watchdog-kernel-api.txt | 77 +- Documentation/watchdog/watchdog-parameters.txt | 4 + Documentation/zh_CN/video4linux/v4l2-framework.txt | 8 +- 437 files changed, 21199 insertions(+), 11355 deletions(-) create mode 100644 Documentation/ABI/testing/configfs-iio create mode 100644 Documentation/ABI/testing/configfs-rdma_cm create mode 100644 Documentation/ABI/testing/configfs-usb-gadget-tcm delete mode 100644 Documentation/ABI/testing/debugfs-aufs delete mode 100644 Documentation/ABI/testing/sysfs-aufs create mode 100644 Documentation/ABI/testing/sysfs-bus-iio-ina2xx-adc create mode 100644 Documentation/ABI/testing/sysfs-class-infiniband create mode 100644 Documentation/ABI/testing/sysfs-class-net-qmi create mode 100644 Documentation/ABI/testing/sysfs-class-watchdog create mode 100644 Documentation/DocBook/media/v4l/media-ioc-g-topology.xml create mode 100644 Documentation/DocBook/media/v4l/media-types.xml create mode 100644 Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png create mode 100644 Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg create mode 100644 Documentation/RCU/Design/Requirements/RCUApplicability.svg create mode 100644 Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg create mode 100644 Documentation/RCU/Design/Requirements/Requirements.html create mode 100644 Documentation/RCU/Design/Requirements/Requirements.htmlx create mode 100755 Documentation/RCU/Design/htmlqqz.sh create mode 100644 Documentation/arm64/silicon-errata.txt create mode 100644 Documentation/cgroup-v1/00-INDEX create mode 100644 Documentation/cgroup-v1/blkio-controller.txt create mode 100644 
Documentation/cgroup-v1/cgroups.txt create mode 100644 Documentation/cgroup-v1/cpuacct.txt create mode 100644 Documentation/cgroup-v1/cpusets.txt create mode 100644 Documentation/cgroup-v1/devices.txt create mode 100644 Documentation/cgroup-v1/freezer-subsystem.txt create mode 100644 Documentation/cgroup-v1/hugetlb.txt create mode 100644 Documentation/cgroup-v1/memcg_test.txt create mode 100644 Documentation/cgroup-v1/memory.txt create mode 100644 Documentation/cgroup-v1/net_cls.txt create mode 100644 Documentation/cgroup-v1/net_prio.txt create mode 100644 Documentation/cgroup-v1/pids.txt create mode 100644 Documentation/cgroup-v2.txt delete mode 100644 Documentation/cgroups/00-INDEX delete mode 100644 Documentation/cgroups/blkio-controller.txt delete mode 100644 Documentation/cgroups/cgroups.txt delete mode 100644 Documentation/cgroups/cpuacct.txt delete mode 100644 Documentation/cgroups/cpusets.txt delete mode 100644 Documentation/cgroups/devices.txt delete mode 100644 Documentation/cgroups/freezer-subsystem.txt delete mode 100644 Documentation/cgroups/hugetlb.txt delete mode 100644 Documentation/cgroups/memcg_test.txt delete mode 100644 Documentation/cgroups/memory.txt delete mode 100644 Documentation/cgroups/net_cls.txt delete mode 100644 Documentation/cgroups/net_prio.txt delete mode 100644 Documentation/cgroups/pids.txt delete mode 100644 Documentation/cgroups/unified-hierarchy.txt create mode 100644 Documentation/devicetree/bindings/arm/bcm/brcm,nsp-cpu-method.txt create mode 100644 Documentation/devicetree/bindings/arm/compulab-boards.txt delete mode 100644 Documentation/devicetree/bindings/arm/exynos/smp-sysram.txt create mode 100644 Documentation/devicetree/bindings/arm/l2c2x0.txt delete mode 100644 Documentation/devicetree/bindings/arm/l2cc.txt delete mode 100644 Documentation/devicetree/bindings/arm/rockchip/pmu-sram.txt delete mode 100644 Documentation/devicetree/bindings/arm/rockchip/smp-sram.txt create mode 100644 
Documentation/devicetree/bindings/arm/secure.txt create mode 100644 Documentation/devicetree/bindings/arm/technologic.txt create mode 100644 Documentation/devicetree/bindings/bus/uniphier-system-bus.txt create mode 100644 Documentation/devicetree/bindings/clock/arm-syscon-icst.txt create mode 100644 Documentation/devicetree/bindings/clock/brcm,bcm2835-aux-clock.txt create mode 100644 Documentation/devicetree/bindings/clock/cs2000-cp.txt create mode 100644 Documentation/devicetree/bindings/clock/dove-divider-clock.txt create mode 100644 Documentation/devicetree/bindings/clock/nvidia,tegra210-car.txt create mode 100644 Documentation/devicetree/bindings/clock/nxp,lpc3220-clk.txt create mode 100644 Documentation/devicetree/bindings/clock/nxp,lpc3220-usb-clk.txt create mode 100644 Documentation/devicetree/bindings/clock/rockchip,rk3036-cru.txt create mode 100644 Documentation/devicetree/bindings/clock/rockchip,rk3228-cru.txt create mode 100644 Documentation/devicetree/bindings/clock/samsung,s2mps11.txt create mode 100644 Documentation/devicetree/bindings/clock/tango4-clock.txt create mode 100644 Documentation/devicetree/bindings/cpufreq/cpufreq-st.txt create mode 100644 Documentation/devicetree/bindings/crypto/rockchip-crypto.txt create mode 100644 Documentation/devicetree/bindings/display/etnaviv/etnaviv-drm.txt create mode 100644 Documentation/devicetree/bindings/display/panel/boe,tv080wum-nl0.txt create mode 100644 Documentation/devicetree/bindings/display/panel/innolux,g121x1-l03.txt create mode 100644 Documentation/devicetree/bindings/display/panel/kyo,tcg121xglp.txt create mode 100644 Documentation/devicetree/bindings/display/panel/panasonic,vvx10f034n00.txt create mode 100644 Documentation/devicetree/bindings/display/panel/qiaodian,qd43003c0-40.txt create mode 100644 Documentation/devicetree/bindings/display/panel/sharp,ls043t1le01.txt create mode 100644 Documentation/devicetree/bindings/display/panel/startek,startek-kd050c.txt create mode 100644 
Documentation/devicetree/bindings/display/rockchip/dw_mipi_dsi_rockchip.txt create mode 100644 Documentation/devicetree/bindings/dma/stm32-dma.txt create mode 100644 Documentation/devicetree/bindings/extcon/extcon-max3355.txt create mode 100644 Documentation/devicetree/bindings/gpio/gpio-tps65086.txt create mode 100644 Documentation/devicetree/bindings/iio/adc/imx7d-adc.txt create mode 100644 Documentation/devicetree/bindings/iio/adc/palmas-gpadc.txt create mode 100644 Documentation/devicetree/bindings/iio/adc/ti-ads8688.txt create mode 100644 Documentation/devicetree/bindings/iio/health/max30100.txt delete mode 100644 Documentation/devicetree/bindings/interrupt-controller/allwinner,sun67i-sc-nmi.txt create mode 100644 Documentation/devicetree/bindings/interrupt-controller/allwinner,sunxi-nmi.txt create mode 100644 Documentation/devicetree/bindings/interrupt-controller/hisilicon,mbigen-v2.txt create mode 100644 Documentation/devicetree/bindings/interrupt-controller/microchip,pic32-evic.txt create mode 100644 Documentation/devicetree/bindings/interrupt-controller/technologic,ts4800.txt delete mode 100644 Documentation/devicetree/bindings/mfd/s2mpa01.txt delete mode 100644 Documentation/devicetree/bindings/mfd/s2mps11.txt create mode 100644 Documentation/devicetree/bindings/mfd/samsung,sec-core.txt create mode 100644 Documentation/devicetree/bindings/mips/pic32/microchip,pic32mzda.txt delete mode 100644 Documentation/devicetree/bindings/misc/sram.txt create mode 100644 Documentation/devicetree/bindings/mtd/ingenic,jz4780-nand.txt create mode 100644 Documentation/devicetree/bindings/mtd/mtk-quadspi.txt delete mode 100644 Documentation/devicetree/bindings/net/cdns-emac.txt create mode 100644 Documentation/devicetree/bindings/net/ieee802154/adf7242.txt create mode 100644 Documentation/devicetree/bindings/net/mediatek,mt7620-gsw.txt create mode 100644 Documentation/devicetree/bindings/net/nfc/st95hf.txt create mode 100644 
Documentation/devicetree/bindings/net/ralink,rt2880-net.txt create mode 100644 Documentation/devicetree/bindings/net/ralink,rt3050-esw.txt create mode 100644 Documentation/devicetree/bindings/pci/qcom,pcie.txt create mode 100644 Documentation/devicetree/bindings/phy/phy-ath79-usb.txt create mode 100644 Documentation/devicetree/bindings/phy/phy-hi6220-usb.txt create mode 100644 Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt delete mode 100644 Documentation/devicetree/bindings/pinctrl/brcm,cygnus-gpio.txt create mode 100644 Documentation/devicetree/bindings/pinctrl/brcm,iproc-gpio.txt create mode 100644 Documentation/devicetree/bindings/pinctrl/brcm,nsp-gpio.txt create mode 100644 Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt create mode 100644 Documentation/devicetree/bindings/pwm/pwm-omap-dmtimer.txt create mode 100644 Documentation/devicetree/bindings/regulator/lm363x-regulator.txt create mode 100644 Documentation/devicetree/bindings/regulator/pv88060.txt create mode 100644 Documentation/devicetree/bindings/regulator/pv88090.txt create mode 100644 Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.txt delete mode 100644 Documentation/devicetree/bindings/regulator/s5m8767-regulator.txt create mode 100644 Documentation/devicetree/bindings/regulator/samsung,s2mpa01.txt create mode 100644 Documentation/devicetree/bindings/regulator/samsung,s2mps11.txt create mode 100644 Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt create mode 100644 Documentation/devicetree/bindings/reset/hisilicon,hi6220-reset.txt create mode 100644 Documentation/devicetree/bindings/scsi/hisilicon-sas.txt create mode 100644 Documentation/devicetree/bindings/soc/bcm/raspberrypi,bcm2835-power.txt create mode 100644 Documentation/devicetree/bindings/soc/dove/pmu.txt delete mode 100644 Documentation/devicetree/bindings/soc/qcom,smd-rpm.txt create mode 100644 Documentation/devicetree/bindings/soc/qcom/qcom,smd-rpm.txt create mode 100644 
Documentation/devicetree/bindings/soc/qcom/qcom,smp2p.txt create mode 100644 Documentation/devicetree/bindings/soc/qcom/qcom,smsm.txt delete mode 100644 Documentation/devicetree/bindings/soc/sunxi/sram.txt create mode 100644 Documentation/devicetree/bindings/soc/ti/wkup_m3_ipc.txt create mode 100644 Documentation/devicetree/bindings/sound/atmel-pdmic.txt create mode 100644 Documentation/devicetree/bindings/sound/da7218.txt create mode 100644 Documentation/devicetree/bindings/sound/img,i2s-in.txt create mode 100644 Documentation/devicetree/bindings/sound/img,i2s-out.txt create mode 100644 Documentation/devicetree/bindings/sound/img,parallel-out.txt create mode 100644 Documentation/devicetree/bindings/sound/img,pistachio-internal-dac.txt create mode 100644 Documentation/devicetree/bindings/sound/img,spdif-in.txt create mode 100644 Documentation/devicetree/bindings/sound/img,spdif-out.txt create mode 100644 Documentation/devicetree/bindings/sound/inno-rk3036.txt delete mode 100644 Documentation/devicetree/bindings/sound/pcm1792a.txt create mode 100644 Documentation/devicetree/bindings/sound/pcm179x.txt create mode 100644 Documentation/devicetree/bindings/sound/rt5616.txt create mode 100644 Documentation/devicetree/bindings/sound/rt5651.txt create mode 100644 Documentation/devicetree/bindings/sound/rt5659.txt create mode 100644 Documentation/devicetree/bindings/sound/ti,pcm3168a.txt create mode 100644 Documentation/devicetree/bindings/sound/wlf,wm8974.txt create mode 100644 Documentation/devicetree/bindings/sram/rockchip-pmu-sram.txt create mode 100644 Documentation/devicetree/bindings/sram/rockchip-smp-sram.txt create mode 100644 Documentation/devicetree/bindings/sram/samsung-sram.txt create mode 100644 Documentation/devicetree/bindings/sram/sram.txt create mode 100644 Documentation/devicetree/bindings/sram/sunxi-sram.txt create mode 100644 Documentation/devicetree/bindings/staging/ion/hi6220-ion.txt create mode 100644 
Documentation/devicetree/bindings/thermal/qoriq-thermal.txt create mode 100644 Documentation/devicetree/bindings/usb/dwc3-xilinx.txt create mode 100644 Documentation/devicetree/bindings/usb/mt8173-xhci.txt create mode 100644 Documentation/devicetree/bindings/usb/octeon-usb.txt create mode 100644 Documentation/devicetree/bindings/usb/renesas_usb3.txt create mode 100644 Documentation/devicetree/bindings/watchdog/alphascale-asm9260.txt create mode 100644 Documentation/devicetree/bindings/watchdog/meson-wdt.txt delete mode 100644 Documentation/devicetree/bindings/watchdog/meson6-wdt.txt create mode 100644 Documentation/devicetree/bindings/watchdog/mt7621-wdt.txt create mode 100644 Documentation/devicetree/bindings/watchdog/sigma,smp8642-wdt.txt create mode 100644 Documentation/devicetree/bindings/watchdog/sp805-wdt.txt create mode 100644 Documentation/devicetree/bindings/watchdog/ts4800-wdt.txt create mode 100644 Documentation/devicetree/bindings/watchdog/ziirave-wdt.txt delete mode 100644 Documentation/features/io/dma_map_attrs/arch-support.txt delete mode 100644 Documentation/features/vm/pmdp_splitting_flush/arch-support.txt delete mode 100644 Documentation/filesystems/aufs/README delete mode 100644 Documentation/filesystems/aufs/design/01intro.txt delete mode 100644 Documentation/filesystems/aufs/design/02struct.txt delete mode 100644 Documentation/filesystems/aufs/design/03atomic_open.txt delete mode 100644 Documentation/filesystems/aufs/design/03lookup.txt delete mode 100644 Documentation/filesystems/aufs/design/04branch.txt delete mode 100644 Documentation/filesystems/aufs/design/05wbr_policy.txt delete mode 100644 Documentation/filesystems/aufs/design/06fhsm.txt delete mode 100644 Documentation/filesystems/aufs/design/06mmap.txt delete mode 100644 Documentation/filesystems/aufs/design/06xattr.txt delete mode 100644 Documentation/filesystems/aufs/design/07export.txt delete mode 100644 Documentation/filesystems/aufs/design/08shwh.txt delete mode 100644 
Documentation/filesystems/aufs/design/10dynop.txt delete mode 100644 Documentation/hwmon/htu21 create mode 100644 Documentation/hwmon/ltc3815 create mode 100644 Documentation/iio/iio_configfs.txt delete mode 100644 Documentation/media-framework.txt delete mode 100644 Documentation/power/tuxonice-internals.txt delete mode 100644 Documentation/power/tuxonice.txt create mode 100644 Documentation/sound/alsa/img,spdif-in.txt delete mode 100644 Documentation/spi/.gitignore delete mode 100644 Documentation/spi/Makefile delete mode 100644 Documentation/spi/spidev_fdx.c delete mode 100644 Documentation/spi/spidev_test.c create mode 100644 Documentation/trace/events-msr.txt create mode 100644 Documentation/trace/postprocess/decode_msr.py create mode 100644 Documentation/ubsan.txt (limited to 'Documentation') diff --git a/Documentation/ABI/testing/configfs-iio b/Documentation/ABI/testing/configfs-iio new file mode 100644 index 000000000..2483756fc --- /dev/null +++ b/Documentation/ABI/testing/configfs-iio @@ -0,0 +1,21 @@ +What: /config/iio +Date: October 2015 +KernelVersion: 4.4 +Contact: linux-iio@vger.kernel.org +Description: + This represents Industrial IO configuration entry point + directory. It contains sub-groups corresponding to IIO + objects. + +What: /config/iio/triggers +Date: October 2015 +KernelVersion: 4.4 +Description: + Industrial IO software triggers directory. + +What: /config/iio/triggers/hrtimers +Date: October 2015 +KernelVersion: 4.4 +Description: + High resolution timers directory. Creating a directory here + will result in creating a hrtimer trigger in the IIO subsystem. 
diff --git a/Documentation/ABI/testing/configfs-rdma_cm b/Documentation/ABI/testing/configfs-rdma_cm new file mode 100644 index 000000000..5c389aaf5 --- /dev/null +++ b/Documentation/ABI/testing/configfs-rdma_cm @@ -0,0 +1,22 @@ +What: /config/rdma_cm +Date: November 29, 2015 +KernelVersion: 4.4.0 +Description: Interface is used to configure RDMA-capable HCAs in respect to + RDMA-CM attributes. + + Attributes are visible only when configfs is mounted. To mount + configfs in /config directory use: + # mount -t configfs none /config/ + + In order to set parameters related to a specific HCA, a directory + for this HCA has to be created: + mkdir -p /config/rdma_cm/<hca> + + +What: /config/rdma_cm/<hca>/ports/<port-num>/default_roce_mode +Date: November 29, 2015 +KernelVersion: 4.4.0 +Description: RDMA-CM based connections from HCA <hca> at port <port-num> + will be initiated with this RoCE type as default. + The possible RoCE types are either "IB/RoCE v1" or "RoCE v2". + This parameter has RW access. diff --git a/Documentation/ABI/testing/configfs-usb-gadget-sourcesink b/Documentation/ABI/testing/configfs-usb-gadget-sourcesink index bc7ff731a..f56335af2 100644 --- a/Documentation/ABI/testing/configfs-usb-gadget-sourcesink +++ b/Documentation/ABI/testing/configfs-usb-gadget-sourcesink @@ -10,3 +10,5 @@ Description: isoc_mult - 0..2 (hs/ss only) isoc_maxburst - 0..15 (ss only) buflen - buffer length + bulk_qlen - depth of queue for bulk + iso_qlen - depth of queue for iso diff --git a/Documentation/ABI/testing/configfs-usb-gadget-tcm b/Documentation/ABI/testing/configfs-usb-gadget-tcm new file mode 100644 index 000000000..a29ed2dd6 --- /dev/null +++ b/Documentation/ABI/testing/configfs-usb-gadget-tcm @@ -0,0 +1,6 @@ +What: /config/usb-gadget/gadget/functions/tcm.name +Date: Dec 2015 +KernelVersion: 4.5 +Description: + There are no attributes because all the configuration + is performed in the "target" subsystem of configfs.
diff --git a/Documentation/ABI/testing/debugfs-aufs b/Documentation/ABI/testing/debugfs-aufs deleted file mode 100644 index 99642d105..000000000 --- a/Documentation/ABI/testing/debugfs-aufs +++ /dev/null @@ -1,50 +0,0 @@ -What: /debug/aufs/si_/ -Date: March 2009 -Contact: J. R. Okajima -Description: - Under /debug/aufs, a directory named si_ is created - per aufs mount, where is a unique id generated - internally. - -What: /debug/aufs/si_/plink -Date: Apr 2013 -Contact: J. R. Okajima -Description: - It has three lines and shows the information about the - pseudo-link. The first line is a single number - representing a number of buckets. The second line is a - number of pseudo-links per buckets (separated by a - blank). The last line is a single number representing a - total number of psedo-links. - When the aufs mount option 'noplink' is specified, it - will show "1\n0\n0\n". - -What: /debug/aufs/si_/xib -Date: March 2009 -Contact: J. R. Okajima -Description: - It shows the consumed blocks by xib (External Inode Number - Bitmap), its block size and file size. - When the aufs mount option 'noxino' is specified, it - will be empty. About XINO files, see the aufs manual. - -What: /debug/aufs/si_/xino0, xino1 ... xinoN -Date: March 2009 -Contact: J. R. Okajima -Description: - It shows the consumed blocks by xino (External Inode Number - Translation Table), its link count, block size and file - size. - When the aufs mount option 'noxino' is specified, it - will be empty. About XINO files, see the aufs manual. - -What: /debug/aufs/si_/xigen -Date: March 2009 -Contact: J. R. Okajima -Description: - It shows the consumed blocks by xigen (External Inode - Generation Table), its block size and file size. - If CONFIG_AUFS_EXPORT is disabled, this entry will not - be created. - When the aufs mount option 'noxino' is specified, it - will be empty. About XINO files, see the aufs manual. 
diff --git a/Documentation/ABI/testing/sysfs-aufs b/Documentation/ABI/testing/sysfs-aufs deleted file mode 100644 index 82f951849..000000000 --- a/Documentation/ABI/testing/sysfs-aufs +++ /dev/null @@ -1,31 +0,0 @@ -What: /sys/fs/aufs/si_/ -Date: March 2009 -Contact: J. R. Okajima -Description: - Under /sys/fs/aufs, a directory named si_ is created - per aufs mount, where is a unique id generated - internally. - -What: /sys/fs/aufs/si_/br0, br1 ... brN -Date: March 2009 -Contact: J. R. Okajima -Description: - It shows the abolute path of a member directory (which - is called branch) in aufs, and its permission. - -What: /sys/fs/aufs/si_/brid0, brid1 ... bridN -Date: July 2013 -Contact: J. R. Okajima -Description: - It shows the id of a member directory (which is called - branch) in aufs. - -What: /sys/fs/aufs/si_/xi_path -Date: March 2009 -Contact: J. R. Okajima -Description: - It shows the abolute path of XINO (External Inode Number - Bitmap, Translation Table and Generation Table) file - even if it is the default path. - When the aufs mount option 'noxino' is specified, it - will be empty. About XINO files, see the aufs manual. diff --git a/Documentation/ABI/testing/sysfs-bus-iio-ina2xx-adc b/Documentation/ABI/testing/sysfs-bus-iio-ina2xx-adc new file mode 100644 index 000000000..8916f7ec6 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-iio-ina2xx-adc @@ -0,0 +1,24 @@ +What: /sys/bus/iio/devices/iio:deviceX/in_allow_async_readout +Date: December 2015 +KernelVersion: 4.4 +Contact: linux-iio@vger.kernel.org +Description: + By default (value '0'), the capture thread checks for the Conversion + Ready Flag to being set prior to committing a new value to the sample + buffer. This synchronizes the in-chip conversion rate with the + in-driver readout rate at the cost of an additional register read. 
+ + Writing '1' will remove the polling for the Conversion Ready Flags to + save the additional i2c transaction, which will improve the bandwidth + available for reading data. However, samples can be occasionally skipped + or repeated, depending on the beat between the capture and conversion + rates. + +What: /sys/bus/iio/devices/iio:deviceX/in_shunt_resistor +Date: December 2015 +KernelVersion: 4.4 +Contact: linux-iio@vger.kernel.org +Description: + The value of the shunt resistor may be known only at runtime from an + eeprom content read by a client application. This attribute allows + setting its value in ohms. diff --git a/Documentation/ABI/testing/sysfs-bus-usb b/Documentation/ABI/testing/sysfs-bus-usb index 136ba17d2..0bd731cbb 100644 --- a/Documentation/ABI/testing/sysfs-bus-usb +++ b/Documentation/ABI/testing/sysfs-bus-usb @@ -189,6 +189,17 @@ Description: The file will read "hotplug", "wired" and "not used" if the information is available, and "unknown" otherwise. +What: /sys/bus/usb/devices/.../(hub interface)/portX/usb3_lpm_permit +Date: November 2015 +Contact: Lu Baolu +Description: + Some USB3.0 devices are not friendly to USB3 LPM. usb3_lpm_permit + attribute allows enabling/disabling usb3 lpm of a port. It takes + effect both before and after a usb device is enumerated. Supported + values are "0" if both u1 and u2 are NOT permitted, "u1" if only u1 + is permitted, "u2" if only u2 is permitted, "u1_u2" if both u1 and + u2 are permitted.
+ What: /sys/bus/usb/devices/.../power/usb2_lpm_l1_timeout Date: May 2013 Contact: Mathias Nyman diff --git a/Documentation/ABI/testing/sysfs-class-infiniband b/Documentation/ABI/testing/sysfs-class-infiniband new file mode 100644 index 000000000..a86abe66a --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-infiniband @@ -0,0 +1,16 @@ +What: /sys/class/infiniband/<hca>/ports/<port-number>/gid_attrs/ndevs/<gid-index> +Date: November 29, 2015 +KernelVersion: 4.4.0 +Contact: linux-rdma@vger.kernel.org +Description: The net-device's name associated with the GID resides + at index <gid-index>. + +What: /sys/class/infiniband/<hca>/ports/<port-number>/gid_attrs/types/<gid-index> +Date: November 29, 2015 +KernelVersion: 4.4.0 +Contact: linux-rdma@vger.kernel.org +Description: The RoCE type of the associated GID resides at index <gid-index>. + This could either be "IB/RoCE v1" for IB and RoCE v1 based GIDs + or "RoCE v2" for RoCE v2 based GIDs. + + diff --git a/Documentation/ABI/testing/sysfs-class-net-cdc_ncm b/Documentation/ABI/testing/sysfs-class-net-cdc_ncm index 5cedf72df..f7be0e88b 100644 --- a/Documentation/ABI/testing/sysfs-class-net-cdc_ncm +++ b/Documentation/ABI/testing/sysfs-class-net-cdc_ncm @@ -19,6 +19,25 @@ Description: Set to 0 to pad all frames. Set greater than tx_max to disable all padding. +What: /sys/class/net//cdc_ncm/ndp_to_end +Date: Dec 2015 +KernelVersion: 4.5 +Contact: Bjørn Mork +Description: + Boolean attribute showing the status of the "NDP to + end" quirk. Defaults to 'N', except for devices + already known to need it enabled. + + The "NDP to end" quirk makes the driver place the NDP + (the packet index table) after the payload. The NCM + specification does not mandate this, but some devices + are known to be more restrictive. Write 'Y' to this + attribute for temporary testing of a suspect device + failing to work with the default driver settings. + + A device entry should be added to the driver if this + quirk is found to be required.
+ What: /sys/class/net//cdc_ncm/rx_max Date: May 2014 KernelVersion: 3.16 diff --git a/Documentation/ABI/testing/sysfs-class-net-mesh b/Documentation/ABI/testing/sysfs-class-net-mesh index c46406296..c2b956d44 100644 --- a/Documentation/ABI/testing/sysfs-class-net-mesh +++ b/Documentation/ABI/testing/sysfs-class-net-mesh @@ -8,7 +8,7 @@ Description: What: /sys/class/net//mesh//ap_isolation Date: May 2011 -Contact: Antonio Quartulli +Contact: Antonio Quartulli Description: Indicates whether the data traffic going from a wireless client to another wireless client will be @@ -70,7 +70,7 @@ Description: What: /sys/class/net//mesh/isolation_mark Date: Nov 2013 -Contact: Antonio Quartulli +Contact: Antonio Quartulli Description: Defines the isolation mark (and its bitmask) which is used to classify clients as "isolated" by the diff --git a/Documentation/ABI/testing/sysfs-class-net-qmi b/Documentation/ABI/testing/sysfs-class-net-qmi new file mode 100644 index 000000000..fa5a00bb1 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-net-qmi @@ -0,0 +1,23 @@ +What: /sys/class/net//qmi/raw_ip +Date: Dec 2015 +KernelVersion: 4.4 +Contact: Bjørn Mork +Description: + Boolean. Default: 'N' + + Set this to 'Y' to change the network device link + framing from '802.3' to 'raw-ip'. + + The netdev will change to reflect the link framing + mode. The netdev is an ordinary ethernet device in + '802.3' mode, and the driver expects to exchange + frames with an ethernet header over the USB link. The + netdev is a headerless p-t-p device in 'raw-ip' mode, + and the driver expects to exchange IPv4 or IPv6 packets + without any L2 header over the USB link. + + Userspace is in full control of firmware configuration + through the delegation of the QMI protocol. Userspace + is responsible for coordination of driver and firmware + link framing mode, changing this setting to 'Y' if the + firmware is configured for 'raw-ip' mode.
diff --git a/Documentation/ABI/testing/sysfs-class-watchdog b/Documentation/ABI/testing/sysfs-class-watchdog new file mode 100644 index 000000000..736046b33 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-watchdog @@ -0,0 +1,51 @@ +What: /sys/class/watchdog/watchdogn/bootstatus +Date: August 2015 +Contact: Wim Van Sebroeck +Description: + It is a read only file. It contains status of the watchdog + device at boot. It is equivalent to WDIOC_GETBOOTSTATUS of + ioctl interface. + +What: /sys/class/watchdog/watchdogn/identity +Date: August 2015 +Contact: Wim Van Sebroeck +Description: + It is a read only file. It contains identity string of + watchdog device. + +What: /sys/class/watchdog/watchdogn/nowayout +Date: August 2015 +Contact: Wim Van Sebroeck +Description: + It is a read only file. While reading, it gives '1' if that + device supports nowayout feature else, it gives '0'. + +What: /sys/class/watchdog/watchdogn/state +Date: August 2015 +Contact: Wim Van Sebroeck +Description: + It is a read only file. It gives active/inactive status of + watchdog device. + +What: /sys/class/watchdog/watchdogn/status +Date: August 2015 +Contact: Wim Van Sebroeck +Description: + It is a read only file. It contains watchdog device's + internal status bits. It is equivalent to WDIOC_GETSTATUS + of ioctl interface. + +What: /sys/class/watchdog/watchdogn/timeleft +Date: August 2015 +Contact: Wim Van Sebroeck +Description: + It is a read only file. It contains value of time left for + reset generation. It is equivalent to WDIOC_GETTIMELEFT of + ioctl interface. + +What: /sys/class/watchdog/watchdogn/timeout +Date: August 2015 +Contact: Wim Van Sebroeck +Description: + It is a read only file. It is read to know about current + value of timeout programmed. 
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 0345f2d1c..e5200f354 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -87,6 +87,12 @@ Contact: "Jaegeuk Kim" Description: Controls the checkpoint timing. +What: /sys/fs/f2fs//idle_interval +Date: January 2016 +Contact: "Jaegeuk Kim" +Description: + Controls the idle timing. + What: /sys/fs/f2fs//ra_nid_pages Date: October 2015 Contact: "Chao Yu" diff --git a/Documentation/ABI/testing/sysfs-kernel-livepatch b/Documentation/ABI/testing/sysfs-kernel-livepatch index 5bf42a840..da87f43ae 100644 --- a/Documentation/ABI/testing/sysfs-kernel-livepatch +++ b/Documentation/ABI/testing/sysfs-kernel-livepatch @@ -33,7 +33,7 @@ Description: The object directory contains subdirectories for each function that is patched within the object. -What: /sys/kernel/livepatch/// +What: /sys/kernel/livepatch/// Date: Nov 2014 KernelVersion: 3.19.0 Contact: live-patching@vger.kernel.org @@ -41,4 +41,8 @@ Description: The function directory contains attributes regarding the properties and state of the patched function. + The directory name contains the patched function name and a + sympos number corresponding to the nth occurrence of the symbol + name in kallsyms for the patched object. + There are currently no such attributes. diff --git a/Documentation/ABI/testing/sysfs-ptp b/Documentation/ABI/testing/sysfs-ptp index 44806a678..a17f817a9 100644 --- a/Documentation/ABI/testing/sysfs-ptp +++ b/Documentation/ABI/testing/sysfs-ptp @@ -74,7 +74,7 @@ Description: assignment may be changed by two writing numbers into the file. 
-What: /sys/class/ptp/ptpN/pps_avaiable +What: /sys/class/ptp/ptpN/pps_available Date: September 2010 Contact: Richard Cochran Description: diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle index c06f817b3..db653774c 100644 --- a/Documentation/CodingStyle +++ b/Documentation/CodingStyle @@ -430,7 +430,7 @@ The rationale for using gotos is: return result; } -A common type of bug to be aware of it "one err bugs" which look like this: +A common type of bug to be aware of is "one err bugs" which look like this: err: kfree(foo->bar); diff --git a/Documentation/DMA-API-HOWTO.txt b/Documentation/DMA-API-HOWTO.txt index d69b3fc64..781024ef9 100644 --- a/Documentation/DMA-API-HOWTO.txt +++ b/Documentation/DMA-API-HOWTO.txt @@ -951,16 +951,6 @@ to "Closing". alignment constraints (e.g. the alignment constraints about 64-bit objects). -3) Supporting multiple types of IOMMUs - - If your architecture needs to support multiple types of IOMMUs, you - can use include/linux/asm-generic/dma-mapping-common.h. It's a - library to support the DMA API with multiple types of IOMMUs. Lots - of architectures (x86, powerpc, sh, alpha, ia64, microblaze and - sparc) use it. Choose one to see how it can be used. If you need to - support multiple types of IOMMUs in a single system, the example of - x86 or powerpc helps. - Closing This document, and the API itself, would not be in its current diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index 1e98a7e6b..45ef3f279 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -236,7 +236,7 @@ are guaranteed also to be cache line boundaries). DMA_TO_DEVICE synchronisation must be done after the last modification of the memory region by the software and before it is handed off to -the driver. Once this primitive is used, memory covered by this +the device. Once this primitive is used, memory covered by this primitive should be treated as read-only by the device. 
If the device may write to it at any point, it should be DMA_BIDIRECTIONAL (see below). diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index 91f6d89bb..d70f9b681 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile @@ -50,8 +50,7 @@ pdfdocs: $(PDF) HTML := $(sort $(patsubst %.xml, %.html, $(BOOKS))) htmldocs: $(HTML) - $(call build_main_index) - $(call build_images) + $(call cmd,build_main_index) $(call install_media_images) MAN := $(patsubst %.xml, %.9, $(BOOKS)) @@ -139,7 +138,8 @@ quiet_cmd_db2pdf = PDF $@ index = index.html main_idx = $(obj)/$(index) -build_main_index = rm -rf $(main_idx); \ +quiet_cmd_build_main_index = HTML $(main_idx) + cmd_build_main_index = rm -rf $(main_idx); \ echo '

Linux Kernel HTML Documentation

' >> $(main_idx) && \ echo '

Kernel Version: $(KERNELVERSION)

' >> $(main_idx) && \ cat $(HTML) >> $(main_idx) @@ -227,6 +227,10 @@ dochelp: @echo ' mandocs - man pages' @echo ' installmandocs - install man pages generated by mandocs' @echo ' cleandocs - clean all generated DocBook files' + @echo + @echo 'make DOCBOOKS="s1.xml s2.xml" [target] Generate only docs s1.xml s2.xml' + @echo ' valid values for DOCBOOKS are: $(DOCBOOKS)' + ### # Temporary files left by various tools diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl index 42a2d8593..cdd8b24db 100644 --- a/Documentation/DocBook/device-drivers.tmpl +++ b/Documentation/DocBook/device-drivers.tmpl @@ -238,83 +238,32 @@ X!Isound/sound_firmware.c !Iinclude/media/videobuf2-memops.h Digital TV (DVB) devices -!Idrivers/media/dvb-core/dvb_ca_en50221.h -!Idrivers/media/dvb-core/dvb_frontend.h + Digital TV Common functions !Idrivers/media/dvb-core/dvb_math.h !Idrivers/media/dvb-core/dvb_ringbuffer.h !Idrivers/media/dvb-core/dvbdev.h - Digital TV Demux API - The kernel demux API defines a driver-internal interface for - registering low-level, hardware specific driver to a hardware - independent demux layer. It is only of interest for Digital TV - device driver writers. The header file for this API is named - demux.h and located in - drivers/media/dvb-core. - - The demux API should be implemented for each demux in the - system. It is used to select the TS source of a demux and to manage - the demux resources. When the demux client allocates a resource via - the demux API, it receives a pointer to the API of that - resource. - Each demux receives its TS input from a DVB front-end or from - memory, as set via this demux API. In a system with more than one - front-end, the API can be used to select one of the DVB front-ends - as a TS source for a demux, unless this is fixed in the HW platform. 
- The demux API only controls front-ends regarding to their connections - with demuxes; the APIs used to set the other front-end parameters, - such as tuning, are not defined in this document. - The functions that implement the abstract interface demux should - be defined static or module private and registered to the Demux - core for external access. It is not necessary to implement every - function in the struct dmx_demux. For example, - a demux interface might support Section filtering, but not PES - filtering. The API client is expected to check the value of any - function pointer before calling the function: the value of NULL means - that the “function is not available”. - Whenever the functions of the demux API modify shared data, - the possibilities of lost update and race condition problems should - be addressed, e.g. by protecting parts of code with mutexes. - Note that functions called from a bottom half context must not - sleep. Even a simple memory allocation without using GFP_ATOMIC can - result in a kernel thread being put to sleep if swapping is needed. - For example, the Linux kernel calls the functions of a network device - interface from a bottom half context. Thus, if a demux API function - is called from network device code, the function must not sleep. - - - -
- Demux Callback API - This kernel-space API comprises the callback functions that - deliver filtered data to the demux client. Unlike the other DVB - kABIs, these functions are provided by the client and called from - the demux code. - The function pointers of this abstract interface are not - packed into a structure as in the other demux APIs, because the - callback functions are registered and used independent of each - other. As an example, it is possible for the API client to provide - several callback functions for receiving TS packets and no - callbacks for PES packets or sections. - The functions that implement the callback API need not be - re-entrant: when a demux driver calls one of these functions, - the driver is not allowed to call the function again before - the original call returns. If a callback is triggered by a - hardware interrupt, it is recommended to use the Linux - “bottom half” mechanism or start a tasklet instead of - making the callback function call directly from a hardware - interrupt. - This mechanism is implemented by - dmx_ts_cb() and - dmx_section_cb(). -
- +
+ Digital TV Frontend kABI +!Pdrivers/media/dvb-core/dvb_frontend.h Digital TV Frontend +!Idrivers/media/dvb-core/dvb_frontend.h + + Digital TV Demux kABI +!Pdrivers/media/dvb-core/demux.h Digital TV Demux + Demux Callback API +!Pdrivers/media/dvb-core/demux.h Demux Callback + !Idrivers/media/dvb-core/demux.h - +
+ Digital TV Conditional Access kABI +!Idrivers/media/dvb-core/dvb_ca_en50221.h + + Remote Controller devices !Iinclude/media/rc-core.h !Iinclude/media/lirc_dev.h Media Controller devices +!Pinclude/media/media-device.h Media Controller !Iinclude/media/media-device.h !Iinclude/media/media-devnode.h !Iinclude/media/media-entity.h diff --git a/Documentation/DocBook/gpu.tmpl b/Documentation/DocBook/gpu.tmpl index 201dcd3c2..a8669330b 100644 --- a/Documentation/DocBook/gpu.tmpl +++ b/Documentation/DocBook/gpu.tmpl @@ -124,6 +124,43 @@ [Insert diagram of typical DRM stack here] + + Style Guidelines + + For consistency this documentation uses American English. Abbreviations + are written as all-uppercase, for example: DRM, KMS, IOCTL, CRTC, and so + on. To aid in reading, documentations make full use of the markup + characters kerneldoc provides: @parameter for function parameters, @member + for structure members, &structure to reference structures and + function() for functions. These all get automatically hyperlinked if + kerneldoc for the referenced objects exists. When referencing entries in + function vtables please use ->vfunc(). Note that kerneldoc does + not support referencing struct members directly, so please add a reference + to the vtable struct somewhere in the same paragraph or at least section. + + + Except in special situations (to separate locked from unlocked variants) + locking requirements for functions aren't documented in the kerneldoc. + Instead locking should be checked at runtime using e.g. + WARN_ON(!mutex_is_locked(...));. Since it's much easier to + ignore documentation than runtime noise this provides more value. And on + top of that runtime checks do need to be updated when the locking rules + change, increasing the chances that they're correct.
Within the + documentation the locking rules should be explained in the relevant + structures: Either in the comment for the lock explaining what it + protects, or data fields need a note about which lock protects them, or + both. + + + Functions which have a non-void return value should have a + section called "Returns" explaining the expected return values in + different cases and their meanings. Currently there's no consensus whether + that section name should be all upper-case or not, and whether it should + end in a colon or not. Go with the file-local style. Other common section + names are "Notes" with information for dangerous or tricky corner cases, + and "FIXME" where the interface could be cleaned up. + + @@ -615,18 +652,6 @@ char *date; drm_gem_object_init. Storage for private GEM objects must be managed by drivers. - - Drivers that do not need to extend GEM objects with private information - can call the drm_gem_object_alloc function to - allocate and initialize a struct drm_gem_object - instance. The GEM core will call the optional driver - gem_init_object operation after initializing - the GEM object with drm_gem_object_init. - int (*gem_init_object) (struct drm_gem_object *obj); - - - No alloc-and-init function exists for private GEM objects. - GEM Objects Lifetime @@ -635,10 +660,10 @@ char *date; acquired and release by calling drm_gem_object_reference and drm_gem_object_unreference respectively. The caller must hold the drm_device - struct_mutex lock. As a convenience, GEM - provides the drm_gem_object_reference_unlocked and - drm_gem_object_unreference_unlocked functions that - can be called without holding the lock. + struct_mutex lock when calling + drm_gem_object_reference. As a convenience, GEM + provides drm_gem_object_unreference_unlocked + functions that can be called without holding the lock. 
When the last reference to a GEM object is released the GEM core calls @@ -649,15 +674,9 @@ char *date; void (*gem_free_object) (struct drm_gem_object *obj); - Drivers are responsible for freeing all GEM object resources, including - the resources created by the GEM core. If an mmap offset has been - created for the object (in which case - drm_gem_object::map_list::map - is not NULL) it must be freed by a call to - drm_gem_free_mmap_offset. The shmfs backing store - must be released by calling drm_gem_object_release - (that function can safely be called if no shmfs backing store has been - created). + Drivers are responsible for freeing all GEM object resources. This includes + the resources created by the GEM core, which need to be released with + drm_gem_object_release. @@ -740,17 +759,10 @@ char *date; DRM identifies the GEM object to be mapped by a fake offset passed through the mmap offset argument. Prior to being mapped, a GEM object must thus be associated with a fake offset. To do so, drivers must call - drm_gem_create_mmap_offset on the object. The - function allocates a fake offset range from a pool and stores the - offset divided by PAGE_SIZE in - obj->map_list.hash.key. Care must be taken not to - call drm_gem_create_mmap_offset if a fake offset - has already been allocated for the object. This can be tested by - obj->map_list.map being non-NULL. + drm_gem_create_mmap_offset on the object. Once allocated, the fake offset value - (obj->map_list.hash.key << PAGE_SHIFT) must be passed to the application in a driver-specific way and can then be used as the mmap offset argument. @@ -836,10 +848,11 @@ char *date; abstracted from the client in libdrm. 
- - GEM Function Reference + + + GEM Function Reference !Edrivers/gpu/drm/drm_gem.c - +!Iinclude/drm/drm_gem.h VMA Offset Manager @@ -970,12 +983,10 @@ int max_width, max_height; Atomic Mode Setting Function Reference !Edrivers/gpu/drm/drm_atomic.c +!Idrivers/gpu/drm/drm_atomic.c - Frame Buffer Creation - struct drm_framebuffer *(*fb_create)(struct drm_device *dev, - struct drm_file *file_priv, - struct drm_mode_fb_cmd2 *mode_cmd); + Frame Buffer Abstraction Frame buffers are abstract memory objects that provide a source of pixels to scanout to a CRTC. Applications explicitly request the @@ -993,73 +1004,6 @@ int max_width, max_height; handles, e.g. vmwgfx directly exposes special TTM handles to userspace and so expects TTM handles in the create ioctl and not GEM handles. - - Drivers must first validate the requested frame buffer parameters passed - through the mode_cmd argument. In particular this is where invalid - sizes, pixel formats or pitches can be caught. - - - If the parameters are deemed valid, drivers then create, initialize and - return an instance of struct drm_framebuffer. - If desired the instance can be embedded in a larger driver-specific - structure. Drivers must fill its width, - height, pitches, - offsets, depth, - bits_per_pixel and - pixel_format fields from the values passed - through the drm_mode_fb_cmd2 argument. They - should call the drm_helper_mode_fill_fb_struct - helper function to do so. - - - - The initialization of the new framebuffer instance is finalized with a - call to drm_framebuffer_init which takes a pointer - to DRM frame buffer operations (struct - drm_framebuffer_funcs). Note that this function - publishes the framebuffer and so from this point on it can be accessed - concurrently from other threads. Hence it must be the last step in the - driver's framebuffer initialization sequence. 
Frame buffer operations - are - - - int (*create_handle)(struct drm_framebuffer *fb, - struct drm_file *file_priv, unsigned int *handle); - - Create a handle to the frame buffer underlying memory object. If - the frame buffer uses a multi-plane format, the handle will - reference the memory object associated with the first plane. - - - Drivers call drm_gem_handle_create to create - the handle. - - - - void (*destroy)(struct drm_framebuffer *framebuffer); - - Destroy the frame buffer object and frees all associated - resources. Drivers must call - drm_framebuffer_cleanup to free resources - allocated by the DRM core for the frame buffer object, and must - make sure to unreference all memory objects associated with the - frame buffer. Handles created by the - create_handle operation are released by - the DRM core. - - - - int (*dirty)(struct drm_framebuffer *framebuffer, - struct drm_file *file_priv, unsigned flags, unsigned color, - struct drm_clip_rect *clips, unsigned num_clips); - - This optional operation notifies the driver that a region of the - frame buffer has changed in response to a DRM_IOCTL_MODE_DIRTYFB - ioctl call. - - - - The lifetime of a drm framebuffer is controlled with a reference count, drivers can grab additional references with @@ -1197,137 +1141,6 @@ int max_width, max_height; pointer to CRTC functions. - - CRTC Operations - - Set Configuration - int (*set_config)(struct drm_mode_set *set); - - Apply a new CRTC configuration to the device. The configuration - specifies a CRTC, a frame buffer to scan out from, a (x,y) position in - the frame buffer, a display mode and an array of connectors to drive - with the CRTC if possible. - - - If the frame buffer specified in the configuration is NULL, the driver - must detach all encoders connected to the CRTC and all connectors - attached to those encoders and disable them. - - - This operation is called with the mode config lock held. 
- - - Note that the drm core has no notion of restoring the mode setting - state after resume, since all resume handling is in the full - responsibility of the driver. The common mode setting helper library - though provides a helper which can be used for this: - drm_helper_resume_force_mode. - - - - Page Flipping - int (*page_flip)(struct drm_crtc *crtc, struct drm_framebuffer *fb, - struct drm_pending_vblank_event *event); - - Schedule a page flip to the given frame buffer for the CRTC. This - operation is called with the mode config mutex held. - - - Page flipping is a synchronization mechanism that replaces the frame - buffer being scanned out by the CRTC with a new frame buffer during - vertical blanking, avoiding tearing. When an application requests a page - flip the DRM core verifies that the new frame buffer is large enough to - be scanned out by the CRTC in the currently configured mode and then - calls the CRTC page_flip operation with a - pointer to the new frame buffer. - - - The page_flip operation schedules a page flip. - Once any pending rendering targeting the new frame buffer has - completed, the CRTC will be reprogrammed to display that frame buffer - after the next vertical refresh. The operation must return immediately - without waiting for rendering or page flip to complete and must block - any new rendering to the frame buffer until the page flip completes. - - - If a page flip can be successfully scheduled the driver must set the - drm_crtc->fb field to the new framebuffer pointed to - by fb. This is important so that the reference counting - on framebuffers stays balanced. - - - If a page flip is already pending, the - page_flip operation must return - -EBUSY. - - - To synchronize page flip to vertical blanking the driver will likely - need to enable vertical blanking interrupts. It should call - drm_vblank_get for that purpose, and call - drm_vblank_put after the page flip completes. 
- - - If the application has requested to be notified when page flip completes - the page_flip operation will be called with a - non-NULL event argument pointing to a - drm_pending_vblank_event instance. Upon page - flip completion the driver must call drm_send_vblank_event - to fill in the event and send to wake up any waiting processes. - This can be performed with - event_lock, flags); - ... - drm_send_vblank_event(dev, pipe, event); - spin_unlock_irqrestore(&dev->event_lock, flags); - ]]> - - - FIXME: Could drivers that don't need to wait for rendering to complete - just add the event to dev->vblank_event_list and - let the DRM core handle everything, as for "normal" vertical blanking - events? - - - While waiting for the page flip to complete, the - event->base.link list head can be used freely by - the driver to store the pending event in a driver-specific list. - - - If the file handle is closed before the event is signaled, drivers must - take care to destroy the event in their - preclose operation (and, if needed, call - drm_vblank_put). - - - - Miscellaneous - - - void (*set_property)(struct drm_crtc *crtc, - struct drm_property *property, uint64_t value); - - Set the value of the given CRTC property to - value. See - for more information about properties. - - - - void (*gamma_set)(struct drm_crtc *crtc, u16 *r, u16 *g, u16 *b, - uint32_t start, uint32_t size); - - Apply a gamma table to the device. The operation is optional. - - - - void (*destroy)(struct drm_crtc *crtc); - - Destroy the CRTC when not needed anymore. See - . - - - - - Planes (struct <structname>drm_plane</structname>) @@ -1344,7 +1157,7 @@ int max_width, max_height; DRM_PLANE_TYPE_PRIMARY represents a "main" plane for a CRTC. Primary planes are the planes operated upon by CRTC modesetting and flipping - operations described in . + operations described in the page_flip hook in drm_crtc_funcs. DRM_PLANE_TYPE_CURSOR represents a "cursor" plane for a CRTC. 
Cursor @@ -1381,52 +1194,6 @@ int max_width, max_height; primary plane with standard capabilities. - - Plane Operations - - - int (*update_plane)(struct drm_plane *plane, struct drm_crtc *crtc, - struct drm_framebuffer *fb, int crtc_x, int crtc_y, - unsigned int crtc_w, unsigned int crtc_h, - uint32_t src_x, uint32_t src_y, - uint32_t src_w, uint32_t src_h); - - Enable and configure the plane to use the given CRTC and frame buffer. - - - The source rectangle in frame buffer memory coordinates is given by - the src_x, src_y, - src_w and src_h - parameters (as 16.16 fixed point values). Devices that don't support - subpixel plane coordinates can ignore the fractional part. - - - The destination rectangle in CRTC coordinates is given by the - crtc_x, crtc_y, - crtc_w and crtc_h - parameters (as integer values). Devices scale the source rectangle to - the destination rectangle. If scaling is not supported, and the source - rectangle size doesn't match the destination rectangle size, the - driver must return a -EINVAL error. - - - - int (*disable_plane)(struct drm_plane *plane); - - Disable the plane. The DRM core calls this method in response to a - DRM_IOCTL_MODE_SETPLANE ioctl call with the frame buffer ID set to 0. - Disabled planes must not be processed by the CRTC. - - - - void (*destroy)(struct drm_plane *plane); - - Destroy the plane when not needed anymore. See - . - - - - Encoders (struct <structname>drm_encoder</structname>) @@ -1483,27 +1250,6 @@ int max_width, max_height; encoders they want to use to a CRTC. - - Encoder Operations - - - void (*destroy)(struct drm_encoder *encoder); - - Called to destroy the encoder when not needed anymore. See - . - - - - void (*set_property)(struct drm_plane *plane, - struct drm_property *property, uint64_t value); - - Set the value of the given plane property to - value. See - for more information about properties. 
- - - - Connectors (struct <structname>drm_connector</structname>) @@ -1707,27 +1453,6 @@ int max_width, max_height; connector_status_unknown. - - Miscellaneous - - - void (*set_property)(struct drm_connector *connector, - struct drm_property *property, uint64_t value); - - Set the value of the given connector property to - value. See - for more information about properties. - - - - void (*destroy)(struct drm_connector *connector); - - Destroy the connector when not needed anymore. See - . - - - - @@ -1853,462 +1578,6 @@ void intel_crt_init(struct drm_device *dev) To use it, a driver must provide bottom functions for all of the three KMS entities. - - Helper Functions - - - int drm_crtc_helper_set_config(struct drm_mode_set *set); - - The drm_crtc_helper_set_config helper function - is a CRTC set_config implementation. It - first tries to locate the best encoder for each connector by calling - the connector best_encoder helper - operation. - - - After locating the appropriate encoders, the helper function will - call the mode_fixup encoder and CRTC helper - operations to adjust the requested mode, or reject it completely in - which case an error will be returned to the application. If the new - configuration after mode adjustment is identical to the current - configuration the helper function will return without performing any - other operation. - - - If the adjusted mode is identical to the current mode but changes to - the frame buffer need to be applied, the - drm_crtc_helper_set_config function will call - the CRTC mode_set_base helper operation. If - the adjusted mode differs from the current mode, or if the - mode_set_base helper operation is not - provided, the helper function performs a full mode set sequence by - calling the prepare, - mode_set and - commit CRTC and encoder helper operations, - in that order. 
- - - - void drm_helper_connector_dpms(struct drm_connector *connector, int mode); - - The drm_helper_connector_dpms helper function - is a connector dpms implementation that - tracks power state of connectors. To use the function, drivers must - provide dpms helper operations for CRTCs - and encoders to apply the DPMS state to the device. - - - The mid-layer doesn't track the power state of CRTCs and encoders. - The dpms helper operations can thus be - called with a mode identical to the currently active mode. - - - - int drm_helper_probe_single_connector_modes(struct drm_connector *connector, - uint32_t maxX, uint32_t maxY); - - The drm_helper_probe_single_connector_modes helper - function is a connector fill_modes - implementation that updates the connection status for the connector - and then retrieves a list of modes by calling the connector - get_modes helper operation. - - - If the helper operation returns no mode, and if the connector status - is connector_status_connected, standard VESA DMT modes up to - 1024x768 are automatically added to the modes list by a call to - drm_add_modes_noedid. - - - The function then filters out modes larger than - max_width and max_height - if specified. It finally calls the optional connector - mode_valid helper operation for each mode in - the probed list to check whether the mode is valid for the connector. - - - - - - CRTC Helper Operations - - - bool (*mode_fixup)(struct drm_crtc *crtc, - const struct drm_display_mode *mode, - struct drm_display_mode *adjusted_mode); - - Let CRTCs adjust the requested mode or reject it completely. This - operation returns true if the mode is accepted (possibly after being - adjusted) or false if it is rejected. - - - The mode_fixup operation should reject the - mode if it can't reasonably use it. The definition of "reasonable" - is currently fuzzy in this context. 
One possible behaviour would be - to set the adjusted mode to the panel timings when a fixed-mode - panel is used with hardware capable of scaling. Another behaviour - would be to accept any input mode and adjust it to the closest mode - supported by the hardware (FIXME: This needs to be clarified). - - - - int (*mode_set_base)(struct drm_crtc *crtc, int x, int y, - struct drm_framebuffer *old_fb) - - Move the CRTC on the current frame buffer (stored in - crtc->fb) to position (x,y). Any of the frame - buffer, x position or y position may have been modified. - - - This helper operation is optional. If not provided, the - drm_crtc_helper_set_config function will fall - back to the mode_set helper operation. - - - FIXME: Why are x and y passed as arguments, as they can be accessed - through crtc->x and - crtc->y? - - - - void (*prepare)(struct drm_crtc *crtc); - - Prepare the CRTC for mode setting. This operation is called after - validating the requested mode. Drivers use it to perform - device-specific operations required before setting the new mode. - - - - int (*mode_set)(struct drm_crtc *crtc, struct drm_display_mode *mode, - struct drm_display_mode *adjusted_mode, int x, int y, - struct drm_framebuffer *old_fb); - - Set a new mode, position and frame buffer. Depending on the device - requirements, the mode can be stored internally by the driver and - applied in the commit operation, or - programmed to the hardware immediately. - - - The mode_set operation returns 0 on success - or a negative error code if an error occurs. - - - - void (*commit)(struct drm_crtc *crtc); - - Commit a mode. This operation is called after setting the new mode. - Upon return the device must use the new mode and be fully - operational. - - - - - - Encoder Helper Operations - - - bool (*mode_fixup)(struct drm_encoder *encoder, - const struct drm_display_mode *mode, - struct drm_display_mode *adjusted_mode); - - Let encoders adjust the requested mode or reject it completely. 
This - operation returns true if the mode is accepted (possibly after being - adjusted) or false if it is rejected. See the - mode_fixup CRTC helper - operation for an explanation of the allowed adjustments. - - - - void (*prepare)(struct drm_encoder *encoder); - - Prepare the encoder for mode setting. This operation is called after - validating the requested mode. Drivers use it to perform - device-specific operations required before setting the new mode. - - - - void (*mode_set)(struct drm_encoder *encoder, - struct drm_display_mode *mode, - struct drm_display_mode *adjusted_mode); - - Set a new mode. Depending on the device requirements, the mode can - be stored internally by the driver and applied in the - commit operation, or programmed to the - hardware immediately. - - - - void (*commit)(struct drm_encoder *encoder); - - Commit a mode. This operation is called after setting the new mode. - Upon return the device must use the new mode and be fully - operational. - - - - - - Connector Helper Operations - - - struct drm_encoder *(*best_encoder)(struct drm_connector *connector); - - Return a pointer to the best encoder for the connecter. Device that - map connectors to encoders 1:1 simply return the pointer to the - associated encoder. This operation is mandatory. - - - - int (*get_modes)(struct drm_connector *connector); - - Fill the connector's probed_modes list - by parsing EDID data with drm_add_edid_modes, - adding standard VESA DMT modes with drm_add_modes_noedid, - or calling drm_mode_probed_add directly for every - supported mode and return the number of modes it has detected. This - operation is mandatory. - - - Note that the caller function will automatically add standard VESA - DMT modes up to 1024x768 if the get_modes - helper operation returns no mode and if the connector status is - connector_status_connected. There is no need to call - drm_add_edid_modes manually in that case. 
- - - When adding modes manually the driver creates each mode with a call to - drm_mode_create and must fill the following fields. - - - __u32 type; - - Mode type bitmask, a combination of - - - DRM_MODE_TYPE_BUILTIN - not used? - - - DRM_MODE_TYPE_CLOCK_C - not used? - - - DRM_MODE_TYPE_CRTC_C - not used? - - - - DRM_MODE_TYPE_PREFERRED - The preferred mode for the connector - - - not used? - - - - DRM_MODE_TYPE_DEFAULT - not used? - - - DRM_MODE_TYPE_USERDEF - not used? - - - DRM_MODE_TYPE_DRIVER - - - The mode has been created by the driver (as opposed to - to user-created modes). - - - - - Drivers must set the DRM_MODE_TYPE_DRIVER bit for all modes they - create, and set the DRM_MODE_TYPE_PREFERRED bit for the preferred - mode. - - - - __u32 clock; - Pixel clock frequency in kHz unit - - - __u16 hdisplay, hsync_start, hsync_end, htotal; - __u16 vdisplay, vsync_start, vsync_end, vtotal; - Horizontal and vertical timing information - <----------------><-------------><--------------> - - //////////////////////| - ////////////////////// | - ////////////////////// |.................. ................ 
- _______________ - - <----- [hv]display -----> - <------------- [hv]sync_start ------------> - <--------------------- [hv]sync_end ---------------------> - <-------------------------------- [hv]total -----------------------------> -]]> - - - __u16 hskew; - __u16 vscan; - Unknown - - - __u32 flags; - - Mode flags, a combination of - - - DRM_MODE_FLAG_PHSYNC - - Horizontal sync is active high - - - - DRM_MODE_FLAG_NHSYNC - - Horizontal sync is active low - - - - DRM_MODE_FLAG_PVSYNC - - Vertical sync is active high - - - - DRM_MODE_FLAG_NVSYNC - - Vertical sync is active low - - - - DRM_MODE_FLAG_INTERLACE - - Mode is interlaced - - - - DRM_MODE_FLAG_DBLSCAN - - Mode uses doublescan - - - - DRM_MODE_FLAG_CSYNC - - Mode uses composite sync - - - - DRM_MODE_FLAG_PCSYNC - - Composite sync is active high - - - - DRM_MODE_FLAG_NCSYNC - - Composite sync is active low - - - - DRM_MODE_FLAG_HSKEW - - hskew provided (not used?) - - - - DRM_MODE_FLAG_BCAST - - not used? - - - - DRM_MODE_FLAG_PIXMUX - - not used? - - - - DRM_MODE_FLAG_DBLCLK - - not used? - - - - DRM_MODE_FLAG_CLKDIV2 - - ? - - - - - - Note that modes marked with the INTERLACE or DBLSCAN flags will be - filtered out by - drm_helper_probe_single_connector_modes if - the connector's interlace_allowed or - doublescan_allowed field is set to 0. - - - - char name[DRM_DISPLAY_MODE_LEN]; - - Mode name. The driver must call - drm_mode_set_name to fill the mode name from - hdisplay, - vdisplay and interlace flag after - filling the corresponding fields. - - - - - - The vrefresh value is computed by - drm_helper_probe_single_connector_modes. - - - When parsing EDID data, drm_add_edid_modes fills the - connector display_info - width_mm and - height_mm fields. When creating modes - manually the get_modes helper operation must - set the display_info - width_mm and - height_mm fields if they haven't been set - already (for instance at initialization time when a fixed-size panel is - attached to the connector). 
The mode width_mm - and height_mm fields are only used internally - during EDID parsing and should not be set when creating modes manually. - - - - int (*mode_valid)(struct drm_connector *connector, - struct drm_display_mode *mode); - - Verify whether a mode is valid for the connector. Return MODE_OK for - supported modes and one of the enum drm_mode_status values (MODE_*) - for unsupported modes. This operation is optional. - - - As the mode rejection reason is currently not used beside for - immediately removing the unsupported mode, an implementation can - return MODE_BAD regardless of the exact reason why the mode is not - valid. - - - Note that the mode_valid helper operation is - only called for modes detected by the device, and - not for modes set by the user through the CRTC - set_config operation. - - - - Atomic Modeset Helper Functions Reference @@ -2327,8 +1596,12 @@ void intel_crt_init(struct drm_device *dev) !Edrivers/gpu/drm/drm_atomic_helper.c - Modeset Helper Functions Reference -!Iinclude/drm/drm_crtc_helper.h + Modeset Helper Reference for Common Vtables +!Iinclude/drm/drm_modeset_helper_vtables.h +!Pinclude/drm/drm_modeset_helper_vtables.h overview + + + Legacy CRTC/Modeset Helper Functions Reference !Edrivers/gpu/drm/drm_crtc_helper.c !Pdrivers/gpu/drm/drm_crtc_helper.c overview @@ -4039,92 +3312,6 @@ int num_ioctls; DPIO !Pdrivers/gpu/drm/i915/i915_reg.h DPIO - - Dual channel PHY (VLV/CHV/BXT) - - - - - - - - - - - - - - - - - - CH0 - CH1 - - - - - CMN/PLL/REF - CMN/PLL/REF - - - PCS01 - PCS23 - PCS01 - PCS23 - - - TX0 - TX1 - TX2 - TX3 - TX0 - TX1 - TX2 - TX3 - - - DDI0 - DDI1 - - - -
- - Single channel PHY (CHV/BXT) - - - - - - - - - - - CH0 - - - - - CMN/PLL/REF - - - PCS01 - PCS23 - - - TX0 - TX1 - TX2 - TX3 - - - DDI2 - - - -
@@ -4201,17 +3388,21 @@ int num_ioctls;
- GuC-based Command Submission + GuC - GuC + GuC-specific firmware loader !Pdrivers/gpu/drm/i915/intel_guc_loader.c GuC-specific firmware loader !Idrivers/gpu/drm/i915/intel_guc_loader.c - GuC Client -!Pdrivers/gpu/drm/i915/i915_guc_submission.c GuC-based command submissison + GuC-based command submission +!Pdrivers/gpu/drm/i915/i915_guc_submission.c GuC-based command submission !Idrivers/gpu/drm/i915/i915_guc_submission.c + + GuC Firmware Layout +!Pdrivers/gpu/drm/i915/intel_guc_fwif.h GuC Firmware Layout + @@ -4246,41 +3437,63 @@ int num_ioctls; Modes of Use - - Manual switching and manual power control + + Manual switching and manual power control !Pdrivers/gpu/vga/vga_switcheroo.c Manual switching and manual power control - - - Driver power control + + + Driver power control !Pdrivers/gpu/vga/vga_switcheroo.c Driver power control - + - - Public functions + + API + + Public functions !Edrivers/gpu/vga/vga_switcheroo.c - - - - Public structures + + + Public structures !Finclude/linux/vga_switcheroo.h vga_switcheroo_handler !Finclude/linux/vga_switcheroo.h vga_switcheroo_client_ops - - - - Public constants + + + Public constants !Finclude/linux/vga_switcheroo.h vga_switcheroo_client_id !Finclude/linux/vga_switcheroo.h vga_switcheroo_state - - - - Private structures + + + Private structures !Fdrivers/gpu/vga/vga_switcheroo.c vgasr_priv !Fdrivers/gpu/vga/vga_switcheroo.c vga_switcheroo_client + + + + + Handlers + + apple-gmux Handler +!Pdrivers/platform/x86/apple-gmux.c Overview +!Pdrivers/platform/x86/apple-gmux.c Interrupt + + Graphics mux +!Pdrivers/platform/x86/apple-gmux.c Graphics mux + + + Power control +!Pdrivers/platform/x86/apple-gmux.c Power control + + + Backlight control +!Pdrivers/platform/x86/apple-gmux.c Backlight control + + !Cdrivers/gpu/vga/vga_switcheroo.c !Cinclude/linux/vga_switcheroo.h +!Cdrivers/platform/x86/apple-gmux.c diff --git a/Documentation/DocBook/iio.tmpl b/Documentation/DocBook/iio.tmpl index 98be32267..f525bf56d 100644 --- 
a/Documentation/DocBook/iio.tmpl +++ b/Documentation/DocBook/iio.tmpl @@ -458,7 +458,7 @@ .scan_type = { .sign = 's', .realbits = 12, - .storgebits = 16, + .storagebits = 16, .shift = 4, .endianness = IIO_LE, }, diff --git a/Documentation/DocBook/media/Makefile b/Documentation/DocBook/media/Makefile index 08527e7ea..2840ff483 100644 --- a/Documentation/DocBook/media/Makefile +++ b/Documentation/DocBook/media/Makefile @@ -199,8 +199,10 @@ DVB_DOCUMENTED = \ # install_media_images = \ - $(Q)-mkdir $(MEDIA_OBJ_DIR)/media_api; \ - cp $(OBJIMGFILES) $(MEDIA_SRC_DIR)/*.svg $(MEDIA_SRC_DIR)/v4l/*.svg $(MEDIA_OBJ_DIR)/media_api + $(Q)if [ "x$(findstring media_api.xml,$(DOCBOOKS))" != "x" ]; then \ + mkdir -p $(MEDIA_OBJ_DIR)/media_api; \ + cp $(OBJIMGFILES) $(MEDIA_SRC_DIR)/*.svg $(MEDIA_SRC_DIR)/v4l/*.svg $(MEDIA_OBJ_DIR)/media_api; \ + fi $(MEDIA_OBJ_DIR)/%: $(MEDIA_SRC_DIR)/%.b64 $(Q)base64 -d $< >$@ diff --git a/Documentation/DocBook/media/dvb/dvbproperty.xml b/Documentation/DocBook/media/dvb/dvbproperty.xml index 08227d4e9..e579ae508 100644 --- a/Documentation/DocBook/media/dvb/dvbproperty.xml +++ b/Documentation/DocBook/media/dvb/dvbproperty.xml @@ -76,7 +76,7 @@ int main(void) NOTE: While it is possible to directly call the Kernel code like the above example, it is strongly recommended to use - libdvbv5, + libdvbv5, as it provides abstraction to work with the supported digital TV standards and provides methods for usual operations like program scanning and to read/write channel descriptor files. diff --git a/Documentation/DocBook/media/dvb/examples.xml b/Documentation/DocBook/media/dvb/examples.xml index c9f68c718..837fb3b64 100644 --- a/Documentation/DocBook/media/dvb/examples.xml +++ b/Documentation/DocBook/media/dvb/examples.xml @@ -3,7 +3,7 @@ NOTE: This section is out of date, and the code below won't even compile. Please refer to the - libdvbv5 + libdvbv5 for updated/recommended examples. 
diff --git a/Documentation/DocBook/media/dvb/intro.xml b/Documentation/DocBook/media/dvb/intro.xml index 51db15648..b5b701f5d 100644 --- a/Documentation/DocBook/media/dvb/intro.xml +++ b/Documentation/DocBook/media/dvb/intro.xml @@ -32,7 +32,7 @@ and filtering several section and PES data streams at the same time. new standard Linux DVB API. As a commitment to the development of terminals based on open standards, Nokia and Convergence made it available to all Linux developers and published it on - in September 2000. + in September 2000. Convergence is the maintainer of the Linux DVB API. Together with the LinuxTV community (i.e. you, the reader of this document), the Linux DVB API will be constantly reviewed and improved. With the Linux driver for diff --git a/Documentation/DocBook/media/v4l/capture.c.xml b/Documentation/DocBook/media/v4l/capture.c.xml index 1c5c49a2d..22126a991 100644 --- a/Documentation/DocBook/media/v4l/capture.c.xml +++ b/Documentation/DocBook/media/v4l/capture.c.xml @@ -5,7 +5,7 @@ * This program can be used and distributed without restrictions. * * This program is provided with the V4L2 API - * see http://linuxtv.org/docs.php for more information + * see https://linuxtv.org/docs.php for more information */ #include <stdio.h> diff --git a/Documentation/DocBook/media/v4l/compat.xml b/Documentation/DocBook/media/v4l/compat.xml index 5701a08ed..5399e8904 100644 --- a/Documentation/DocBook/media/v4l/compat.xml +++ b/Documentation/DocBook/media/v4l/compat.xml @@ -2666,7 +2666,7 @@ is useful to display images captured with V4L2 devices. V4L2 does not support digital terrestrial, cable or satellite broadcast. A separate project aiming at digital receivers exists. You can find its homepage at http://linuxtv.org. The Linux DVB API +url="https://linuxtv.org">https://linuxtv.org. The Linux DVB API has no connection to the V4L2 API except that drivers for hybrid hardware may support both. 
diff --git a/Documentation/DocBook/media/v4l/io.xml b/Documentation/DocBook/media/v4l/io.xml index da654031e..144158b3a 100644 --- a/Documentation/DocBook/media/v4l/io.xml +++ b/Documentation/DocBook/media/v4l/io.xml @@ -699,7 +699,7 @@ linkend="v4l2-buf-type" /> buffer. It depends on the negotiated data format and may change with each buffer for compressed variable size data like JPEG images. Drivers must set this field when type -refers to an input stream, applications when it refers to an output stream. +refers to a capture stream, applications when it refers to an output stream. If the application sets this to 0 for an output stream, then bytesused will be set to the size of the buffer (see the length field of this struct) by @@ -720,14 +720,14 @@ linkend="buffer-flags" />. Indicates the field order of the image in the buffer, see . This field is not used when the buffer contains VBI data. Drivers must set it when -type refers to an input stream, +type refers to a capture stream, applications when it refers to an output stream. struct timeval timestamp - For input streams this is time when the first data + For capture streams this is time when the first data byte was captured, as returned by the clock_gettime() function for the relevant clock id; see V4L2_BUF_FLAG_TIMESTAMP_* in @@ -866,7 +866,7 @@ must set this to 0. The number of bytes occupied by data in the plane (its payload). Drivers must set this field when type - refers to an input stream, applications when it refers to an output stream. + refers to a capture stream, applications when it refers to an output stream. If the application sets this to 0 for an output stream, then bytesused will be set to the size of the plane (see the length field of this struct) @@ -919,7 +919,7 @@ must set this to 0. Offset in bytes to video data in the plane. Drivers must set this field when type - refers to an input stream, applications when it refers to an output stream. 
+ refers to a capture stream, applications when it refers to an output stream. Note that data_offset is included in bytesused. So the size of the image in the plane is bytesused-data_offset at diff --git a/Documentation/DocBook/media/v4l/media-controller.xml b/Documentation/DocBook/media/v4l/media-controller.xml index 873ac3a62..5f2fc07a9 100644 --- a/Documentation/DocBook/media/v4l/media-controller.xml +++ b/Documentation/DocBook/media/v4l/media-controller.xml @@ -58,21 +58,36 @@ Media device model Discovering a device internal topology, and configuring it at runtime, is one of the goals of the media controller API. To achieve this, hardware - devices are modelled as an oriented graph of building blocks called entities - connected through pads. - An entity is a basic media hardware or software building block. It can - correspond to a large variety of logical blocks such as physical hardware - devices (CMOS sensor for instance), logical hardware devices (a building - block in a System-on-Chip image processing pipeline), DMA channels or - physical connectors. - A pad is a connection endpoint through which an entity can interact - with other entities. Data (not restricted to video) produced by an entity - flows from the entity's output to one or more entity inputs. Pads should not - be confused with physical pins at chip boundaries. - A link is a point-to-point oriented connection between two pads, - either on the same entity or on different entities. Data flows from a source - pad to a sink pad. + devices and Linux Kernel interfaces are modelled as graph objects on + an oriented graph. The object types that constitute the graph are: + + An entity + is a basic media hardware or software building block. It can correspond to + a large variety of logical blocks such as physical hardware devices + (CMOS sensor for instance), logical hardware devices (a building block in + a System-on-Chip image processing pipeline), DMA channels or physical + connectors. 
+ An interface + is a graph representation of a Linux Kernel userspace API interface, + like a device node or a sysfs file that controls one or more entities + in the graph. + A pad + is a data connection endpoint through which an entity can interact with + other entities. Data (not restricted to video) produced by an entity + flows from the entity's output to one or more entity inputs. Pads should + not be confused with physical pins at chip boundaries. + A data link + is a point-to-point oriented connection between two pads, either on the + same entity or on different entities. Data flows from a source pad to a + sink pad. + An interface link + is a point-to-point bidirectional control connection between a Linux + Kernel interface and an entity. + + + + &sub-media-types; @@ -83,6 +98,7 @@ &sub-media-func-ioctl; &sub-media-ioc-device-info; + &sub-media-ioc-g-topology; &sub-media-ioc-enum-entities; &sub-media-ioc-enum-links; &sub-media-ioc-setup-link; diff --git a/Documentation/DocBook/media/v4l/media-ioc-enum-entities.xml b/Documentation/DocBook/media/v4l/media-ioc-enum-entities.xml index 5872f8bbf..0c4f96bfc 100644 --- a/Documentation/DocBook/media/v4l/media-ioc-enum-entities.xml +++ b/Documentation/DocBook/media/v4l/media-ioc-enum-entities.xml @@ -59,15 +59,6 @@ Entity IDs can be non-contiguous. Applications must not try to enumerate entities by calling MEDIA_IOC_ENUM_ENTITIES with increasing id's until they get an error. - Two or more entities that share a common non-zero - group_id value are considered as logically - grouped. Groups are used to report - - ALSA, VBI and video nodes that carry the same media - stream - lens and flash controllers associated with a sensor - - struct <structname>media_entity_desc</structname> @@ -106,7 +97,7 @@ revision - Entity revision in a driver/hardware specific format. + Entity revision. Always zero (obsolete) __u32 @@ -120,7 +111,7 @@ group_id - Entity group ID + Entity group ID.
Always zero (obsolete) __u16 @@ -171,97 +162,6 @@
- - - Media entity types - - - - - - MEDIA_ENT_T_DEVNODE - Unknown device node - - - MEDIA_ENT_T_DEVNODE_V4L - V4L video, radio or vbi device node - - - MEDIA_ENT_T_DEVNODE_FB - Frame buffer device node - - - MEDIA_ENT_T_DEVNODE_ALSA - ALSA card - - - MEDIA_ENT_T_DEVNODE_DVB_FE - DVB frontend devnode - - - MEDIA_ENT_T_DEVNODE_DVB_DEMUX - DVB demux devnode - - - MEDIA_ENT_T_DEVNODE_DVB_DVR - DVB DVR devnode - - - MEDIA_ENT_T_DEVNODE_DVB_CA - DVB CAM devnode - - - MEDIA_ENT_T_DEVNODE_DVB_NET - DVB network devnode - - - MEDIA_ENT_T_V4L2_SUBDEV - Unknown V4L sub-device - - - MEDIA_ENT_T_V4L2_SUBDEV_SENSOR - Video sensor - - - MEDIA_ENT_T_V4L2_SUBDEV_FLASH - Flash controller - - - MEDIA_ENT_T_V4L2_SUBDEV_LENS - Lens controller - - - MEDIA_ENT_T_V4L2_SUBDEV_DECODER - Video decoder, the basic function of the video decoder is to - accept analogue video from a wide variety of sources such as - broadcast, DVD players, cameras and video cassette recorders, in - either NTSC, PAL or HD format and still occasionally SECAM, separate - it into its component parts, luminance and chrominance, and output - it in some digital video standard, with appropriate embedded timing - signals. - - - MEDIA_ENT_T_V4L2_SUBDEV_TUNER - TV and/or radio tuner - - - -
- - - Media entity flags - - - - - - MEDIA_ENT_FL_DEFAULT - Default entity for its type. Used to discover the default - audio, VBI and video devices, the default camera sensor, ... - - - -
diff --git a/Documentation/DocBook/media/v4l/media-ioc-enum-links.xml b/Documentation/DocBook/media/v4l/media-ioc-enum-links.xml index 74fb394ec..2bbeea9f3 100644 --- a/Documentation/DocBook/media/v4l/media-ioc-enum-links.xml +++ b/Documentation/DocBook/media/v4l/media-ioc-enum-links.xml @@ -118,35 +118,6 @@ - - Media pad flags - - - - - - MEDIA_PAD_FL_SINK - Input pad, relative to the entity. Input pads sink data and - are targets of links. - - - MEDIA_PAD_FL_SOURCE - Output pad, relative to the entity. Output pads source data - and are origins of links. - - - MEDIA_PAD_FL_MUST_CONNECT - If this flag is set and the pad is linked to any other - pad, then at least one of those links must be enabled for the - entity to be able to stream. There could be temporary reasons - (e.g. device configuration dependent) for the pad to need - enabled links even when this flag isn't set; the absence of the - flag doesn't imply there is none. - - - -
- struct <structname>media_link_desc</structname> @@ -171,33 +142,6 @@ - - Media link flags - - - - - - MEDIA_LNK_FL_ENABLED - The link is enabled and can be used to transfer media data. - When two or more links target a sink pad, only one of them can be - enabled at a time. - - - MEDIA_LNK_FL_IMMUTABLE - The link enabled state can't be modified at runtime. An - immutable link is always enabled. - - - MEDIA_LNK_FL_DYNAMIC - The link enabled state can be modified during streaming. This - flag is set by drivers and is read-only for applications. - - - - - One and only one of MEDIA_PAD_FL_SINK and - MEDIA_PAD_FL_SOURCE must be set for every pad.
diff --git a/Documentation/DocBook/media/v4l/media-ioc-g-topology.xml b/Documentation/DocBook/media/v4l/media-ioc-g-topology.xml new file mode 100644 index 000000000..63152ab9e --- /dev/null +++ b/Documentation/DocBook/media/v4l/media-ioc-g-topology.xml @@ -0,0 +1,394 @@ + + + ioctl MEDIA_IOC_G_TOPOLOGY + &manvol; + + + + MEDIA_IOC_G_TOPOLOGY + Enumerate the graph topology and graph element properties + + + + + + int ioctl + int fd + int request + struct media_v2_topology *argp + + + + + + Arguments + + + + fd + + File descriptor returned by + open(). + + + + request + + MEDIA_IOC_G_TOPOLOGY + + + + argp + + + + + + + + + Description + + NOTE: This new ioctl is programmed to be added on Kernel 4.6. Its definition/arguments may change until its final version. + + The typical usage of this ioctl is to call it twice. + On the first call, the structure defined at &media-v2-topology; should + be zeroed. At return, if no errors happen, this ioctl will return the + topology_version and the total number of entities, + interfaces, pads and links. + Before the second call, the userspace should allocate arrays to + store the graph elements that are desired, putting the pointers to them + at the ptr_entities, ptr_interfaces, ptr_links and/or ptr_pads, keeping + the other values untouched. + If the topology_version remains the same, the + ioctl should fill the desired arrays with the media graph elements. + + + struct <structname>media_v2_topology</structname> + + + + + + + + + __u64 + topology_version + + + Version of the media graph topology. When the graph is + created, this field starts with zero. Every time a graph + element is added or removed, this field is + incremented. + + + __u64 + num_entities + + + Number of entities in the graph + + + __u64 + ptr_entities + + + A pointer to a memory area where the entities array + will be stored, converted to a 64-bits integer. + It can be zero. if zero, the ioctl won't store the + entities. 
It will just update + num_entities + + + __u64 + num_interfaces + + + Number of interfaces in the graph + + + __u64 + ptr_interfaces + + + A pointer to a memory area where the interfaces array + will be stored, converted to a 64-bit integer. + It can be zero. If zero, the ioctl won't store the + interfaces. It will just update + num_interfaces + + + __u64 + num_pads + + + Total number of pads in the graph + + + __u64 + ptr_pads + + + A pointer to a memory area where the pads array + will be stored, converted to a 64-bit integer. + It can be zero. If zero, the ioctl won't store the + pads. It will just update + num_pads + + + __u64 + num_links + + + Total number of data and interface links in the graph + + + __u64 + ptr_links + + + A pointer to a memory area where the links array + will be stored, converted to a 64-bit integer. + It can be zero. If zero, the ioctl won't store the + links. It will just update + num_links + + + +
+ + + struct <structname>media_v2_entity</structname> + + + + + + + + + __u32 + id + + + Unique ID for the entity. + + + char + name[64] + + + Entity name as an UTF-8 NULL-terminated string. + + + __u32 + function + + + Entity main function, see for details. + + + __u32 + reserved[12] + Reserved for future extensions. Drivers and applications must + set this array to zero. + + + +
+ + + struct <structname>media_v2_interface</structname> + + + + + + + + + __u32 + id + + + Unique ID for the interface. + + + __u32 + intf_type + + + Interface type, see for details. + + + __u32 + flags + + + Interface flags. Currently unused. + + + __u32 + reserved[9] + + + Reserved for future extensions. Drivers and applications must + set this array to zero. + + + struct media_v2_intf_devnode + devnode + + + Used only for device node interfaces. See for details. + + + +
+ + + struct <structname>media_v2_intf_devnode</structname> + + + + + + + + + __u32 + major + + + Device node major number. + + + __u32 + minor + + + Device node minor number. + + + +
+ + + struct <structname>media_v2_pad</structname> + + + + + + + + + __u32 + id + + + Unique ID for the pad. + + + __u32 + entity_id + + + Unique ID for the entity where this pad belongs. + + + __u32 + flags + + + Pad flags, see for more details. + + + __u32 + reserved[9] + + + Reserved for future extensions. Drivers and applications must + set this array to zero. + + + +
+ + + struct <structname>media_v2_link</structname> + + + + + + + + + __u32 + id + + + Unique ID for the link. + + + __u32 + source_id + + + + On pad to pad links: unique ID for the source pad. + On interface to entity links: unique ID for the interface. + + + + __u32 + sink_id + + + + On pad to pad links: unique ID for the sink pad. + On interface to entity links: unique ID for the entity. + + + + __u32 + flags + + + Link flags, see for more details. + + + __u32 + reserved[5] + + + Reserved for future extensions. Drivers and applications must + set this array to zero. + + + + + +
+ + + &return-value; + + + + ENOSPC + + This is returned when either one or more of the num_entities, + num_interfaces, num_links or num_pads are non-zero and are smaller + than the actual number of elements inside the graph. This may happen + if the topology_version changed when compared + to the last time this ioctl was called. Userspace should usually + free the area for the pointers, zero the struct elements and call + this ioctl again. + + + + +
diff --git a/Documentation/DocBook/media/v4l/media-types.xml b/Documentation/DocBook/media/v4l/media-types.xml new file mode 100644 index 000000000..0ee0f3386 --- /dev/null +++ b/Documentation/DocBook/media/v4l/media-types.xml @@ -0,0 +1,236 @@ +
+Types and flags used to represent the media graph elements + + + Media entity types + + + + + + MEDIA_ENT_F_UNKNOWN and MEDIA_ENT_F_V4L2_SUBDEV_UNKNOWN + Unknown entity. That generally indicates that + a driver didn't properly initialize the entity, which is a Kernel bug + + + MEDIA_ENT_F_IO_V4L + Data streaming input and/or output entity. + + + MEDIA_ENT_F_IO_VBI + V4L VBI streaming input or output entity + + + MEDIA_ENT_F_IO_SWRADIO + V4L Software Defined Radio (SDR) streaming input or output entity + + + MEDIA_ENT_F_IO_DTV + DVB Digital TV streaming input or output entity + + + MEDIA_ENT_F_DTV_DEMOD + Digital TV demodulator entity. + + + MEDIA_ENT_F_TS_DEMUX + MPEG Transport stream demux entity. Could be implemented on hardware or in Kernelspace by the Linux DVB subsystem. + + + MEDIA_ENT_F_DTV_CA + Digital TV Conditional Access module (CAM) entity + + + MEDIA_ENT_F_DTV_NET_DECAP + Digital TV network ULE/MLE decapsulation entity. Could be implemented on hardware or in Kernelspace + + + MEDIA_ENT_F_CONN_RF + Connector for a Radio Frequency (RF) signal. + + + MEDIA_ENT_F_CONN_SVIDEO + Connector for a S-Video signal. + + + MEDIA_ENT_F_CONN_COMPOSITE + Connector for a RGB composite signal. + + + MEDIA_ENT_F_CAM_SENSOR + Camera video sensor entity. + + + MEDIA_ENT_F_FLASH + Flash controller entity. + + + MEDIA_ENT_F_LENS + Lens controller entity. + + + MEDIA_ENT_F_ATV_DECODER + Analog video decoder, the basic function of the video decoder + is to accept analogue video from a wide variety of sources such as + broadcast, DVD players, cameras and video cassette recorders, in + either NTSC, PAL, SECAM or HD format, separating the stream + into its component parts, luminance and chrominance, and output + it in some digital video standard, with appropriate timing + signals. + + + MEDIA_ENT_F_TUNER + Digital TV, analog TV, radio and/or software radio tuner. + + + +
+ + + Media entity flags + + + + + + MEDIA_ENT_FL_DEFAULT + Default entity for its type. Used to discover the default + audio, VBI and video devices, the default camera sensor, ... + + + MEDIA_ENT_FL_CONNECTOR + The entity represents a data connector + + + +
+ + + Media interface types + + + + + + + MEDIA_INTF_T_DVB_FE + Device node interface for the Digital TV frontend + typically, /dev/dvb/adapter?/frontend? + + + MEDIA_INTF_T_DVB_DEMUX + Device node interface for the Digital TV demux + typically, /dev/dvb/adapter?/demux? + + + MEDIA_INTF_T_DVB_DVR + Device node interface for the Digital TV DVR + typically, /dev/dvb/adapter?/dvr? + + + MEDIA_INTF_T_DVB_CA + Device node interface for the Digital TV Conditional Access + typically, /dev/dvb/adapter?/ca? + + + MEDIA_INTF_T_DVB_NET + Device node interface for the Digital TV network control + typically, /dev/dvb/adapter?/net? + + + MEDIA_INTF_T_V4L_VIDEO + Device node interface for video (V4L) + typically, /dev/video? + + + MEDIA_INTF_T_V4L_VBI + Device node interface for VBI (V4L) + typically, /dev/vbi? + + + MEDIA_INTF_T_V4L_RADIO + Device node interface for radio (V4L) + typically, /dev/radio? + + + MEDIA_INTF_T_V4L_SUBDEV + Device node interface for a V4L subdevice + typically, /dev/v4l-subdev? + + + MEDIA_INTF_T_V4L_SWRADIO + Device node interface for Software Defined Radio (V4L) + typically, /dev/swradio? + + + +
+ + + Media pad flags + + + + + + MEDIA_PAD_FL_SINK + Input pad, relative to the entity. Input pads sink data and + are targets of links. + + + MEDIA_PAD_FL_SOURCE + Output pad, relative to the entity. Output pads source data + and are origins of links. + + + MEDIA_PAD_FL_MUST_CONNECT + If this flag is set and the pad is linked to any other + pad, then at least one of those links must be enabled for the + entity to be able to stream. There could be temporary reasons + (e.g. device configuration dependent) for the pad to need + enabled links even when this flag isn't set; the absence of the + flag doesn't imply there is none. + + + +
+ + One and only one of MEDIA_PAD_FL_SINK and + MEDIA_PAD_FL_SOURCE must be set for every pad. + + + Media link flags + + + + + + MEDIA_LNK_FL_ENABLED + The link is enabled and can be used to transfer media data. + When two or more links target a sink pad, only one of them can be + enabled at a time. + + + MEDIA_LNK_FL_IMMUTABLE + The link enabled state can't be modified at runtime. An + immutable link is always enabled. + + + MEDIA_LNK_FL_DYNAMIC + The link enabled state can be modified during streaming. This + flag is set by drivers and is read-only for applications. + + + MEDIA_LNK_FL_LINK_TYPE + This is a bitmask that defines the type of the link. + Currently, two types of links are supported: + MEDIA_LNK_FL_DATA_LINK + if the link is between two pads + MEDIA_LNK_FL_INTERFACE_LINK + if the link is between an interface and an entity + + + + + +
diff --git a/Documentation/DocBook/media/v4l/v4l2.xml b/Documentation/DocBook/media/v4l/v4l2.xml index 7e6164335..42e626d6c 100644 --- a/Documentation/DocBook/media/v4l/v4l2.xml +++ b/Documentation/DocBook/media/v4l/v4l2.xml @@ -151,6 +151,16 @@ Rubli, Andy Walls, Muralidharan Karicheri, Mauro Carvalho Chehab, structs, ioctls) must be noted in more detail in the history chapter (compat.xml), along with the possible impact on existing drivers and applications. --> + + 4.5 + 2015-10-29 + rr + Extend vidioc-g-ext-ctrls;. Replace ctrl_class with a new +union with ctrl_class and which. Which is used to select the current value of +the control or the default value. + + + 4.4 2015-05-26 diff --git a/Documentation/DocBook/media/v4l/vidioc-create-bufs.xml b/Documentation/DocBook/media/v4l/vidioc-create-bufs.xml index 8ffe74f84..d81fa0d40 100644 --- a/Documentation/DocBook/media/v4l/vidioc-create-bufs.xml +++ b/Documentation/DocBook/media/v4l/vidioc-create-bufs.xml @@ -58,7 +58,7 @@ This ioctl is used to create buffers for memory mapped or user pointer or DMA buffer I/O. It can be used as an alternative or in -addition to the VIDIOC_REQBUFS ioctl, when a tighter +addition to the &VIDIOC-REQBUFS; ioctl, when a tighter control over buffers is required. This ioctl can be called multiple times to create buffers of different sizes. @@ -71,30 +71,28 @@ zeroed. The format field specifies the image format that the buffers must be able to handle. The application has to fill in this -&v4l2-format;. Usually this will be done using the -VIDIOC_TRY_FMT or VIDIOC_G_FMT ioctl() -to ensure that the requested format is supported by the driver. Unsupported -formats will result in an error. +&v4l2-format;. Usually this will be done using the &VIDIOC-TRY-FMT; or &VIDIOC-G-FMT; ioctls +to ensure that the requested format is supported by the driver. 
+Based on the format's type field the requested buffer +size (for single-planar) or plane sizes (for multi-planar formats) will be +used for the allocated buffers. The driver may return an error if the size(s) +are not supported by the hardware (usually because they are too small). The buffers created by this ioctl will have as minimum size the size -defined by the format.pix.sizeimage field. If the +defined by the format.pix.sizeimage field (or the +corresponding fields for other format types). Usually if the format.pix.sizeimage field is less than the minimum -required for the given format, then sizeimage will be -increased by the driver to that minimum to allocate the buffers. If it is -larger, then the value will be used as-is. The same applies to the -sizeimage field of the -v4l2_plane_pix_format structure in the case of -multiplanar formats. +required for the given format, then an error will be returned since drivers will +typically not allow this. If it is larger, then the value will be used as-is. +In other words, the driver may reject the requested size, but if it is accepted +the driver will use it unchanged. When the ioctl is called with a pointer to this structure the driver will attempt to allocate up to the requested number of buffers and store the actual number allocated and the starting index in the count and the index fields respectively. On return count can be smaller than -the number requested. The driver may also increase buffer sizes if required, -however, it will not update sizeimage field values. -The user has to use VIDIOC_QUERYBUF to retrieve that -information. +the number requested. 
struct <structname>v4l2_create_buffers</structname> diff --git a/Documentation/DocBook/media/v4l/vidioc-dbg-g-chip-info.xml b/Documentation/DocBook/media/v4l/vidioc-dbg-g-chip-info.xml index 4c4603c13..f14a3bb1a 100644 --- a/Documentation/DocBook/media/v4l/vidioc-dbg-g-chip-info.xml +++ b/Documentation/DocBook/media/v4l/vidioc-dbg-g-chip-info.xml @@ -99,7 +99,7 @@ if the driver supports writing registers to the device.We recommended the v4l2-dbg utility over calling this ioctl directly. It is available from the LinuxTV v4l-dvb repository; see http://linuxtv.org/repo/ for +url="https://linuxtv.org/repo/">https://linuxtv.org/repo/ for access instructions. -http://www.linuxtv.org/lists.php"> +https://linuxtv.org/lists.php"> -http://linuxtv.org/repo/"> +https://linuxtv.org/repo/"> --------"> ----------"> ------------"> @@ -91,7 +91,7 @@ components, like mixers, PCM capture, PCM playback, etc, which are controlled via ALSA API.For additional information and for the latest development code, - see: http://linuxtv.org. + see: https://linuxtv.org.For discussing improvements, reporting troubles, sending new drivers, etc, please mail to: Linux Media Mailing List (LMML).. diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl index 7da8f0402..b442921bc 100644 --- a/Documentation/DocBook/mtdnand.tmpl +++ b/Documentation/DocBook/mtdnand.tmpl @@ -162,12 +162,15 @@ Basic defines - At least you have to provide a mtd structure and - a storage for the ioremap'ed chip address. - You can allocate the mtd structure using kmalloc - or you can allocate it statically. - In case of static allocation you have to allocate - a nand_chip structure too. + At least you have to provide a nand_chip structure + and a storage for the ioremap'ed chip address. + You can allocate the nand_chip structure using + kmalloc or you can allocate it statically. + The NAND chip structure embeds an mtd structure + which will be registered to the MTD subsystem. 
+ You can extract a pointer to the mtd structure + from a nand_chip pointer using the nand_to_mtd() + helper. Kmalloc based example @@ -180,7 +183,6 @@ static void __iomem *baseaddr; Static example -static struct mtd_info board_mtd; static struct nand_chip board_chip; static void __iomem *baseaddr; @@ -235,7 +237,7 @@ static void board_hwcontrol(struct mtd_info *mtd, int cmd) static void board_hwcontrol(struct mtd_info *mtd, int cmd) { - struct nand_chip *this = (struct nand_chip *) mtd->priv; + struct nand_chip *this = mtd_to_nand(mtd); switch(cmd){ case NAND_CTL_SETCLE: this->IO_ADDR_W |= CLE_ADRR_BIT; break; case NAND_CTL_CLRCLE: this->IO_ADDR_W &= ~CLE_ADRR_BIT; break; @@ -274,13 +276,15 @@ static int __init board_init (void) int err = 0; /* Allocate memory for MTD device structure and private data */ - board_mtd = kzalloc(sizeof(struct mtd_info) + sizeof(struct nand_chip), GFP_KERNEL); - if (!board_mtd) { + this = kzalloc(sizeof(struct nand_chip), GFP_KERNEL); + if (!this) { printk ("Unable to allocate NAND MTD device structure.\n"); err = -ENOMEM; goto out; } + board_mtd = nand_to_mtd(this); + /* map physical address */ baseaddr = ioremap(CHIP_PHYSICAL_ADDRESS, 1024); if (!baseaddr) { @@ -289,11 +293,6 @@ static int __init board_init (void) goto out_mtd; } - /* Get pointer to private data */ - this = (struct nand_chip *) (); - /* Link the private data with the MTD structure */ - board_mtd->priv = this; - /* Set address of NAND IO lines */ this->IO_ADDR_R = baseaddr; this->IO_ADDR_W = baseaddr; @@ -317,7 +316,7 @@ static int __init board_init (void) out_ior: iounmap(baseaddr); out_mtd: - kfree (board_mtd); + kfree (this); out: return err; } @@ -343,7 +342,7 @@ static void __exit board_cleanup (void) iounmap(baseaddr); /* Free the MTD device structure */ - kfree (board_mtd); + kfree (mtd_to_nand(board_mtd)); } module_exit(board_cleanup); #endif @@ -399,7 +398,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip) static void board_select_chip 
(struct mtd_info *mtd, int chip) { - struct nand_chip *this = (struct nand_chip *) mtd->priv; + struct nand_chip *this = mtd_to_nand(mtd); /* Deselect all chips */ this->IO_ADDR_R &= ~BOARD_NAND_ADDR_MASK; diff --git a/Documentation/HOWTO b/Documentation/HOWTO index 21152d397..d5a699d5a 100644 --- a/Documentation/HOWTO +++ b/Documentation/HOWTO @@ -209,7 +209,7 @@ tools. One such tool that is particularly recommended is the Linux Cross-Reference project, which is able to present source code in a self-referential, indexed webpage format. An excellent up-to-date repository of the kernel code may be found at: - http://lxr.linux.no/+trees + http://lxr.free-electrons.com/ The development process diff --git a/Documentation/Intel-IOMMU.txt b/Documentation/Intel-IOMMU.txt index 7b57fc087..49585b6e1 100644 --- a/Documentation/Intel-IOMMU.txt +++ b/Documentation/Intel-IOMMU.txt @@ -3,7 +3,7 @@ Linux IOMMU Support The architecture spec can be obtained from the below location. -http://www.intel.com/technology/virtualization/ +http://www.intel.com/content/dam/www/public/us/en/documents/product-specifications/vt-directed-io-spec.pdf This guide gives a quick cheat sheet for some basic understanding. 
diff --git a/Documentation/Makefile b/Documentation/Makefile index bc0548201..1207d7907 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -1,4 +1,4 @@ subdir-y := accounting auxdisplay blackfin connector \ filesystems filesystems ia64 laptops mic misc-devices \ - networking pcmcia prctl ptp spi timers vDSO video4linux \ + networking pcmcia prctl ptp timers vDSO video4linux \ watchdog diff --git a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png new file mode 100644 index 000000000..7496a55e4 Binary files /dev/null and b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png differ diff --git a/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg new file mode 100644 index 000000000..4b4014fda --- /dev/null +++ b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg @@ -0,0 +1,374 @@ + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + synchronize_rcu() + + + + + + + WRITE_ONCE(a, 1); + WRITE_ONCE(b, 1); + r1 = READ_ONCE(a); + WRITE_ONCE(c, 1); + r2 = READ_ONCE(b); + r3 = READ_ONCE(c); + thread0() + thread1() + thread2() + + + + rcu_read_lock(); + rcu_read_lock(); + rcu_read_unlock(); + rcu_read_unlock(); + + QS + + QS + + + QS + + diff --git a/Documentation/RCU/Design/Requirements/RCUApplicability.svg b/Documentation/RCU/Design/Requirements/RCUApplicability.svg new file mode 100644 index 000000000..ebcbeee39 --- /dev/null +++ b/Documentation/RCU/Design/Requirements/RCUApplicability.svg @@ -0,0 +1,237 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + Read-Mostly, Stale & + + Inconsistent Data OK + + (RCU Works Great!!!) + + (RCU Works Well) + + Read-Mostly, Need Consistent Data + + Read-Write, Need Consistent Data + + Update-Mostly, Need Consistent Data + + (RCU Might Be OK...) 
+ + (1) Provide Existence Guarantees For Update-Friendly Mechanisms + + (2) Provide Wait-Free Read-Side Primitives for Real-Time Use) + + (RCU is Very Unlikely to be the Right Tool For The Job, But it Can: + + diff --git a/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg new file mode 100644 index 000000000..48cd1623d --- /dev/null +++ b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg @@ -0,0 +1,639 @@ + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + synchronize_rcu() + + + + + + + WRITE_ONCE(a, 1); + WRITE_ONCE(b, 1); + r1 = READ_ONCE(a); + WRITE_ONCE(c, 1); + WRITE_ONCE(d, 1); + r2 = READ_ONCE(c); + thread0() + thread1() + thread2() + + + + rcu_read_lock(); + rcu_read_lock(); + rcu_read_unlock(); + rcu_read_unlock(); + + QS + + QS + + + QS + + + + synchronize_rcu() + + + + + + + r3 = READ_ONCE(d); + WRITE_ONCE(e, 1); + + QS + r4 = READ_ONCE(b); + r5 = READ_ONCE(e); + rcu_read_lock(); + rcu_read_unlock(); + QS + + QS + + QS + + thread3() + thread4() + + diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html new file mode 100644 index 000000000..a725f9900 --- /dev/null +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -0,0 +1,2897 @@ + + + + + A Tour Through RCU's Requirements [LWN.net] + + +

A Tour Through RCU's Requirements

+ +

Copyright IBM Corporation, 2015

+

Author: Paul E. McKenney

+

The initial version of this document appeared in the +LWN articles +here, +here, and +here.

+ +

Introduction

+ +

+Read-copy update (RCU) is a synchronization mechanism that is often +used as a replacement for reader-writer locking. +RCU is unusual in that updaters do not block readers, +which means that RCU's read-side primitives can be exceedingly fast +and scalable. +In addition, updaters can make useful forward progress concurrently +with readers. +However, all this concurrency between RCU readers and updaters does raise +the question of exactly what RCU readers are doing, which in turn +raises the question of exactly what RCU's requirements are. + +

+This document therefore summarizes RCU's requirements, and can be thought +of as an informal, high-level specification for RCU. +It is important to understand that RCU's specification is primarily +empirical in nature; +in fact, I learned about many of these requirements the hard way. +This situation might cause some consternation; however, not only +has this learning process been a lot of fun, but it has also been +a great privilege to work with so many people willing to apply +technologies in interesting new ways. + +

+All that aside, here are the categories of currently known RCU requirements: +

+ +
    +
  1. + Fundamental Requirements +
  2. Fundamental Non-Requirements +
  3. + Parallelism Facts of Life +
  4. + Quality-of-Implementation Requirements +
  5. + Linux Kernel Complications +
  6. + Software-Engineering Requirements +
  7. + Other RCU Flavors +
  8. + Possible Future Changes +
+ +

+This is followed by a summary, +which is in turn followed by the inevitable +answers to the quick quizzes. + +

Fundamental Requirements

+ +

+RCU's fundamental requirements are the closest thing RCU has to hard +mathematical requirements. +These are: + +

    +
  1. + Grace-Period Guarantee +
  2. + Publish-Subscribe Guarantee +
  3. + Memory-Barrier Guarantees +
  4. + RCU Primitives Guaranteed to Execute Unconditionally +
  5. + Guaranteed Read-to-Write Upgrade +
+ +

Grace-Period Guarantee

+ +

+RCU's grace-period guarantee is unusual in being premeditated: +Jack Slingwine and I had this guarantee firmly in mind when we started +work on RCU (then called “rclock”) in the early 1990s. +That said, the past two decades of experience with RCU have produced +a much more detailed understanding of this guarantee. + +

+RCU's grace-period guarantee allows updaters to wait for the completion +of all pre-existing RCU read-side critical sections. +An RCU read-side critical section +begins with the marker rcu_read_lock() and ends with +the marker rcu_read_unlock(). +These markers may be nested, and RCU treats a nested set as one +big RCU read-side critical section. +Production-quality implementations of rcu_read_lock() and +rcu_read_unlock() are extremely lightweight, and in +fact have exactly zero overhead in Linux kernels built for production +use with CONFIG_PREEMPT=n. + +

+This guarantee allows ordering to be enforced with extremely low +overhead to readers, for example: + +

+
+ 1 int x, y;
+ 2
+ 3 void thread0(void)
+ 4 {
+ 5   rcu_read_lock();
+ 6   r1 = READ_ONCE(x);
+ 7   r2 = READ_ONCE(y);
+ 8   rcu_read_unlock();
+ 9 }
+10
+11 void thread1(void)
+12 {
+13   WRITE_ONCE(x, 1);
+14   synchronize_rcu();
+15   WRITE_ONCE(y, 1);
+16 }
+
+
+ +

+Because the synchronize_rcu() on line 14 waits for +all pre-existing readers, any instance of thread0() that +loads a value of zero from x must complete before +thread1() stores to y, so that instance must +also load a value of zero from y. +Similarly, any instance of thread0() that loads a value of +one from y must have started after the +synchronize_rcu() started, and must therefore also load +a value of one from x. +Therefore, the outcome: +

+
+(r1 == 0 && r2 == 1)
+
+
+cannot happen. + +

Quick Quiz 1: +Wait a minute! +You said that updaters can make useful forward progress concurrently +with readers, but pre-existing readers will block +synchronize_rcu()!!! +Just who are you trying to fool??? +
Answer + +

+This scenario resembles one of the first uses of RCU in +DYNIX/ptx, +which managed a distributed lock manager's transition into +a state suitable for handling recovery from node failure, +more or less as follows: + +

+
+ 1 #define STATE_NORMAL        0
+ 2 #define STATE_WANT_RECOVERY 1
+ 3 #define STATE_RECOVERING    2
+ 4 #define STATE_WANT_NORMAL   3
+ 5
+ 6 int state = STATE_NORMAL;
+ 7
+ 8 void do_something_dlm(void)
+ 9 {
+10   int state_snap;
+11
+12   rcu_read_lock();
+13   state_snap = READ_ONCE(state);
+14   if (state_snap == STATE_NORMAL)
+15     do_something();
+16   else
+17     do_something_carefully();
+18   rcu_read_unlock();
+19 }
+20
+21 void start_recovery(void)
+22 {
+23   WRITE_ONCE(state, STATE_WANT_RECOVERY);
+24   synchronize_rcu();
+25   WRITE_ONCE(state, STATE_RECOVERING);
+26   recovery();
+27   WRITE_ONCE(state, STATE_WANT_NORMAL);
+28   synchronize_rcu();
+29   WRITE_ONCE(state, STATE_NORMAL);
+30 }
+
+
+ +

+The RCU read-side critical section in do_something_dlm() +works with the synchronize_rcu() in start_recovery() +to guarantee that do_something() never runs concurrently +with recovery(), but with little or no synchronization +overhead in do_something_dlm(). + +

Quick Quiz 2: +Why is the synchronize_rcu() on line 28 needed? +
Answer + +

+In order to avoid fatal problems such as deadlocks, +an RCU read-side critical section must not contain calls to +synchronize_rcu(). +Similarly, an RCU read-side critical section must not +contain anything that waits, directly or indirectly, on completion of +an invocation of synchronize_rcu(). + +

+Although RCU's grace-period guarantee is useful in and of itself, with +quite a few use cases, +it would be good to be able to use RCU to coordinate read-side +access to linked data structures. +For this, the grace-period guarantee is not sufficient, as can +be seen in function add_gp_buggy() below. +We will look at the reader's code later, but in the meantime, just think of +the reader as locklessly picking up the gp pointer, +and, if the value loaded is non-NULL, locklessly accessing the +->a and ->b fields. + +

+
+ 1 bool add_gp_buggy(int a, int b)
+ 2 {
+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4   if (!p)
+ 5     return -ENOMEM;
+ 6   spin_lock(&gp_lock);
+ 7   if (rcu_access_pointer(gp)) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   p->a = a;
+12   p->b = a;
+13   gp = p; /* ORDERING BUG */
+14   spin_unlock(&gp_lock);
+15   return true;
+16 }
+
+
+ +

+The problem is that both the compiler and weakly ordered CPUs are within +their rights to reorder this code as follows: + +

+
+ 1 bool add_gp_buggy_optimized(int a, int b)
+ 2 {
+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4   if (!p)
+ 5     return -ENOMEM;
+ 6   spin_lock(&gp_lock);
+ 7   if (rcu_access_pointer(gp)) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   gp = p; /* ORDERING BUG */
+12   p->a = a;
+13   p->b = a;
+14   spin_unlock(&gp_lock);
+15   return true;
+16 }
+
+
+ +

+If an RCU reader fetches gp just after +add_gp_buggy_optimized executes line 11, +it will see garbage in the ->a and ->b +fields. +And this is but one of many ways in which compiler and hardware optimizations +could cause trouble. +Therefore, we clearly need some way to prevent the compiler and the CPU from +reordering in this manner, which brings us to the publish-subscribe +guarantee discussed in the next section. + +

Publish/Subscribe Guarantee

+ +

+RCU's publish-subscribe guarantee allows data to be inserted +into a linked data structure without disrupting RCU readers. +The updater uses rcu_assign_pointer() to insert the +new data, and readers use rcu_dereference() to +access data, whether new or old. +The following shows an example of insertion: + +

+
+ 1 bool add_gp(int a, int b)
+ 2 {
+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4   if (!p)
+ 5     return -ENOMEM;
+ 6   spin_lock(&gp_lock);
+ 7   if (rcu_access_pointer(gp)) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   p->a = a;
+12   p->b = a;
+13   rcu_assign_pointer(gp, p);
+14   spin_unlock(&gp_lock);
+15   return true;
+16 }
+
+
+ +

+The rcu_assign_pointer() on line 13 is conceptually +equivalent to a simple assignment statement, but also guarantees +that its assignment will +happen after the two assignments in lines 11 and 12, +similar to the C11 memory_order_release store operation. +It also prevents any number of “interesting” compiler +optimizations, for example, the use of gp as a scratch +location immediately preceding the assignment. + +

Quick Quiz 3: +But rcu_assign_pointer() does nothing to prevent the +two assignments to p->a and p->b +from being reordered. +Can't that also cause problems? +
Answer + +

+It is tempting to assume that the reader need not do anything special +to control its accesses to the RCU-protected data, +as shown in do_something_gp_buggy() below: + +

+
+ 1 bool do_something_gp_buggy(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   p = gp;  /* OPTIMIZATIONS GALORE!!! */
+ 5   if (p) {
+ 6     do_something(p->a, p->b);
+ 7     rcu_read_unlock();
+ 8     return true;
+ 9   }
+10   rcu_read_unlock();
+11   return false;
+12 }
+
+
+ +

+However, this temptation must be resisted because there are a +surprisingly large number of ways that the compiler +(to say nothing of +DEC Alpha CPUs) +can trip this code up. +For but one example, if the compiler were short of registers, it +might choose to refetch from gp rather than keeping +a separate copy in p as follows: + +

+
+ 1 bool do_something_gp_buggy_optimized(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   if (gp) { /* OPTIMIZATIONS GALORE!!! */
+ 5     do_something(gp->a, gp->b);
+ 6     rcu_read_unlock();
+ 7     return true;
+ 8   }
+ 9   rcu_read_unlock();
+10   return false;
+11 }
+
+
+ +

+If this function ran concurrently with a series of updates that +replaced the current structure with a new one, +the fetches of gp->a +and gp->b might well come from two different structures, +which could cause serious confusion. +To prevent this (and much else besides), do_something_gp() uses +rcu_dereference() to fetch from gp: + +

+
+ 1 bool do_something_gp(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   p = rcu_dereference(gp);
+ 5   if (p) {
+ 6     do_something(p->a, p->b);
+ 7     rcu_read_unlock();
+ 8     return true;
+ 9   }
+10   rcu_read_unlock();
+11   return false;
+12 }
+
+
+ +

+The rcu_dereference() uses volatile casts and (for DEC Alpha) +memory barriers in the Linux kernel. +Should a +high-quality implementation of C11 memory_order_consume [PDF] +ever appear, then rcu_dereference() could be implemented +as a memory_order_consume load. +Regardless of the exact implementation, a pointer fetched by +rcu_dereference() may not be used outside of the +outermost RCU read-side critical section containing that +rcu_dereference(), unless protection of +the corresponding data element has been passed from RCU to some +other synchronization mechanism, most commonly locking or +reference counting. + +

+In short, updaters use rcu_assign_pointer() and readers +use rcu_dereference(), and these two RCU API elements +work together to ensure that readers have a consistent view of +newly added data elements. + +

+Of course, it is also necessary to remove elements from RCU-protected +data structures, for example, using the following process: + +

    +
  1. Remove the data element from the enclosing structure. +
  2. Wait for all pre-existing RCU read-side critical sections + to complete (because only pre-existing readers can possibly have + a reference to the newly removed data element). +
  3. At this point, only the updater has a reference to the + newly removed data element, so it can safely reclaim + the data element, for example, by passing it to kfree(). +
+ +This process is implemented by remove_gp_synchronous(): + +
+
+ 1 bool remove_gp_synchronous(void)
+ 2 {
+ 3   struct foo *p;
+ 4
+ 5   spin_lock(&gp_lock);
+ 6   p = rcu_access_pointer(gp);
+ 7   if (!p) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   rcu_assign_pointer(gp, NULL);
+12   spin_unlock(&gp_lock);
+13   synchronize_rcu();
+14   kfree(p);
+15   return true;
+16 }
+
+
+ +

+This function is straightforward, with line 13 waiting for a grace +period before line 14 frees the old data element. +This waiting ensures that readers will reach line 7 of +do_something_gp() before the data element referenced by +p is freed. +The rcu_access_pointer() on line 6 is similar to +rcu_dereference(), except that: + +

    +
  1. The value returned by rcu_access_pointer() + cannot be dereferenced. + If you want to access the value pointed to as well as + the pointer itself, use rcu_dereference() + instead of rcu_access_pointer(). +
  2. The call to rcu_access_pointer() need not be + protected. + In contrast, rcu_dereference() must either be + within an RCU read-side critical section or in a code + segment where the pointer cannot change, for example, in + code protected by the corresponding update-side lock. +
+ +

Quick Quiz 4: +Without the rcu_dereference() or the +rcu_access_pointer(), what destructive optimizations +might the compiler make use of? +
Answer + +

+In short, RCU's publish-subscribe guarantee is provided by the combination +of rcu_assign_pointer() and rcu_dereference(). +This guarantee allows data elements to be safely added to RCU-protected +linked data structures without disrupting RCU readers. +This guarantee can be used in combination with the grace-period +guarantee to also allow data elements to be removed from RCU-protected +linked data structures, again without disrupting RCU readers. + +

+This guarantee was only partially premeditated. +DYNIX/ptx used an explicit memory barrier for publication, but had nothing +resembling rcu_dereference() for subscription, nor did it +have anything resembling the smp_read_barrier_depends() +that was later subsumed into rcu_dereference(). +The need for these operations made itself known quite suddenly at a +late-1990s meeting with the DEC Alpha architects, back in the days when +DEC was still a free-standing company. +It took the Alpha architects a good hour to convince me that any sort +of barrier would ever be needed, and it then took me a good two hours +to convince them that their documentation did not make this point clear. +More recent work with the C and C++ standards committees has provided +much education on tricks and traps from the compiler. +In short, compilers were much less tricky in the early 1990s, but in +2015, don't even think about omitting rcu_dereference()! + +

Memory-Barrier Guarantees

+ +

+The previous section's simple linked-data-structure scenario clearly +demonstrates the need for RCU's stringent memory-ordering guarantees on +systems with more than one CPU: + +

    +
  1. Each CPU that has an RCU read-side critical section that + begins before synchronize_rcu() starts is + guaranteed to execute a full memory barrier between the time + that the RCU read-side critical section ends and the time that + synchronize_rcu() returns. + Without this guarantee, a pre-existing RCU read-side critical section + might hold a reference to the newly removed struct foo + after the kfree() on line 14 of + remove_gp_synchronous(). +
  2. Each CPU that has an RCU read-side critical section that ends + after synchronize_rcu() returns is guaranteed + to execute a full memory barrier between the time that + synchronize_rcu() begins and the time that the RCU + read-side critical section begins. + Without this guarantee, a later RCU read-side critical section + running after the kfree() on line 14 of + remove_gp_synchronous() might + later run do_something_gp() and find the + newly deleted struct foo. +
  3. If the task invoking synchronize_rcu() remains + on a given CPU, then that CPU is guaranteed to execute a full + memory barrier sometime during the execution of + synchronize_rcu(). + This guarantee ensures that the kfree() on + line 14 of remove_gp_synchronous() really does + execute after the removal on line 11. +
  4. If the task invoking synchronize_rcu() migrates + among a group of CPUs during that invocation, then each of the + CPUs in that group is guaranteed to execute a full memory barrier + sometime during the execution of synchronize_rcu(). + This guarantee also ensures that the kfree() on + line 14 of remove_gp_synchronous() really does + execute after the removal on + line 11, but also in the case where the thread executing the + synchronize_rcu() migrates in the meantime. +
+ +

Quick Quiz 5: +Given that multiple CPUs can start RCU read-side critical sections +at any time without any ordering whatsoever, how can RCU possibly tell whether +or not a given RCU read-side critical section starts before a +given instance of synchronize_rcu()? +
Answer + +

Quick Quiz 6: +The first and second guarantees require unbelievably strict ordering! +Are all these memory barriers really required? +
Answer + +

+Note that these memory-barrier requirements do not replace the fundamental +RCU requirement that a grace period wait for all pre-existing readers. +On the contrary, the memory barriers called out in this section must operate in +such a way as to enforce this fundamental requirement. +Of course, different implementations enforce this requirement in different +ways, but enforce it they must. + +

RCU Primitives Guaranteed to Execute Unconditionally

+ +

+The common-case RCU primitives are unconditional. +They are invoked, they do their job, and they return, with no possibility +of error, and no need to retry. +This is a key RCU design philosophy. + +

+However, this philosophy is pragmatic rather than pigheaded. +If someone comes up with a good justification for a particular conditional +RCU primitive, it might well be implemented and added. +After all, this guarantee was reverse-engineered, not premeditated. +The unconditional nature of the RCU primitives was initially an +accident of implementation, and later experience with synchronization +primitives with conditional primitives caused me to elevate this +accident to a guarantee. +Therefore, the justification for adding a conditional primitive to +RCU would need to be based on detailed and compelling use cases. + +

Guaranteed Read-to-Write Upgrade

+ +

+As far as RCU is concerned, it is always possible to carry out an +update within an RCU read-side critical section. +For example, that RCU read-side critical section might search for +a given data element, and then might acquire the update-side +spinlock in order to update that element, all while remaining +in that RCU read-side critical section. +Of course, it is necessary to exit the RCU read-side critical section +before invoking synchronize_rcu(); however, this +inconvenience can be avoided through use of the +call_rcu() and kfree_rcu() API members +described later in this document. + +

Quick Quiz 7: +But how does the upgrade-to-write operation exclude other readers? +
Answer + +

+This guarantee allows lookup code to be shared between read-side +and update-side code, and was premeditated, appearing in the earliest +DYNIX/ptx RCU documentation. + +

Fundamental Non-Requirements

+ +

+RCU provides extremely lightweight readers, and its read-side guarantees, +though quite useful, are correspondingly lightweight. +It is therefore all too easy to assume that RCU is guaranteeing more +than it really is. +Of course, the list of things that RCU does not guarantee is infinitely +long; however, the following sections list a few non-guarantees that +have caused confusion. +Except where otherwise noted, these non-guarantees were premeditated. + +

    +
  1. + Readers Impose Minimal Ordering +
  2. + Readers Do Not Exclude Updaters +
  3. + Updaters Only Wait For Old Readers +
  4. + Grace Periods Don't Partition Read-Side Critical Sections +
  5. + Read-Side Critical Sections Don't Partition Grace Periods +
  6. + Disabling Preemption Does Not Block Grace Periods +
+ +

Readers Impose Minimal Ordering

+ +

+Reader-side markers such as rcu_read_lock() and +rcu_read_unlock() provide absolutely no ordering guarantees +except through their interaction with the grace-period APIs such as +synchronize_rcu(). +To see this, consider the following pair of threads: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(x, 1);
+ 5   rcu_read_unlock();
+ 6   rcu_read_lock();
+ 7   WRITE_ONCE(y, 1);
+ 8   rcu_read_unlock();
+ 9 }
+10
+11 void thread1(void)
+12 {
+13   rcu_read_lock();
+14   r1 = READ_ONCE(y);
+15   rcu_read_unlock();
+16   rcu_read_lock();
+17   r2 = READ_ONCE(x);
+18   rcu_read_unlock();
+19 }
+
+
+ +

+After thread0() and thread1() execute +concurrently, it is quite possible to have + +

+
+(r1 == 1 && r2 == 0)
+
+
+ +(that is, y appears to have been assigned before x), +which would not be possible if rcu_read_lock() and +rcu_read_unlock() had much in the way of ordering +properties. +But they do not, so the CPU is within its rights +to do significant reordering. +This is by design: Any significant ordering constraints would slow down +these fast-path APIs. + +

Quick Quiz 8: +Can't the compiler also reorder this code? +
Answer + +

Readers Do Not Exclude Updaters

+ +

+Neither rcu_read_lock() nor rcu_read_unlock() +excludes updates. +All they do is prevent grace periods from ending. +The following example illustrates this: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   r1 = READ_ONCE(y);
+ 5   if (r1) {
+ 6     do_something_with_nonzero_x();
+ 7     r2 = READ_ONCE(x);
+ 8     WARN_ON(!r2); /* BUG!!! */
+ 9   }
+10   rcu_read_unlock();
+11 }
+12
+13 void thread1(void)
+14 {
+15   spin_lock(&my_lock);
+16   WRITE_ONCE(x, 1);
+17   WRITE_ONCE(y, 1);
+18   spin_unlock(&my_lock);
+19 }
+
+
+ +

+If the thread0() function's rcu_read_lock() +excluded the thread1() function's update, +the WARN_ON() could never fire. +But the fact is that rcu_read_lock() does not exclude +much of anything aside from subsequent grace periods, of which +thread1() has none, so the +WARN_ON() can and does fire. + +

Updaters Only Wait For Old Readers

+ +

+It might be tempting to assume that after synchronize_rcu() +completes, there are no readers executing. +This temptation must be avoided because +new readers can start immediately after synchronize_rcu() +starts, and synchronize_rcu() is under no +obligation to wait for these new readers. + +

Quick Quiz 9: +Suppose that synchronize_rcu() did wait until all readers had completed. +Would the updater be able to rely on this? +
Answer + +

+Grace Periods Don't Partition Read-Side Critical Sections

+ +

+It is tempting to assume that if any part of one RCU read-side critical +section precedes a given grace period, and if any part of another RCU +read-side critical section follows that same grace period, then all of +the first RCU read-side critical section must precede all of the second. +However, this just isn't the case: A single grace period does not +partition the set of RCU read-side critical sections. +An example of this situation can be illustrated as follows, where +a, b, and c are initially all zero: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(a, 1);
+ 5   WRITE_ONCE(b, 1);
+ 6   rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11   r1 = READ_ONCE(a);
+12   synchronize_rcu();
+13   WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18   rcu_read_lock();
+19   r2 = READ_ONCE(b);
+20   r3 = READ_ONCE(c);
+21   rcu_read_unlock();
+22 }
+
+
+ +

+It turns out that the outcome: + +

+
+(r1 == 1 && r2 == 0 && r3 == 1)
+
+
+ +is entirely possible. +The following figure shows how this can happen, with each circled +QS indicating the point at which RCU recorded a +quiescent state for each thread, that is, a state in which +RCU knows that the thread cannot be in the midst of an RCU read-side +critical section that started before the current grace period: + +

GPpartitionReaders1.svg

+ +

+If it is necessary to partition RCU read-side critical sections in this +manner, it is necessary to use two grace periods, where the first +grace period is known to end before the second grace period starts: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(a, 1);
+ 5   WRITE_ONCE(b, 1);
+ 6   rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11   r1 = READ_ONCE(a);
+12   synchronize_rcu();
+13   WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18   r2 = READ_ONCE(c);
+19   synchronize_rcu();
+20   WRITE_ONCE(d, 1);
+21 }
+22
+23 void thread3(void)
+24 {
+25   rcu_read_lock();
+26   r3 = READ_ONCE(b);
+27   r4 = READ_ONCE(d);
+28   rcu_read_unlock();
+29 }
+
+
+ +

+Here, if (r1 == 1), then +thread0()'s write to b must happen +before the end of thread1()'s grace period. +If in addition (r4 == 1), then +thread3()'s read from b must happen +after the beginning of thread2()'s grace period. +If it is also the case that (r2 == 1), then the +end of thread1()'s grace period must precede the +beginning of thread2()'s grace period. +This means that the two RCU read-side critical sections cannot overlap, +guaranteeing that (r3 == 1). +As a result, the outcome: + +

+
+(r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1)
+
+
+ +cannot happen. + +

+This non-requirement was also non-premeditated, but became apparent +when studying RCU's interaction with memory ordering. + +

+Read-Side Critical Sections Don't Partition Grace Periods

+ +

+It is also tempting to assume that if an RCU read-side critical section +happens between a pair of grace periods, then those grace periods cannot +overlap. +However, this temptation leads nowhere good, as can be illustrated by +the following, with all variables initially zero: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(a, 1);
+ 5   WRITE_ONCE(b, 1);
+ 6   rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11   r1 = READ_ONCE(a);
+12   synchronize_rcu();
+13   WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18   rcu_read_lock();
+19   WRITE_ONCE(d, 1);
+20   r2 = READ_ONCE(c);
+21   rcu_read_unlock();
+22 }
+23
+24 void thread3(void)
+25 {
+26   r3 = READ_ONCE(d);
+27   synchronize_rcu();
+28   WRITE_ONCE(e, 1);
+29 }
+30
+31 void thread4(void)
+32 {
+33   rcu_read_lock();
+34   r4 = READ_ONCE(b);
+35   r5 = READ_ONCE(e);
+36   rcu_read_unlock();
+37 }
+
+
+ +

+In this case, the outcome: + +

+
+(r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1)
+
+
+ +is entirely possible, as illustrated below: + +

ReadersPartitionGP1.svg

+ +

+Again, an RCU read-side critical section can overlap almost all of a +given grace period, just so long as it does not overlap the entire +grace period. +As a result, an RCU read-side critical section cannot partition a pair +of RCU grace periods. + +

Quick Quiz 10: +How long a sequence of grace periods, each separated by an RCU read-side +critical section, would be required to partition the RCU read-side +critical sections at the beginning and end of the chain? +
Answer + +

+Disabling Preemption Does Not Block Grace Periods

+ +

+There was a time when disabling preemption on any given CPU would block +subsequent grace periods. +However, this was an accident of implementation and is not a requirement. +And in the current Linux-kernel implementation, disabling preemption +on a given CPU in fact does not block grace periods, as Oleg Nesterov +demonstrated. + +

+If you need a preempt-disable region to block grace periods, you need to add +rcu_read_lock() and rcu_read_unlock(), for example +as follows: + +

+
+ 1 preempt_disable();
+ 2 rcu_read_lock();
+ 3 do_something();
+ 4 rcu_read_unlock();
+ 5 preempt_enable();
+ 6
+ 7 /* Spinlocks implicitly disable preemption. */
+ 8 spin_lock(&mylock);
+ 9 rcu_read_lock();
+10 do_something();
+11 rcu_read_unlock();
+12 spin_unlock(&mylock);
+
+
+ +

+In theory, you could enter the RCU read-side critical section first, +but it is more efficient to keep the entire RCU read-side critical +section contained in the preempt-disable region as shown above. +Of course, RCU read-side critical sections that extend outside of +preempt-disable regions will work correctly, but such critical sections +can be preempted, which forces rcu_read_unlock() to do +more work. +And no, this is not an invitation to enclose all of your RCU +read-side critical sections within preempt-disable regions, because +doing so would degrade real-time response. + +

+This non-requirement appeared with preemptible RCU. +If you need a grace period that waits on non-preemptible code regions, use +RCU-sched. + +

Parallelism Facts of Life

+ +

+These parallelism facts of life are by no means specific to RCU, but +the RCU implementation must abide by them. +They therefore bear repeating: + +

    +
  1. Any CPU or task may be delayed at any time, + and any attempts to avoid these delays by disabling + preemption, interrupts, or whatever are completely futile. + This is most obvious in preemptible user-level + environments and in virtualized environments (where + a given guest OS's VCPUs can be preempted at any time by + the underlying hypervisor), but can also happen in bare-metal + environments due to ECC errors, NMIs, and other hardware + events. + Although a delay of more than about 20 seconds can result + in splats, the RCU implementation is obligated to use + algorithms that can tolerate extremely long delays, but where + “extremely long” is not long enough to allow + wrap-around when incrementing a 64-bit counter. +
  2. Both the compiler and the CPU can reorder memory accesses. + Where it matters, RCU must use compiler directives and + memory-barrier instructions to preserve ordering. +
  3. Conflicting writes to memory locations in any given cache line + will result in expensive cache misses. + Greater numbers of concurrent writes and more-frequent + concurrent writes will result in more dramatic slowdowns. + RCU is therefore obligated to use algorithms that have + sufficient locality to avoid significant performance and + scalability problems. +
  4. As a rough rule of thumb, only one CPU's worth of processing + may be carried out under the protection of any given exclusive + lock. + RCU must therefore use scalable locking designs. +
  5. Counters are finite, especially on 32-bit systems. + RCU's use of counters must therefore tolerate counter wrap, + or be designed such that counter wrap would take way more + time than a single system is likely to run. + An uptime of ten years is quite possible, a runtime + of a century much less so. + As an example of the latter, RCU's dyntick-idle nesting counter + allows 54 bits for interrupt nesting level (this counter + is 64 bits even on a 32-bit system). + Overflowing this counter requires 2^54 + half-interrupts on a given CPU without that CPU ever going idle. + If a half-interrupt happened every microsecond, it would take + 570 years of runtime to overflow this counter, which is currently + believed to be an acceptably long time. +
  6. Linux systems can have thousands of CPUs running a single + Linux kernel in a single shared-memory environment. + RCU must therefore pay close attention to high-end scalability. +
+ +

+This last parallelism fact of life means that RCU must pay special +attention to the preceding facts of life. +The idea that Linux might scale to systems with thousands of CPUs would +have been met with some skepticism in the 1990s, but these requirements +would otherwise have been unsurprising, even in the early 1990s. + +

Quality-of-Implementation Requirements

+ +

+These sections list quality-of-implementation requirements. +Although an RCU implementation that ignores these requirements could +still be used, it would likely be subject to limitations that would +make it inappropriate for industrial-strength production use. +Classes of quality-of-implementation requirements are as follows: + +

    +
  1. Specialization +
  2. Performance and Scalability +
  3. Composability +
  4. Corner Cases +
+ +

+These classes are covered in the following sections. + +

Specialization

+ +

+RCU is and always has been intended primarily for read-mostly situations, as +illustrated by the following figure. +This means that RCU's read-side primitives are optimized, often at the +expense of its update-side primitives. + +

RCUApplicability.svg

+ +

+This focus on read-mostly situations means that RCU must interoperate +with other synchronization primitives. +For example, the add_gp() and remove_gp_synchronous() +examples discussed earlier use RCU to protect readers and locking to +coordinate updaters. +However, the need extends much farther, requiring that a variety of +synchronization primitives be legal within RCU read-side critical sections, +including spinlocks, sequence locks, atomic operations, reference +counters, and memory barriers. + +

Quick Quiz 11: +What about sleeping locks? +
Answer + +

+It often comes as a surprise that many algorithms do not require a +consistent view of data, but many can function in that mode, +with network routing being the poster child. +Internet routing algorithms take significant time to propagate +updates, so that by the time an update arrives at a given system, +that system has been sending network traffic the wrong way for +a considerable length of time. +Having a few threads continue to send traffic the wrong way for a +few more milliseconds is clearly not a problem: In the worst case, +TCP retransmissions will eventually get the data where it needs to go. +In general, when tracking the state of the universe outside of the +computer, some level of inconsistency must be tolerated due to +speed-of-light delays if nothing else. + +

+Furthermore, uncertainty about external state is inherent in many cases. +For example, a pair of veterinarians might use heartbeat to determine +whether or not a given cat was alive. +But how long should they wait after the last heartbeat to decide that +the cat is in fact dead? +Waiting less than 400 milliseconds makes no sense because this would +mean that a relaxed cat would be considered to cycle between death +and life more than 100 times per minute. +Moreover, just as with human beings, a cat's heart might stop for +some period of time, so the exact wait period is a judgment call. +One of our pair of veterinarians might wait 30 seconds before pronouncing +the cat dead, while the other might insist on waiting a full minute. +The two veterinarians would then disagree on the state of the cat during +the final 30 seconds of the minute following the last heartbeat, as +fancifully illustrated below: + +

2013-08-is-it-dead.png

+ +

+Interestingly enough, this same situation applies to hardware. +When push comes to shove, how do we tell whether or not some +external server has failed? +We send messages to it periodically, and declare it failed if we +don't receive a response within a given period of time. +Policy decisions can usually tolerate short +periods of inconsistency. +The policy was decided some time ago, and is only now being put into +effect, so a few milliseconds of delay is normally inconsequential. + +

+However, there are algorithms that absolutely must see consistent data. +For example, the translation from a user-level SystemV semaphore +ID to the corresponding in-kernel data structure is protected by RCU, +but it is absolutely forbidden to update a semaphore that has just been +removed. +In the Linux kernel, this need for consistency is accommodated by acquiring +spinlocks located in the in-kernel data structure from within +the RCU read-side critical section, and this is indicated by the +green box in the figure above. +Many other techniques may be used, and are in fact used within the +Linux kernel. + +

+In short, RCU is not required to maintain consistency, and other +mechanisms may be used in concert with RCU when consistency is required. +RCU's specialization allows it to do its job extremely well, and its +ability to interoperate with other synchronization mechanisms allows +the right mix of synchronization tools to be used for a given job. + +

Performance and Scalability

+ +

+Energy efficiency is a critical component of performance today, +and Linux-kernel RCU implementations must therefore avoid unnecessarily +awakening idle CPUs. +I cannot claim that this requirement was premeditated. +In fact, I learned of it during a telephone conversation in which I +was given “frank and open” feedback on the importance +of energy efficiency in battery-powered systems and on specific +energy-efficiency shortcomings of the Linux-kernel RCU implementation. +In my experience, the battery-powered embedded community will consider +any unnecessary wakeups to be extremely unfriendly acts. +So much so that mere Linux-kernel-mailing-list posts are +insufficient to vent their ire. + +

+Memory consumption is not particularly important in most +situations, and has become decreasingly +so as memory sizes have expanded and memory +costs have plummeted. +However, as I learned from Matt Mackall's +bloatwatch +efforts, memory footprint is critically important on single-CPU systems with +non-preemptible (CONFIG_PREEMPT=n) kernels, and thus +tiny RCU +was born. +Josh Triplett has since taken over the small-memory banner with his +Linux kernel tinification +project, which resulted in +SRCU +becoming optional for those kernels not needing it. + +

+The remaining performance requirements are, for the most part, +unsurprising. +For example, in keeping with RCU's read-side specialization, +rcu_dereference() should have negligible overhead (for +example, suppression of a few minor compiler optimizations). +Similarly, in non-preemptible environments, rcu_read_lock() and +rcu_read_unlock() should have exactly zero overhead. + +

+In preemptible environments, in the case where the RCU read-side +critical section was not preempted (as will be the case for the +highest-priority real-time process), rcu_read_lock() and +rcu_read_unlock() should have minimal overhead. +In particular, they should not contain atomic read-modify-write +operations, memory-barrier instructions, preemption disabling, +interrupt disabling, or backwards branches. +However, in the case where the RCU read-side critical section was preempted, +rcu_read_unlock() may acquire spinlocks and disable interrupts. +This is why it is better to nest an RCU read-side critical section +within a preempt-disable region than vice versa, at least in cases +where that critical section is short enough to avoid unduly degrading +real-time latencies. + +

+The synchronize_rcu() grace-period-wait primitive is +optimized for throughput. +It may therefore incur several milliseconds of latency in addition to +the duration of the longest RCU read-side critical section. +On the other hand, multiple concurrent invocations of +synchronize_rcu() are required to use batching optimizations +so that they can be satisfied by a single underlying grace-period-wait +operation. +For example, in the Linux kernel, it is not unusual for a single +grace-period-wait operation to serve more than +1,000 separate invocations +of synchronize_rcu(), thus amortizing the per-invocation +overhead down to nearly zero. +However, the grace-period optimization is also required to avoid +measurable degradation of real-time scheduling and interrupt latencies. + +

+In some cases, the multi-millisecond synchronize_rcu() +latencies are unacceptable. +In these cases, synchronize_rcu_expedited() may be used +instead, reducing the grace-period latency down to a few tens of +microseconds on small systems, at least in cases where the RCU read-side +critical sections are short. +There are currently no special latency requirements for +synchronize_rcu_expedited() on large systems, but, +consistent with the empirical nature of the RCU specification, +that is subject to change. +However, there most definitely are scalability requirements: +A storm of synchronize_rcu_expedited() invocations on 4096 +CPUs should at least make reasonable forward progress. +In return for its shorter latencies, synchronize_rcu_expedited() +is permitted to impose modest degradation of real-time latency +on non-idle online CPUs. +That said, it will likely be necessary to take further steps to reduce this +degradation, hopefully to roughly that of a scheduling-clock interrupt. + +

+There are a number of situations where even +synchronize_rcu_expedited()'s reduced grace-period +latency is unacceptable. +In these situations, the asynchronous call_rcu() can be +used in place of synchronize_rcu() as follows: + +

+
+ 1 struct foo {
+ 2   int a;
+ 3   int b;
+ 4   struct rcu_head rh;
+ 5 };
+ 6
+ 7 static void remove_gp_cb(struct rcu_head *rhp)
+ 8 {
+ 9   struct foo *p = container_of(rhp, struct foo, rh);
+10
+11   kfree(p);
+12 }
+13
+14 bool remove_gp_asynchronous(void)
+15 {
+16   struct foo *p;
+17
+18   spin_lock(&gp_lock);
+19   p = rcu_access_pointer(gp);
+20   if (!p) {
+21     spin_unlock(&gp_lock);
+22     return false;
+23   }
+24   rcu_assign_pointer(gp, NULL);
+25   call_rcu(&p->rh, remove_gp_cb);
+26   spin_unlock(&gp_lock);
+27   return true;
+28 }
+
+
+ +

+A definition of struct foo is finally needed, and appears +on lines 1-5. +The function remove_gp_cb() is passed to call_rcu() +on line 25, and will be invoked after the end of a subsequent +grace period. +This gets the same effect as remove_gp_synchronous(), +but without forcing the updater to wait for a grace period to elapse. +The call_rcu() function may be used in a number of +situations where neither synchronize_rcu() nor +synchronize_rcu_expedited() would be legal, +including within preempt-disable code, local_bh_disable() code, +interrupt-disable code, and interrupt handlers. +However, even call_rcu() is illegal within NMI handlers. +The callback function (remove_gp_cb() in this case) will be +executed within softirq (software interrupt) environment within the +Linux kernel, +either within a real softirq handler or under the protection +of local_bh_disable(). +In both the Linux kernel and in userspace, it is bad practice to +write an RCU callback function that takes too long. +Long-running operations should be relegated to separate threads or +(in the Linux kernel) workqueues. + +

Quick Quiz 12: +Why does line 19 use rcu_access_pointer()? +After all, call_rcu() on line 25 stores into the +structure, which would interact badly with concurrent insertions. +Doesn't this mean that rcu_dereference() is required? +
Answer + +

+However, all that remove_gp_cb() is doing is +invoking kfree() on the data element. +This is a common idiom, and is supported by kfree_rcu(), +which allows “fire and forget” operation as shown below: + +

+
+ 1 struct foo {
+ 2   int a;
+ 3   int b;
+ 4   struct rcu_head rh;
+ 5 };
+ 6
+ 7 bool remove_gp_faf(void)
+ 8 {
+ 9   struct foo *p;
+10
+11   spin_lock(&gp_lock);
+12   p = rcu_dereference(gp);
+13   if (!p) {
+14     spin_unlock(&gp_lock);
+15     return false;
+16   }
+17   rcu_assign_pointer(gp, NULL);
+18   kfree_rcu(p, rh);
+19   spin_unlock(&gp_lock);
+20   return true;
+21 }
+
+
+ +

+Note that remove_gp_faf() simply invokes +kfree_rcu() and proceeds, without any need to pay any +further attention to the subsequent grace period and kfree(). +It is permissible to invoke kfree_rcu() from the same +environments as for call_rcu(). +Interestingly enough, DYNIX/ptx had the equivalents of +call_rcu() and kfree_rcu(), but not +synchronize_rcu(). +This was due to the fact that RCU was not heavily used within DYNIX/ptx, +so the very few places that needed something like +synchronize_rcu() simply open-coded it. + +

Quick Quiz 13: +Earlier it was claimed that call_rcu() and +kfree_rcu() allowed updaters to avoid being blocked +by readers. +But how can that be correct, given that the invocation of the callback +and the freeing of the memory (respectively) must still wait for +a grace period to elapse? +
Answer + +

+But what if the updater must wait for the completion of code to be +executed after the end of the grace period, but has other tasks +that can be carried out in the meantime? +The polling-style get_state_synchronize_rcu() and +cond_synchronize_rcu() functions may be used for this +purpose, as shown below: + +

+
+ 1 bool remove_gp_poll(void)
+ 2 {
+ 3   struct foo *p;
+ 4   unsigned long s;
+ 5
+ 6   spin_lock(&gp_lock);
+ 7   p = rcu_access_pointer(gp);
+ 8   if (!p) {
+ 9     spin_unlock(&gp_lock);
+10     return false;
+11   }
+12   rcu_assign_pointer(gp, NULL);
+13   spin_unlock(&gp_lock);
+14   s = get_state_synchronize_rcu();
+15   do_something_while_waiting();
+16   cond_synchronize_rcu(s);
+17   kfree(p);
+18   return true;
+19 }
+
+
+ +

+On line 14, get_state_synchronize_rcu() obtains a +“cookie” from RCU, +then line 15 carries out other tasks, +and finally, line 16 returns immediately if a grace period has +elapsed in the meantime, but otherwise waits as required. +The need for get_state_synchronize_rcu() and +cond_synchronize_rcu() has appeared quite recently, +so it is too early to tell whether they will stand the test of time. + +

+RCU thus provides a range of tools to allow updaters to strike the +required tradeoff between latency, flexibility and CPU overhead. + +

Composability

+ +

+Composability has received much attention in recent years, perhaps in part +due to the collision of multicore hardware with object-oriented techniques +designed in single-threaded environments for single-threaded use. +And in theory, RCU read-side critical sections may be composed, and in +fact may be nested arbitrarily deeply. +In practice, as with all real-world implementations of composable +constructs, there are limitations. + +

+Implementations of RCU for which rcu_read_lock() +and rcu_read_unlock() generate no code, such as +Linux-kernel RCU when CONFIG_PREEMPT=n, can be +nested arbitrarily deeply. +After all, there is no overhead. +Except that if all these instances of rcu_read_lock() +and rcu_read_unlock() are visible to the compiler, +compilation will eventually fail due to exhausting memory, +mass storage, or user patience, whichever comes first. +If the nesting is not visible to the compiler, as is the case with +mutually recursive functions each in its own translation unit, +stack overflow will result. +If the nesting takes the form of loops, either the control variable +will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. +Nevertheless, this class of RCU implementations is one +of the most composable constructs in existence. + +

+RCU implementations that explicitly track nesting depth +are limited by the nesting-depth counter. +For example, the Linux kernel's preemptible RCU limits nesting to +INT_MAX. +This should suffice for almost all practical purposes. +That said, a consecutive pair of RCU read-side critical sections +between which there is an operation that waits for a grace period +cannot be enclosed in another RCU read-side critical section. +This is because it is not legal to wait for a grace period within +an RCU read-side critical section: To do so would result either +in deadlock or +in RCU implicitly splitting the enclosing RCU read-side critical +section, neither of which is conducive to a long-lived and prosperous +kernel. + +

+It is worth noting that RCU is not alone in limiting composability. +For example, many transactional-memory implementations prohibit +composing a pair of transactions separated by an irrevocable +operation (for example, a network receive operation). +For another example, lock-based critical sections can be composed +surprisingly freely, but only if deadlock is avoided. + +

+In short, although RCU read-side critical sections are highly composable, +care is required in some situations, just as is the case for any other +composable synchronization mechanism. + +

Corner Cases

+ +

+A given RCU workload might have an endless and intense stream of +RCU read-side critical sections, perhaps even so intense that there +was never a point in time during which there was not at least one +RCU read-side critical section in flight. +RCU cannot allow this situation to block grace periods: As long as +all the RCU read-side critical sections are finite, grace periods +must also be finite. + +

+That said, preemptible RCU implementations could potentially result +in RCU read-side critical sections being preempted for long durations, +which has the effect of creating a long-duration RCU read-side +critical section. +This situation can arise only in heavily loaded systems, but systems using +real-time priorities are of course more vulnerable. +Therefore, RCU priority boosting is provided to help deal with this +case. +That said, the exact requirements on RCU priority boosting will likely +evolve as more experience accumulates. + +

+Other workloads might have very high update rates. +Although one can argue that such workloads should instead use +something other than RCU, the fact remains that RCU must +handle such workloads gracefully. +This requirement is another factor driving batching of grace periods, +but it is also the driving force behind the checks for large numbers +of queued RCU callbacks in the call_rcu() code path. +Finally, high update rates should not delay RCU read-side critical +sections, although some read-side delays can occur when using +synchronize_rcu_expedited(), courtesy of this function's use +of try_stop_cpus(). +(In the future, synchronize_rcu_expedited() will be +converted to use lighter-weight inter-processor interrupts (IPIs), +but this will still disturb readers, though to a much smaller degree.) + +

+Although all three of these corner cases were understood in the early +1990s, a simple user-level test consisting of close(open(path)) +in a tight loop +in the early 2000s suddenly provided a much deeper appreciation of the +high-update-rate corner case. +This test also motivated addition of some RCU code to react to high update +rates, for example, if a given CPU finds itself with more than 10,000 +RCU callbacks queued, it will cause RCU to take evasive action by +more aggressively starting grace periods and more aggressively forcing +completion of grace-period processing. +This evasive action causes the grace period to complete more quickly, +but at the cost of restricting RCU's batching optimizations, thus +increasing the CPU overhead incurred by that grace period. + +

+Software-Engineering Requirements

+ +

+Between Murphy's Law and “To err is human”, it is necessary to +guard against mishaps and misuse: + +

    +
  1. It is all too easy to forget to use rcu_read_lock() + everywhere that it is needed, so kernels built with + CONFIG_PROVE_RCU=y will splat if + rcu_dereference() is used outside of an + RCU read-side critical section. + Update-side code can use rcu_dereference_protected(), + which takes a + lockdep expression + to indicate what is providing the protection. + If the indicated protection is not provided, a lockdep splat + is emitted. + +

    + Code shared between readers and updaters can use + rcu_dereference_check(), which also takes a + lockdep expression, and emits a lockdep splat if neither + rcu_read_lock() nor the indicated protection + is in place. + In addition, rcu_dereference_raw() is used in those + (hopefully rare) cases where the required protection cannot + be easily described. + Finally, rcu_read_lock_held() is provided to + allow a function to verify that it has been invoked within + an RCU read-side critical section. + I was made aware of this set of requirements shortly after Thomas + Gleixner audited a number of RCU uses. +

  2. A given function might wish to check for RCU-related preconditions + upon entry, before using any other RCU API. + The rcu_lockdep_assert() does this job, + asserting the expression in kernels having lockdep enabled + and doing nothing otherwise. +
  3. It is also easy to forget to use rcu_assign_pointer() + and rcu_dereference(), perhaps (incorrectly) + substituting a simple assignment. + To catch this sort of error, a given RCU-protected pointer may be + tagged with __rcu, after which running sparse + with CONFIG_SPARSE_RCU_POINTER=y will complain + about simple-assignment accesses to that pointer. + Arnd Bergmann made me aware of this requirement, and also + supplied the needed + patch series. +
  4. Kernels built with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y + will splat if a data element is passed to call_rcu() + twice in a row, without a grace period in between. + (This error is similar to a double free.) + The corresponding rcu_head structures that are + dynamically allocated are automatically tracked, but + rcu_head structures allocated on the stack + must be initialized with init_rcu_head_on_stack() + and cleaned up with destroy_rcu_head_on_stack(). + Similarly, statically allocated non-stack rcu_head + structures must be initialized with init_rcu_head() + and cleaned up with destroy_rcu_head(). + Mathieu Desnoyers made me aware of this requirement, and also + supplied the needed + patch. +
  5. An infinite loop in an RCU read-side critical section will + eventually trigger an RCU CPU stall warning splat, with + the duration of “eventually” being controlled by the + RCU_CPU_STALL_TIMEOUT Kconfig option, or, + alternatively, by the + rcupdate.rcu_cpu_stall_timeout boot/sysfs + parameter. + However, RCU is not obligated to produce this splat + unless there is a grace period waiting on that particular + RCU read-side critical section. +

    + Some extreme workloads might intentionally delay + RCU grace periods, and systems running those workloads can + be booted with rcupdate.rcu_cpu_stall_suppress + to suppress the splats. + This kernel parameter may also be set via sysfs. + Furthermore, RCU CPU stall warnings are counter-productive + during sysrq dumps and during panics. + RCU therefore supplies the rcu_sysrq_start() and + rcu_sysrq_end() API members to be called before + and after long sysrq dumps. + RCU also supplies the rcu_panic() notifier that is + automatically invoked at the beginning of a panic to suppress + further RCU CPU stall warnings. + +

    + This requirement made itself known in the early 1990s, pretty + much the first time that it was necessary to debug a CPU stall. + That said, the initial implementation in DYNIX/ptx was quite + generic in comparison with that of Linux. +

  6. Although it would be very good to detect pointers leaking out + of RCU read-side critical sections, there is currently no + good way of doing this. + One complication is the need to distinguish between pointers + leaking and pointers that have been handed off from RCU to + some other synchronization mechanism, for example, reference + counting. +
  7. In kernels built with CONFIG_RCU_TRACE=y, RCU-related + information is provided via both debugfs and event tracing. +
  8. Open-coded use of rcu_assign_pointer() and + rcu_dereference() to create typical linked + data structures can be surprisingly error-prone. + Therefore, RCU-protected + linked lists + and, more recently, RCU-protected + hash tables + are available. + Many other special-purpose RCU-protected data structures are + available in the Linux kernel and the userspace RCU library. +
  9. Some linked structures are created at compile time, but still + require __rcu checking. + The RCU_POINTER_INITIALIZER() macro serves this + purpose. +
  10. It is not necessary to use rcu_assign_pointer() + when creating linked structures that are to be published via + a single external pointer. + The RCU_INIT_POINTER() macro is provided for + this task and also for assigning NULL pointers + at runtime. +
+ +

+This is not a hard-and-fast list: RCU's diagnostic capabilities will +continue to be guided by the number and type of usage bugs found +in real-world RCU usage. + +

Linux Kernel Complications

+ +

+The Linux kernel provides an interesting environment for all kinds of +software, including RCU. +Some of the relevant points of interest are as follows: + +

    +
  1. Configuration. +
  2. Firmware Interface. +
  3. Early Boot. +
  4. + Interrupts and non-maskable interrupts (NMIs). +
  5. Loadable Modules. +
  6. Hotplug CPU. +
  7. Scheduler and RCU. +
  8. Tracing and RCU. +
  9. Energy Efficiency. +
  10. Memory Efficiency. +
  11. + Performance, Scalability, Response Time, and Reliability. +
+ +

+This list is probably incomplete, but it does give a feel for the +most notable Linux-kernel complications. +Each of the following sections covers one of the above topics. + +

Configuration

+ +

+RCU's goal is automatic configuration, so that almost nobody +needs to worry about RCU's Kconfig options. +And for almost all users, RCU does in fact work well +“out of the box.” + +

+However, there are specialized use cases that are handled by +kernel boot parameters and Kconfig options. +Unfortunately, the Kconfig system will explicitly ask users +about new Kconfig options, which requires almost all of them +be hidden behind a CONFIG_RCU_EXPERT Kconfig option. + +

+This all should be quite obvious, but the fact remains that +Linus Torvalds recently had to +remind +me of this requirement. + +

Firmware Interface

+ +

+In many cases, the kernel obtains information about the system from the +firmware, and sometimes things are lost in translation. +Or the translation is accurate, but the original message is bogus. + +

+For example, some systems' firmware overreports the number of CPUs, +sometimes by a large factor. +If RCU naively believed the firmware, as it used to do, +it would create too many per-CPU kthreads. +Although the resulting system will still run correctly, the extra +kthreads needlessly consume memory and can cause confusion +when they show up in ps listings. + +

+RCU must therefore wait for a given CPU to actually come online before +it can allow itself to believe that the CPU actually exists. +The resulting “ghost CPUs” (which are never going to +come online) cause a number of +interesting complications. + +

Early Boot

+ +

+The Linux kernel's boot sequence is an interesting process, +and RCU is used early, even before rcu_init() +is invoked. +In fact, a number of RCU's primitives can be used as soon as the +initial task's task_struct is available and the +boot CPU's per-CPU variables are set up. +The read-side primitives (rcu_read_lock(), +rcu_read_unlock(), rcu_dereference(), +and rcu_access_pointer()) will operate normally very early on, +as will rcu_assign_pointer(). + +

+Although call_rcu() may be invoked at any +time during boot, callbacks are not guaranteed to be invoked until after +the scheduler is fully up and running. +This delay in callback invocation is due to the fact that RCU does not +invoke callbacks until it is fully initialized, and this full initialization +cannot occur until after the scheduler has initialized itself to the +point where RCU can spawn and run its kthreads. +In theory, it would be possible to invoke callbacks earlier, +however, this is not a panacea because there would be severe restrictions +on what operations those callbacks could invoke. + +

+Perhaps surprisingly, synchronize_rcu(), +synchronize_rcu_bh() +(discussed below), +and +synchronize_sched() +will all operate normally +during very early boot, the reason being that there is only one CPU +and preemption is disabled. +This means that the call synchronize_rcu() (or friends) +itself is a quiescent +state and thus a grace period, so the early-boot implementation can +be a no-op. + +

+Both synchronize_rcu_bh() and synchronize_sched() +continue to operate normally through the remainder of boot, courtesy +of the fact that preemption is disabled across their RCU read-side +critical sections and also courtesy of the fact that there is still +only one CPU. +However, once the scheduler starts initializing, preemption is enabled. +There is still only a single CPU, but the fact that preemption is enabled +means that the no-op implementation of synchronize_rcu() no +longer works in CONFIG_PREEMPT=y kernels. +Therefore, as soon as the scheduler starts initializing, the early-boot +fastpath is disabled. +This means that synchronize_rcu() switches to its runtime +mode of operation where it posts callbacks, which in turn means that +any call to synchronize_rcu() will block until the corresponding +callback is invoked. +Unfortunately, the callback cannot be invoked until RCU's runtime +grace-period machinery is up and running, which cannot happen until +the scheduler has initialized itself sufficiently to allow RCU's +kthreads to be spawned. +Therefore, invoking synchronize_rcu() during scheduler +initialization can result in deadlock. + +

Quick Quiz 14: +So what happens with synchronize_rcu() during +scheduler initialization for CONFIG_PREEMPT=n +kernels? +
Answer + +

+I learned of these boot-time requirements as a result of a series of +system hangs. + +

Interrupts and NMIs

+ +

+The Linux kernel has interrupts, and RCU read-side critical sections are +legal within interrupt handlers and within interrupt-disabled regions +of code, as are invocations of call_rcu(). + +

+Some Linux-kernel architectures can enter an interrupt handler from +non-idle process context, and then just never leave it, instead stealthily +transitioning back to process context. +This trick is sometimes used to invoke system calls from inside the kernel. +These “half-interrupts” mean that RCU has to be very careful +about how it counts interrupt nesting levels. +I learned of this requirement the hard way during a rewrite +of RCU's dyntick-idle code. + +

+The Linux kernel has non-maskable interrupts (NMIs), and +RCU read-side critical sections are legal within NMI handlers. +Thankfully, RCU update-side primitives, including +call_rcu(), are prohibited within NMI handlers. + +

+The name notwithstanding, some Linux-kernel architectures +can have nested NMIs, which RCU must handle correctly. +Andy Lutomirski +surprised me +with this requirement; +he also kindly surprised me with +an algorithm +that meets this requirement. + +

Loadable Modules

+ +

+The Linux kernel has loadable modules, and these modules can +also be unloaded. +After a given module has been unloaded, any attempt to call +one of its functions results in a segmentation fault. +The module-unload functions must therefore cancel any +delayed calls to loadable-module functions, for example, +any outstanding mod_timer() must be dealt with +via del_timer_sync() or similar. + +

+Unfortunately, there is no way to cancel an RCU callback; +once you invoke call_rcu(), the callback function is +going to eventually be invoked, unless the system goes down first. +Because it is normally considered socially irresponsible to crash the system +in response to a module unload request, we need some other way +to deal with in-flight RCU callbacks. + +

+RCU therefore provides +rcu_barrier(), +which waits until all in-flight RCU callbacks have been invoked. +If a module uses call_rcu(), its exit function should therefore +prevent any future invocation of call_rcu(), then invoke +rcu_barrier(). +In theory, the underlying module-unload code could invoke +rcu_barrier() unconditionally, but in practice this would +incur unacceptable latencies. + +

+Nikita Danilov noted this requirement for an analogous filesystem-unmount +situation, and Dipankar Sarma incorporated rcu_barrier() into RCU. +The need for rcu_barrier() for module unloading became +apparent later. + +

Hotplug CPU

+ +

+The Linux kernel supports CPU hotplug, which means that CPUs +can come and go. +It is of course illegal to use any RCU API member from an offline CPU. +This requirement was present from day one in DYNIX/ptx, but +on the other hand, the Linux kernel's CPU-hotplug implementation +is “interesting.” + +

+The Linux-kernel CPU-hotplug implementation has notifiers that +are used to allow the various kernel subsystems (including RCU) +to respond appropriately to a given CPU-hotplug operation. +Most RCU operations may be invoked from CPU-hotplug notifiers, +including even normal synchronous grace-period operations +such as synchronize_rcu(). +However, expedited grace-period operations such as +synchronize_rcu_expedited() are not supported, +due to the fact that current implementations block CPU-hotplug +operations, which could result in deadlock. + +

+In addition, all-callback-wait operations such as +rcu_barrier() are also not supported, due to the +fact that there are phases of CPU-hotplug operations where +the outgoing CPU's callbacks will not be invoked until after +the CPU-hotplug operation ends, which could also result in deadlock. + +

Scheduler and RCU

+ +

+RCU depends on the scheduler, and the scheduler uses RCU to +protect some of its data structures. +This means the scheduler is forbidden from acquiring +the runqueue locks and the priority-inheritance locks +in the middle of an outermost RCU read-side critical section unless either +(1) it releases them before exiting that same +RCU read-side critical section, or +(2) interrupts are disabled across +that entire RCU read-side critical section. +This same prohibition also applies (recursively!) to any lock that is acquired +while holding any lock to which this prohibition applies. +Adhering to this rule prevents preemptible RCU from invoking +rcu_read_unlock_special() while either runqueue or +priority-inheritance locks are held, thus avoiding deadlock. + +

+Prior to v4.4, it was only necessary to disable preemption across +RCU read-side critical sections that acquired scheduler locks. +In v4.4, expedited grace periods started using IPIs, and these +IPIs could force a rcu_read_unlock() to take the slowpath. +Therefore, this expedited-grace-period change required disabling of +interrupts, not just preemption. + +

+For RCU's part, the preemptible-RCU rcu_read_unlock() +implementation must be written carefully to avoid similar deadlocks. +In particular, rcu_read_unlock() must tolerate an +interrupt where the interrupt handler invokes both +rcu_read_lock() and rcu_read_unlock(). +This possibility requires rcu_read_unlock() to use +negative nesting levels to avoid destructive recursion via +interrupt handler's use of RCU. + +

+This pair of mutual scheduler-RCU requirements came as a +complete surprise. + +

+As noted above, RCU makes use of kthreads, and it is necessary to +avoid excessive CPU-time accumulation by these kthreads. +This requirement was no surprise, but RCU's violation of it +when running context-switch-heavy workloads when built with +CONFIG_NO_HZ_FULL=y +did come as a surprise [PDF]. +RCU has made good progress towards meeting this requirement, even +for context-switch-heavy CONFIG_NO_HZ_FULL=y workloads, +but there is room for further improvement. + +

Tracing and RCU

+ +

+It is possible to use tracing on RCU code, but tracing itself +uses RCU. +For this reason, rcu_dereference_raw_notrace() +is provided for use by tracing, which avoids the destructive +recursion that could otherwise ensue. +This API is also used by virtualization in some architectures, +where RCU readers execute in environments in which tracing +cannot be used. +The tracing folks both located the requirement and provided the +needed fix, so this surprise requirement was relatively painless. + +

Energy Efficiency

+ +

+Interrupting idle CPUs is considered socially unacceptable, +especially by people with battery-powered embedded systems. +RCU therefore conserves energy by detecting which CPUs are +idle, including tracking CPUs that have been interrupted from idle. +This is a large part of the energy-efficiency requirement, +so I learned of this via an irate phone call. + +

+Because RCU avoids interrupting idle CPUs, it is illegal to +execute an RCU read-side critical section on an idle CPU. +(Kernels built with CONFIG_PROVE_RCU=y will splat +if you try it.) +The RCU_NONIDLE() macro and _rcuidle +event tracing are provided to work around this restriction. +In addition, rcu_is_watching() may be used to +test whether or not it is currently legal to run RCU read-side +critical sections on this CPU. +I learned of the need for diagnostics on the one hand +and RCU_NONIDLE() on the other while inspecting +idle-loop code. +Steven Rostedt supplied _rcuidle event tracing, +which is used quite heavily in the idle loop. + +

+It is similarly socially unacceptable to interrupt an +nohz_full CPU running in userspace. +RCU must therefore track nohz_full userspace +execution. +And in +CONFIG_NO_HZ_FULL_SYSIDLE=y +kernels, RCU must separately track idle CPUs on the one hand and +CPUs that are either idle or executing in userspace on the other. +In both cases, RCU must be able to sample state at two points in +time, and be able to determine whether or not some other CPU spent +any time idle and/or executing in userspace. + +

+These energy-efficiency requirements have proven quite difficult to +understand and to meet, for example, there have been more than five +clean-sheet rewrites of RCU's energy-efficiency code, the last of +which was finally able to demonstrate +real energy savings running on real hardware [PDF]. +As noted earlier, +I learned of many of these requirements via angry phone calls: +Flaming me on the Linux-kernel mailing list was apparently not +sufficient to fully vent their ire at RCU's energy-efficiency bugs! + +

Memory Efficiency

+ +

+Although small-memory non-realtime systems can simply use Tiny RCU, +code size is only one aspect of memory efficiency. +Another aspect is the size of the rcu_head structure +used by call_rcu() and kfree_rcu(). +Although this structure contains nothing more than a pair of pointers, +it does appear in many RCU-protected data structures, including +some that are size critical. +The page structure is a case in point, as evidenced by +the many occurrences of the union keyword within that structure. + +

+This need for memory efficiency is one reason that RCU uses hand-crafted +singly linked lists to track the rcu_head structures that +are waiting for a grace period to elapse. +It is also the reason why rcu_head structures do not contain +debug information, such as fields tracking the file and line of the +call_rcu() or kfree_rcu() that posted them. +Although this information might appear in debug-only kernel builds at some +point, in the meantime, the ->func field will often provide +the needed debug information. + +

+However, in some cases, the need for memory efficiency leads to even +more extreme measures. +Returning to the page structure, the rcu_head field +shares storage with a great many other structures that are used at +various points in the corresponding page's lifetime. +In order to correctly resolve certain +race conditions, +the Linux kernel's memory-management subsystem needs a particular bit +to remain zero during all phases of grace-period processing, +and that bit happens to map to the bottom bit of the +rcu_head structure's ->next field. +RCU makes this guarantee as long as call_rcu() +is used to post the callback, as opposed to kfree_rcu() +or some future “lazy” +variant of call_rcu() that might one day be created for +energy-efficiency purposes. + +

+Performance, Scalability, Response Time, and Reliability

+ +

+Expanding on the +earlier discussion, +RCU is used heavily by hot code paths in performance-critical +portions of the Linux kernel's networking, security, virtualization, +and scheduling code paths. +RCU must therefore use efficient implementations, especially in its +read-side primitives. +To that end, it would be good if preemptible RCU's implementation +of rcu_read_lock() could be inlined, however, doing +this requires resolving #include issues with the +task_struct structure. + +

+The Linux kernel supports hardware configurations with up to +4096 CPUs, which means that RCU must be extremely scalable. +Algorithms that involve frequent acquisitions of global locks or +frequent atomic operations on global variables simply cannot be +tolerated within the RCU implementation. +RCU therefore makes heavy use of a combining tree based on the +rcu_node structure. +RCU is required to tolerate all CPUs continuously invoking any +combination of RCU's runtime primitives with minimal per-operation +overhead. +In fact, in many cases, increasing load must decrease the +per-operation overhead, witness the batching optimizations for +synchronize_rcu(), call_rcu(), +synchronize_rcu_expedited(), and rcu_barrier(). +As a general rule, RCU must cheerfully accept whatever the +rest of the Linux kernel decides to throw at it. + +

+The Linux kernel is used for real-time workloads, especially +in conjunction with the +-rt patchset. +The real-time-latency response requirements are such that the +traditional approach of disabling preemption across RCU +read-side critical sections is inappropriate. +Kernels built with CONFIG_PREEMPT=y therefore +use an RCU implementation that allows RCU read-side critical +sections to be preempted. +This requirement made its presence known after users made it +clear that an earlier +real-time patch +did not meet their needs, in conjunction with some +RCU issues +encountered by a very early version of the -rt patchset. + +

+In addition, RCU must make do with a sub-100-microsecond real-time latency +budget. +In fact, on smaller systems with the -rt patchset, the Linux kernel +provides sub-20-microsecond real-time latencies for the whole kernel, +including RCU. +RCU's scalability and latency must therefore be sufficient for +these sorts of configurations. +To my surprise, the sub-100-microsecond real-time latency budget + +applies to even the largest systems [PDF], +up to and including systems with 4096 CPUs. +This real-time requirement motivated the grace-period kthread, which +also simplified handling of a number of race conditions. + +

+Finally, RCU's status as a synchronization primitive means that +any RCU failure can result in arbitrary memory corruption that can be +extremely difficult to debug. +This means that RCU must be extremely reliable, which in +practice also means that RCU must have an aggressive stress-test +suite. +This stress-test suite is called rcutorture. + +

+Although the need for rcutorture was no surprise, +the current immense popularity of the Linux kernel is posing +interesting—and perhaps unprecedented—validation +challenges. +To see this, keep in mind that there are well over one billion +instances of the Linux kernel running today, given Android +smartphones, Linux-powered televisions, and servers. +This number can be expected to increase sharply with the advent of +the celebrated Internet of Things. + +

+Suppose that RCU contains a race condition that manifests on average +once per million years of runtime. +This bug will be occurring about three times per day across +the installed base. +RCU could simply hide behind hardware error rates, given that no one +should really expect their smartphone to last for a million years. +However, anyone taking too much comfort from this thought should +consider the fact that in most jurisdictions, a successful multi-year +test of a given mechanism, which might include a Linux kernel, +suffices for a number of types of safety-critical certifications. +In fact, rumor has it that the Linux kernel is already being used +in production for safety-critical applications. +I don't know about you, but I would feel quite bad if a bug in RCU +killed someone. +Which might explain my recent focus on validation and verification. + +

Other RCU Flavors

+ +

+One of the more surprising things about RCU is that there are now +no fewer than five flavors, or API families. +In addition, the primary flavor that has been the sole focus up to +this point has two different implementations, non-preemptible and +preemptible. +The other four flavors are listed below, with requirements for each +described in a separate section. + +

    +
  1. Bottom-Half Flavor +
  2. Sched Flavor +
  3. Sleepable RCU +
  4. Tasks RCU +
+ +

Bottom-Half Flavor

+ +

+The softirq-disable (AKA “bottom-half”, +hence the “_bh” abbreviations) +flavor of RCU, or RCU-bh, was developed by +Dipankar Sarma to provide a flavor of RCU that could withstand the +network-based denial-of-service attacks researched by Robert +Olsson. +These attacks placed so much networking load on the system +that some of the CPUs never exited softirq execution, +which in turn prevented those CPUs from ever executing a context switch, +which, in the RCU implementation of that time, prevented grace periods +from ever ending. +The result was an out-of-memory condition and a system hang. + +

+The solution was the creation of RCU-bh, which does +local_bh_disable() +across its read-side critical sections, and which uses the transition +from one type of softirq processing to another as a quiescent state +in addition to context switch, idle, user mode, and offline. +This means that RCU-bh grace periods can complete even when some of +the CPUs execute in softirq indefinitely, thus allowing algorithms +based on RCU-bh to withstand network-based denial-of-service attacks. + +

+Because +rcu_read_lock_bh() and rcu_read_unlock_bh() +disable and re-enable softirq handlers, any attempt to start a softirq +handler during the +RCU-bh read-side critical section will be deferred. +In this case, rcu_read_unlock_bh() +will invoke softirq processing, which can take considerable time. +One can of course argue that this softirq overhead should be associated +with the code following the RCU-bh read-side critical section rather +than rcu_read_unlock_bh(), but the fact +is that most profiling tools cannot be expected to make this sort +of fine distinction. +For example, suppose that a three-millisecond-long RCU-bh read-side +critical section executes during a time of heavy networking load. +There will very likely be an attempt to invoke at least one softirq +handler during that three milliseconds, but any such invocation will +be delayed until the time of the rcu_read_unlock_bh(). +This can of course make it appear at first glance as if +rcu_read_unlock_bh() was executing very slowly. + +

+The +RCU-bh API +includes +rcu_read_lock_bh(), +rcu_read_unlock_bh(), +rcu_dereference_bh(), +rcu_dereference_bh_check(), +synchronize_rcu_bh(), +synchronize_rcu_bh_expedited(), +call_rcu_bh(), +rcu_barrier_bh(), and +rcu_read_lock_bh_held(). + +

Sched Flavor

+ +

+Before preemptible RCU, waiting for an RCU grace period had the +side effect of also waiting for all pre-existing interrupt +and NMI handlers. +However, there are legitimate preemptible-RCU implementations that +do not have this property, given that any point in the code outside +of an RCU read-side critical section can be a quiescent state. +Therefore, RCU-sched was created, which follows “classic” +RCU in that an RCU-sched grace period waits for pre-existing +interrupt and NMI handlers. +In kernels built with CONFIG_PREEMPT=n, the RCU and RCU-sched +APIs have identical implementations, while kernels built with +CONFIG_PREEMPT=y provide a separate implementation for each. + +

+Note well that in CONFIG_PREEMPT=y kernels, +rcu_read_lock_sched() and rcu_read_unlock_sched() +disable and re-enable preemption, respectively. +This means that if there was a preemption attempt during the +RCU-sched read-side critical section, rcu_read_unlock_sched() +will enter the scheduler, with all the latency and overhead entailed. +Just as with rcu_read_unlock_bh(), this can make it look +as if rcu_read_unlock_sched() was executing very slowly. +However, the highest-priority task won't be preempted, so that task +will enjoy low-overhead rcu_read_unlock_sched() invocations. + +

+The +RCU-sched API +includes +rcu_read_lock_sched(), +rcu_read_unlock_sched(), +rcu_read_lock_sched_notrace(), +rcu_read_unlock_sched_notrace(), +rcu_dereference_sched(), +rcu_dereference_sched_check(), +synchronize_sched(), +synchronize_rcu_sched_expedited(), +call_rcu_sched(), +rcu_barrier_sched(), and +rcu_read_lock_sched_held(). +However, anything that disables preemption also marks an RCU-sched +read-side critical section, including +preempt_disable() and preempt_enable(), +local_irq_save() and local_irq_restore(), +and so on. + +

Sleepable RCU

+ +

+For well over a decade, someone saying “I need to block within +an RCU read-side critical section” was a reliable indication +that this someone did not understand RCU. +After all, if you are always blocking in an RCU read-side critical +section, you can probably afford to use a higher-overhead synchronization +mechanism. +However, that changed with the advent of the Linux kernel's notifiers, +whose RCU read-side critical +sections almost never sleep, but sometimes need to. +This resulted in the introduction of +sleepable RCU, +or SRCU. + +

+SRCU allows different domains to be defined, with each such domain +defined by an instance of an srcu_struct structure. +A pointer to this structure must be passed in to each SRCU function, +for example, synchronize_srcu(&ss), where +ss is the srcu_struct structure. +The key benefit of these domains is that a slow SRCU reader in one +domain does not delay an SRCU grace period in some other domain. +That said, one consequence of these domains is that read-side code +must pass a “cookie” from srcu_read_lock() +to srcu_read_unlock(), for example, as follows: + +

+
+ 1 int idx;
+ 2
+ 3 idx = srcu_read_lock(&ss);
+ 4 do_something();
+ 5 srcu_read_unlock(&ss, idx);
+
+
+ +

+As noted above, it is legal to block within SRCU read-side critical sections, +however, with great power comes great responsibility. +If you block forever in one of a given domain's SRCU read-side critical +sections, then that domain's grace periods will also be blocked forever. +Of course, one good way to block forever is to deadlock, which can +happen if any operation in a given domain's SRCU read-side critical +section can block waiting, either directly or indirectly, for that domain's +grace period to elapse. +For example, this results in a self-deadlock: + +

+
+ 1 int idx;
+ 2
+ 3 idx = srcu_read_lock(&ss);
+ 4 do_something();
+ 5 synchronize_srcu(&ss);
+ 6 srcu_read_unlock(&ss, idx);
+
+
+ +

+However, if line 5 acquired a mutex that was held across +a synchronize_srcu() for domain ss, +deadlock would still be possible. +Furthermore, if line 5 acquired a mutex that was held across +a synchronize_srcu() for some other domain ss1, +and if an ss1-domain SRCU read-side critical section +acquired another mutex that was held across an ss-domain +synchronize_srcu(), +deadlock would again be possible. +Such a deadlock cycle could extend across an arbitrarily large number +of different SRCU domains. +Again, with great power comes great responsibility. + +

+Unlike the other RCU flavors, SRCU read-side critical sections can +run on idle and even offline CPUs. +This ability requires that srcu_read_lock() and +srcu_read_unlock() contain memory barriers, which means +that SRCU readers will run a bit slower than would RCU readers. +It also motivates the smp_mb__after_srcu_read_unlock() +API, which, in combination with srcu_read_unlock(), +guarantees a full memory barrier. + +

+The +SRCU API +includes +srcu_read_lock(), +srcu_read_unlock(), +srcu_dereference(), +srcu_dereference_check(), +synchronize_srcu(), +synchronize_srcu_expedited(), +call_srcu(), +srcu_barrier(), and +srcu_read_lock_held(). +It also includes +DEFINE_SRCU(), +DEFINE_STATIC_SRCU(), and +init_srcu_struct() +APIs for defining and initializing srcu_struct structures. + +

Tasks RCU

+ +

+Some forms of tracing use “trampolines” to handle the +binary rewriting required to install different types of probes. +It would be good to be able to free old trampolines, which sounds +like a job for some form of RCU. +However, because it is necessary to be able to install a trace +anywhere in the code, it is not possible to use read-side markers +such as rcu_read_lock() and rcu_read_unlock(). +In addition, it does not work to have these markers in the trampoline +itself, because there would need to be instructions following +rcu_read_unlock(). +Although synchronize_rcu() would guarantee that execution +reached the rcu_read_unlock(), it would not be able to +guarantee that execution had completely left the trampoline. + +

+The solution, in the form of +Tasks RCU, +is to have implicit +read-side critical sections that are delimited by voluntary context +switches, that is, calls to schedule(), +cond_resched_rcu_qs(), and +synchronize_rcu_tasks(). +In addition, transitions to and from userspace execution also delimit +tasks-RCU read-side critical sections. + +

+The tasks-RCU API is quite compact, consisting only of +call_rcu_tasks(), +synchronize_rcu_tasks(), and +rcu_barrier_tasks(). + +

Possible Future Changes

+ +

+One of the tricks that RCU uses to attain update-side scalability is +to increase grace-period latency with increasing numbers of CPUs. +If this becomes a serious problem, it will be necessary to rework the +grace-period state machine so as to avoid the need for the additional +latency. + +

+Expedited grace periods scan the CPUs, so their latency and overhead +increase with increasing numbers of CPUs. +If this becomes a serious problem on large systems, it will be necessary +to do some redesign to avoid this scalability problem. + +

+RCU disables CPU hotplug in a few places, perhaps most notably in the +expedited grace-period and rcu_barrier() operations. +If there is a strong reason to use expedited grace periods in CPU-hotplug +notifiers, it will be necessary to avoid disabling CPU hotplug. +This would introduce some complexity, so there had better be a very +good reason. + +

+The tradeoff between grace-period latency on the one hand and interruptions +of other CPUs on the other hand may need to be re-examined. +The desire is of course for zero grace-period latency as well as zero +interprocessor interrupts undertaken during an expedited grace period +operation. +While this ideal is unlikely to be achievable, it is quite possible that +further improvements can be made. + +

+The multiprocessor implementations of RCU use a combining tree that +groups CPUs so as to reduce lock contention and increase cache locality. +However, this combining tree does not spread its memory across NUMA +nodes nor does it align the CPU groups with hardware features such +as sockets or cores. +Such spreading and alignment is currently believed to be unnecessary +because the hotpath read-side primitives do not access the combining +tree, nor does call_rcu() in the common case. +If you believe that your architecture needs such spreading and alignment, +then your architecture should also benefit from the +rcutree.rcu_fanout_leaf boot parameter, which can be set +to the number of CPUs in a socket, NUMA node, or whatever. +If the number of CPUs is too large, use a fraction of the number of +CPUs. +If the number of CPUs is a large prime number, well, that certainly +is an “interesting” architectural choice! +More flexible arrangements might be considered, but only if +rcutree.rcu_fanout_leaf has proven inadequate, and only +if the inadequacy has been demonstrated by a carefully run and +realistic system-level workload. + +

+Please note that arrangements that require RCU to remap CPU numbers will +require extremely good demonstration of need and full exploration of +alternatives. + +

+There is an embarrassingly large number of flavors of RCU, and this +number has been increasing over time. +Perhaps it will be possible to combine some at some future date. + +

+RCU's various kthreads are reasonably recent additions. +It is quite likely that adjustments will be required to more gracefully +handle extreme loads. +It might also be necessary to be able to relate CPU utilization by +RCU's kthreads and softirq handlers to the code that instigated this +CPU utilization. +For example, RCU callback overhead might be charged back to the +originating call_rcu() instance, though probably not +in production kernels. + +

Summary

+ +

+This document has presented more than two decades' worth of RCU +requirements. +Given that the requirements keep changing, this will not be the last +word on this subject, but at least it serves to get an important +subset of the requirements set forth. + +

Acknowledgments

+ +I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, +Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and +Andy Lutomirski for their help in rendering +this article human readable, and to Michelle Rankin for her support +of this effort. +Other contributions are acknowledged in the Linux kernel's git archive. +The cartoon is copyright (c) 2013 by Melissa Broussard, +and is provided +under the terms of the Creative Commons Attribution-Share Alike 3.0 +United States license. + +

+Answers to Quick Quizzes

+ + +

Quick Quiz 1: +Wait a minute! +You said that updaters can make useful forward progress concurrently +with readers, but pre-existing readers will block +synchronize_rcu()!!! +Just who are you trying to fool??? + + +

Answer: +First, if updaters do not wish to be blocked by readers, they can use +call_rcu() or kfree_rcu(), which will +be discussed later. +Second, even when using synchronize_rcu(), the other +update-side code does run concurrently with readers, whether pre-existing +or not. + + +

Back to Quick Quiz 1. + + +

Quick Quiz 2: +Why is the synchronize_rcu() on line 28 needed? + + +

Answer: +Without that extra grace period, memory reordering could result in +do_something_dlm() executing do_something() +concurrently with the last bits of recovery(). + + +

Back to Quick Quiz 2. + + +

Quick Quiz 3: +But rcu_assign_pointer() does nothing to prevent the +two assignments to p->a and p->b +from being reordered. +Can't that also cause problems? + + +

Answer: +No, it cannot. +The readers cannot see either of these two fields until +the assignment to gp, by which time both fields are +fully initialized. +So reordering the assignments +to p->a and p->b cannot possibly +cause any problems. + + +

Back to Quick Quiz 3. + + +

Quick Quiz 4: +Without the rcu_dereference() or the +rcu_access_pointer(), what destructive optimizations +might the compiler make use of? + + +

Answer: +Let's start with what happens to do_something_gp() +if it fails to use rcu_dereference(). +It could reuse a value formerly fetched from this same pointer. +It could also fetch the pointer from gp in a byte-at-a-time +manner, resulting in load tearing, in turn resulting in a bytewise +mash-up of two distinct pointer values. +It might even use value-speculation optimizations, where it makes a wrong +guess, but by the time it gets around to checking the value, an update +has changed the pointer to match the wrong guess. +Too bad about any dereferences that returned pre-initialization garbage +in the meantime! + +

+For remove_gp_synchronous(), as long as all modifications +to gp are carried out while holding gp_lock, +the above optimizations are harmless. +However, +with CONFIG_SPARSE_RCU_POINTER=y, +sparse will complain if you +define gp with __rcu and then +access it without using +either rcu_access_pointer() or rcu_dereference(). + + +

Back to Quick Quiz 4. + + +

Quick Quiz 5: +Given that multiple CPUs can start RCU read-side critical sections +at any time without any ordering whatsoever, how can RCU possibly tell whether +or not a given RCU read-side critical section starts before a +given instance of synchronize_rcu()? + + +

Answer: +If RCU cannot tell whether or not a given +RCU read-side critical section starts before a +given instance of synchronize_rcu(), +then it must assume that the RCU read-side critical section +started first. +In other words, a given instance of synchronize_rcu() +can avoid waiting on a given RCU read-side critical section only +if it can prove that synchronize_rcu() started first. + + +

Back to Quick Quiz 5. + + +

Quick Quiz 6: +The first and second guarantees require unbelievably strict ordering! +Are all these memory barriers really required? + + +

Answer: +Yes, they really are required. +To see why the first guarantee is required, consider the following +sequence of events: + +

    +
  1. CPU 1: rcu_read_lock() +
  2. CPU 1: q = rcu_dereference(gp); + /* Very likely to return p. */ +
  3. CPU 0: list_del_rcu(p); +
  4. CPU 0: synchronize_rcu() starts. +
  5. CPU 1: do_something_with(q->a); + /* No smp_mb(), so might happen after kfree(). */ +
  6. CPU 1: rcu_read_unlock() +
  7. CPU 0: synchronize_rcu() returns. +
  8. CPU 0: kfree(p); +
+ +

+Therefore, there absolutely must be a full memory barrier between the +end of the RCU read-side critical section and the end of the +grace period. + +

+The sequence of events demonstrating the necessity of the second rule +is roughly similar: + +

    +
  1. CPU 0: list_del_rcu(p); +
  2. CPU 0: synchronize_rcu() starts. +
  3. CPU 1: rcu_read_lock() +
  4. CPU 1: q = rcu_dereference(gp); + /* Might return p if no memory barrier. */ +
  5. CPU 0: synchronize_rcu() returns. +
  6. CPU 0: kfree(p); +
  7. CPU 1: do_something_with(q->a); /* Boom!!! */ +
  8. CPU 1: rcu_read_unlock() +
+ +

+And similarly, without a memory barrier between the beginning of the +grace period and the beginning of the RCU read-side critical section, +CPU 1 might end up accessing the freelist. + +

+The “as if” rule of course applies, so that any implementation +that acts as if the appropriate memory barriers were in place is a +correct implementation. +That said, it is much easier to fool yourself into believing that you have +adhered to the as-if rule than it is to actually adhere to it! + + +

Back to Quick Quiz 6. + + +

Quick Quiz 7: +But how does the upgrade-to-write operation exclude other readers? + + +

Answer: +It doesn't, just like normal RCU updates, which also do not exclude +RCU readers. + + +

Back to Quick Quiz 7. + + +

Quick Quiz 8: +Can't the compiler also reorder this code? + + +

Answer: +No, the volatile casts in READ_ONCE() and +WRITE_ONCE() prevent the compiler from reordering in +this particular case. + + +

Back to Quick Quiz 8. + + +

Quick Quiz 9: +Suppose that synchronize_rcu() did wait until all readers had completed. +Would the updater be able to rely on this? + + +

Answer: +No. +Even if synchronize_rcu() were to wait until +all readers had completed, a new reader might start immediately after +synchronize_rcu() completed. +Therefore, the code following +synchronize_rcu() cannot rely on there being no readers +in any case. + + +

Back to Quick Quiz 9. + + +

Quick Quiz 10: +How long a sequence of grace periods, each separated by an RCU read-side +critical section, would be required to partition the RCU read-side +critical sections at the beginning and end of the chain? + + +

Answer: +In theory, an infinite number. +In practice, an unknown number that is sensitive to both implementation +details and timing considerations. +Therefore, even in practice, RCU users must abide by the theoretical rather +than the practical answer. + + +

Back to Quick Quiz 10. + + +

Quick Quiz 11: +What about sleeping locks? + + +

Answer: +These are forbidden within Linux-kernel RCU read-side critical sections +because it is not legal to place a quiescent state (in this case, +voluntary context switch) within an RCU read-side critical section. +However, sleeping locks may be used within userspace RCU read-side critical +sections, and also within Linux-kernel sleepable RCU +(SRCU) +read-side critical sections. +In addition, the -rt patchset turns spinlocks into sleeping locks so +that the corresponding critical sections can be preempted, which +also means that these sleeplockified spinlocks (but not other sleeping locks!) +may be acquired within -rt-Linux-kernel RCU read-side critical sections. + +

+Note that it is legal for a normal RCU read-side critical section +to conditionally acquire a sleeping lock (as in mutex_trylock()), +but only as long as it does not loop indefinitely attempting to +conditionally acquire that sleeping lock. +The key point is that things like mutex_trylock() +either return with the mutex held, or return an error indication if +the mutex was not immediately available. +Either way, mutex_trylock() returns immediately without sleeping. + + +

Back to Quick Quiz 11. + + +

Quick Quiz 12: +Why does line 19 use rcu_access_pointer()? +After all, call_rcu() on line 25 stores into the +structure, which would interact badly with concurrent insertions. +Doesn't this mean that rcu_dereference() is required? + + +

Answer: +Presumably the ->gp_lock acquired on line 18 excludes +any changes, including any insertions that rcu_dereference() +would protect against. +Therefore, any insertions will be delayed until after ->gp_lock +is released on line 25, which in turn means that +rcu_access_pointer() suffices. + + +

Back to Quick Quiz 12. + + +

Quick Quiz 13: +Earlier it was claimed that call_rcu() and +kfree_rcu() allowed updaters to avoid being blocked +by readers. +But how can that be correct, given that the invocation of the callback +and the freeing of the memory (respectively) must still wait for +a grace period to elapse? + + +

Answer: +We could define things this way, but keep in mind that this sort of +definition would say that updates in garbage-collected languages +cannot complete until the next time the garbage collector runs, +which does not seem at all reasonable. +The key point is that in most cases, an updater using either +call_rcu() or kfree_rcu() can proceed to the +next update as soon as it has invoked call_rcu() or +kfree_rcu(), without having to wait for a subsequent +grace period. + + +

Back to Quick Quiz 13. + + +

Quick Quiz 14: +So what happens with synchronize_rcu() during +scheduler initialization for CONFIG_PREEMPT=n +kernels? + + +

Answer: +In a CONFIG_PREEMPT=n kernel, synchronize_rcu() +maps directly to synchronize_sched(). +Therefore, synchronize_rcu() works normally throughout +boot in CONFIG_PREEMPT=n kernels. +However, your code must also work in CONFIG_PREEMPT=y kernels, +so it is still necessary to avoid invoking synchronize_rcu() +during scheduler initialization. + + +

Back to Quick Quiz 14. + + + diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx new file mode 100644 index 000000000..3a97ba490 --- /dev/null +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -0,0 +1,2741 @@ + + + A Tour Through RCU's Requirements [LWN.net] + + +

A Tour Through RCU's Requirements

+ +

Copyright IBM Corporation, 2015

+

Author: Paul E. McKenney

+

The initial version of this document appeared in the +LWN articles +here, +here, and +here.

+ +

Introduction

+ +

+Read-copy update (RCU) is a synchronization mechanism that is often +used as a replacement for reader-writer locking. +RCU is unusual in that updaters do not block readers, +which means that RCU's read-side primitives can be exceedingly fast +and scalable. +In addition, updaters can make useful forward progress concurrently +with readers. +However, all this concurrency between RCU readers and updaters does raise +the question of exactly what RCU readers are doing, which in turn +raises the question of exactly what RCU's requirements are. + +

+This document therefore summarizes RCU's requirements, and can be thought +of as an informal, high-level specification for RCU. +It is important to understand that RCU's specification is primarily +empirical in nature; +in fact, I learned about many of these requirements the hard way. +This situation might cause some consternation; however, not only +has this learning process been a lot of fun, but it has also been +a great privilege to work with so many people willing to apply +technologies in interesting new ways. + +

+All that aside, here are the categories of currently known RCU requirements: +

+ +
    +
  1. + Fundamental Requirements +
  2. Fundamental Non-Requirements +
  3. + Parallelism Facts of Life +
  4. + Quality-of-Implementation Requirements +
  5. + Linux Kernel Complications +
  6. + Software-Engineering Requirements +
  7. + Other RCU Flavors +
  8. + Possible Future Changes +
+ +

+This is followed by a summary, +which is in turn followed by the inevitable +answers to the quick quizzes. + +

Fundamental Requirements

+ +

+RCU's fundamental requirements are the closest thing RCU has to hard +mathematical requirements. +These are: + +

    +
  1. + Grace-Period Guarantee +
  2. + Publish-Subscribe Guarantee +
  3. + Memory-Barrier Guarantees +
  4. + RCU Primitives Guaranteed to Execute Unconditionally +
  5. + Guaranteed Read-to-Write Upgrade +
+ +

Grace-Period Guarantee

+ +

+RCU's grace-period guarantee is unusual in being premeditated: +Jack Slingwine and I had this guarantee firmly in mind when we started +work on RCU (then called “rclock”) in the early 1990s. +That said, the past two decades of experience with RCU have produced +a much more detailed understanding of this guarantee. + +

+RCU's grace-period guarantee allows updaters to wait for the completion +of all pre-existing RCU read-side critical sections. +An RCU read-side critical section +begins with the marker rcu_read_lock() and ends with +the marker rcu_read_unlock(). +These markers may be nested, and RCU treats a nested set as one +big RCU read-side critical section. +Production-quality implementations of rcu_read_lock() and +rcu_read_unlock() are extremely lightweight, and in +fact have exactly zero overhead in Linux kernels built for production +use with CONFIG_PREEMPT=n. + +

+This guarantee allows ordering to be enforced with extremely low +overhead to readers, for example: + +

+
+ 1 int x, y;
+ 2
+ 3 void thread0(void)
+ 4 {
+ 5   rcu_read_lock();
+ 6   r1 = READ_ONCE(x);
+ 7   r2 = READ_ONCE(y);
+ 8   rcu_read_unlock();
+ 9 }
+10
+11 void thread1(void)
+12 {
+13   WRITE_ONCE(x, 1);
+14   synchronize_rcu();
+15   WRITE_ONCE(y, 1);
+16 }
+
+
+ +

+Because the synchronize_rcu() on line 14 waits for +all pre-existing readers, any instance of thread0() that +loads a value of zero from x must complete before +thread1() stores to y, so that instance must +also load a value of zero from y. +Similarly, any instance of thread0() that loads a value of +one from y must have started after the +synchronize_rcu() started, and must therefore also load +a value of one from x. +Therefore, the outcome: +

+
+(r1 == 0 && r2 == 1)
+
+
+cannot happen. + +

@@QQ@@ +Wait a minute! +You said that updaters can make useful forward progress concurrently +with readers, but pre-existing readers will block +synchronize_rcu()!!! +Just who are you trying to fool??? +

@@QQA@@ +First, if updaters do not wish to be blocked by readers, they can use +call_rcu() or kfree_rcu(), which will +be discussed later. +Second, even when using synchronize_rcu(), the other +update-side code does run concurrently with readers, whether pre-existing +or not. +

@@QQE@@ + +

+This scenario resembles one of the first uses of RCU in +DYNIX/ptx, +which managed a distributed lock manager's transition into +a state suitable for handling recovery from node failure, +more or less as follows: + +

+
+ 1 #define STATE_NORMAL        0
+ 2 #define STATE_WANT_RECOVERY 1
+ 3 #define STATE_RECOVERING    2
+ 4 #define STATE_WANT_NORMAL   3
+ 5
+ 6 int state = STATE_NORMAL;
+ 7
+ 8 void do_something_dlm(void)
+ 9 {
+10   int state_snap;
+11
+12   rcu_read_lock();
+13   state_snap = READ_ONCE(state);
+14   if (state_snap == STATE_NORMAL)
+15     do_something();
+16   else
+17     do_something_carefully();
+18   rcu_read_unlock();
+19 }
+20
+21 void start_recovery(void)
+22 {
+23   WRITE_ONCE(state, STATE_WANT_RECOVERY);
+24   synchronize_rcu();
+25   WRITE_ONCE(state, STATE_RECOVERING);
+26   recovery();
+27   WRITE_ONCE(state, STATE_WANT_NORMAL);
+28   synchronize_rcu();
+29   WRITE_ONCE(state, STATE_NORMAL);
+30 }
+
+
+ +

+The RCU read-side critical section in do_something_dlm() +works with the synchronize_rcu() in start_recovery() +to guarantee that do_something() never runs concurrently +with recovery(), but with little or no synchronization +overhead in do_something_dlm(). + +

@@QQ@@ +Why is the synchronize_rcu() on line 28 needed? +

@@QQA@@ +Without that extra grace period, memory reordering could result in +do_something_dlm() executing do_something() +concurrently with the last bits of recovery(). +

@@QQE@@ + +

+In order to avoid fatal problems such as deadlocks, +an RCU read-side critical section must not contain calls to +synchronize_rcu(). +Similarly, an RCU read-side critical section must not +contain anything that waits, directly or indirectly, on completion of +an invocation of synchronize_rcu(). + +

+Although RCU's grace-period guarantee is useful in and of itself, with +quite a few use cases, +it would be good to be able to use RCU to coordinate read-side +access to linked data structures. +For this, the grace-period guarantee is not sufficient, as can +be seen in function add_gp_buggy() below. +We will look at the reader's code later, but in the meantime, just think of +the reader as locklessly picking up the gp pointer, +and, if the value loaded is non-NULL, locklessly accessing the +->a and ->b fields. + +

+
+ 1 bool add_gp_buggy(int a, int b)
+ 2 {
+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4   if (!p)
+ 5     return -ENOMEM;
+ 6   spin_lock(&gp_lock);
+ 7   if (rcu_access_pointer(gp)) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   p->a = a;
+12   p->b = a;
+13   gp = p; /* ORDERING BUG */
+14   spin_unlock(&gp_lock);
+15   return true;
+16 }
+
+
+ +

+The problem is that both the compiler and weakly ordered CPUs are within +their rights to reorder this code as follows: + +

+
+ 1 bool add_gp_buggy_optimized(int a, int b)
+ 2 {
+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4   if (!p)
+ 5     return -ENOMEM;
+ 6   spin_lock(&gp_lock);
+ 7   if (rcu_access_pointer(gp)) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   gp = p; /* ORDERING BUG */
+12   p->a = a;
+13   p->b = a;
+14   spin_unlock(&gp_lock);
+15   return true;
+16 }
+
+
+ +

+If an RCU reader fetches gp just after +add_gp_buggy_optimized executes line 11, +it will see garbage in the ->a and ->b +fields. +And this is but one of many ways in which compiler and hardware optimizations +could cause trouble. +Therefore, we clearly need some way to prevent the compiler and the CPU from +reordering in this manner, which brings us to the publish-subscribe +guarantee discussed in the next section. + +

Publish/Subscribe Guarantee

+ +

+RCU's publish-subscribe guarantee allows data to be inserted +into a linked data structure without disrupting RCU readers. +The updater uses rcu_assign_pointer() to insert the +new data, and readers use rcu_dereference() to +access data, whether new or old. +The following shows an example of insertion: + +

+
+ 1 bool add_gp(int a, int b)
+ 2 {
+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4   if (!p)
+ 5     return -ENOMEM;
+ 6   spin_lock(&gp_lock);
+ 7   if (rcu_access_pointer(gp)) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   p->a = a;
+12   p->b = a;
+13   rcu_assign_pointer(gp, p);
+14   spin_unlock(&gp_lock);
+15   return true;
+16 }
+
+
+ +

+The rcu_assign_pointer() on line 13 is conceptually +equivalent to a simple assignment statement, but also guarantees +that its assignment will +happen after the two assignments in lines 11 and 12, +similar to the C11 memory_order_release store operation. +It also prevents any number of “interesting” compiler +optimizations, for example, the use of gp as a scratch +location immediately preceding the assignment. + +

@@QQ@@ +But rcu_assign_pointer() does nothing to prevent the +two assignments to p->a and p->b +from being reordered. +Can't that also cause problems? +

@@QQA@@ +No, it cannot. +The readers cannot see either of these two fields until +the assignment to gp, by which time both fields are +fully initialized. +So reordering the assignments +to p->a and p->b cannot possibly +cause any problems. +

@@QQE@@ + +

+It is tempting to assume that the reader need not do anything special +to control its accesses to the RCU-protected data, +as shown in do_something_gp_buggy() below: + +

+
+ 1 bool do_something_gp_buggy(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   p = gp;  /* OPTIMIZATIONS GALORE!!! */
+ 5   if (p) {
+ 6     do_something(p->a, p->b);
+ 7     rcu_read_unlock();
+ 8     return true;
+ 9   }
+10   rcu_read_unlock();
+11   return false;
+12 }
+
+
+ +

+However, this temptation must be resisted because there are a +surprisingly large number of ways that the compiler +(to say nothing of +DEC Alpha CPUs) +can trip this code up. +For but one example, if the compiler were short of registers, it +might choose to refetch from gp rather than keeping +a separate copy in p as follows: + +

+
+ 1 bool do_something_gp_buggy_optimized(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   if (gp) { /* OPTIMIZATIONS GALORE!!! */
+ 5     do_something(gp->a, gp->b);
+ 6     rcu_read_unlock();
+ 7     return true;
+ 8   }
+ 9   rcu_read_unlock();
+10   return false;
+11 }
+
+
+ +

+If this function ran concurrently with a series of updates that +replaced the current structure with a new one, +the fetches of gp->a +and gp->b might well come from two different structures, +which could cause serious confusion. +To prevent this (and much else besides), do_something_gp() uses +rcu_dereference() to fetch from gp: + +

+
+ 1 bool do_something_gp(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   p = rcu_dereference(gp);
+ 5   if (p) {
+ 6     do_something(p->a, p->b);
+ 7     rcu_read_unlock();
+ 8     return true;
+ 9   }
+10   rcu_read_unlock();
+11   return false;
+12 }
+
+
+ +

+The rcu_dereference() uses volatile casts and (for DEC Alpha) +memory barriers in the Linux kernel. +Should a +high-quality implementation of C11 memory_order_consume [PDF] +ever appear, then rcu_dereference() could be implemented +as a memory_order_consume load. +Regardless of the exact implementation, a pointer fetched by +rcu_dereference() may not be used outside of the +outermost RCU read-side critical section containing that +rcu_dereference(), unless protection of +the corresponding data element has been passed from RCU to some +other synchronization mechanism, most commonly locking or +reference counting. + +

+In short, updaters use rcu_assign_pointer() and readers +use rcu_dereference(), and these two RCU API elements +work together to ensure that readers have a consistent view of +newly added data elements. + +

+Of course, it is also necessary to remove elements from RCU-protected +data structures, for example, using the following process: + +

    +
  1. Remove the data element from the enclosing structure. +
  2. Wait for all pre-existing RCU read-side critical sections + to complete (because only pre-existing readers can possibly have + a reference to the newly removed data element). +
  3. At this point, only the updater has a reference to the + newly removed data element, so it can safely reclaim + the data element, for example, by passing it to kfree(). +
+ +This process is implemented by remove_gp_synchronous(): + +
+
+ 1 bool remove_gp_synchronous(void)
+ 2 {
+ 3   struct foo *p;
+ 4
+ 5   spin_lock(&gp_lock);
+ 6   p = rcu_access_pointer(gp);
+ 7   if (!p) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   rcu_assign_pointer(gp, NULL);
+12   spin_unlock(&gp_lock);
+13   synchronize_rcu();
+14   kfree(p);
+15   return true;
+16 }
+
+
+ +

+This function is straightforward, with line 13 waiting for a grace +period before line 14 frees the old data element. +This waiting ensures that readers will reach line 7 of +do_something_gp() before the data element referenced by +p is freed. +The rcu_access_pointer() on line 6 is similar to +rcu_dereference(), except that: + +

    +
  1. The value returned by rcu_access_pointer() + cannot be dereferenced. + If you want to access the value pointed to as well as + the pointer itself, use rcu_dereference() + instead of rcu_access_pointer(). +
  2. The call to rcu_access_pointer() need not be + protected. + In contrast, rcu_dereference() must either be + within an RCU read-side critical section or in a code + segment where the pointer cannot change, for example, in + code protected by the corresponding update-side lock. +
+ +

@@QQ@@ +Without the rcu_dereference() or the +rcu_access_pointer(), what destructive optimizations +might the compiler make use of? +

@@QQA@@ +Let's start with what happens to do_something_gp() +if it fails to use rcu_dereference(). +It could reuse a value formerly fetched from this same pointer. +It could also fetch the pointer from gp in a byte-at-a-time +manner, resulting in load tearing, in turn resulting in a bytewise +mash-up of two distinct pointer values. +It might even use value-speculation optimizations, where it makes a wrong +guess, but by the time it gets around to checking the value, an update +has changed the pointer to match the wrong guess. +Too bad about any dereferences that returned pre-initialization garbage +in the meantime! + +

+For remove_gp_synchronous(), as long as all modifications +to gp are carried out while holding gp_lock, +the above optimizations are harmless. +However, +with CONFIG_SPARSE_RCU_POINTER=y, +sparse will complain if you +define gp with __rcu and then +access it without using +either rcu_access_pointer() or rcu_dereference(). +

@@QQE@@ + +

+In short, RCU's publish-subscribe guarantee is provided by the combination +of rcu_assign_pointer() and rcu_dereference(). +This guarantee allows data elements to be safely added to RCU-protected +linked data structures without disrupting RCU readers. +This guarantee can be used in combination with the grace-period +guarantee to also allow data elements to be removed from RCU-protected +linked data structures, again without disrupting RCU readers. + +

+This guarantee was only partially premeditated. +DYNIX/ptx used an explicit memory barrier for publication, but had nothing +resembling rcu_dereference() for subscription, nor did it +have anything resembling the smp_read_barrier_depends() +that was later subsumed into rcu_dereference(). +The need for these operations made itself known quite suddenly at a +late-1990s meeting with the DEC Alpha architects, back in the days when +DEC was still a free-standing company. +It took the Alpha architects a good hour to convince me that any sort +of barrier would ever be needed, and it then took me a good two hours +to convince them that their documentation did not make this point clear. +More recent work with the C and C++ standards committees has provided +much education on tricks and traps from the compiler. +In short, compilers were much less tricky in the early 1990s, but in +2015, don't even think about omitting rcu_dereference()! + +

Memory-Barrier Guarantees

+ +

+The previous section's simple linked-data-structure scenario clearly +demonstrates the need for RCU's stringent memory-ordering guarantees on +systems with more than one CPU: + +

    +
  1. Each CPU that has an RCU read-side critical section that + begins before synchronize_rcu() starts is + guaranteed to execute a full memory barrier between the time + that the RCU read-side critical section ends and the time that + synchronize_rcu() returns. + Without this guarantee, a pre-existing RCU read-side critical section + might hold a reference to the newly removed struct foo + after the kfree() on line 14 of + remove_gp_synchronous(). +
  2. Each CPU that has an RCU read-side critical section that ends + after synchronize_rcu() returns is guaranteed + to execute a full memory barrier between the time that + synchronize_rcu() begins and the time that the RCU + read-side critical section begins. + Without this guarantee, a later RCU read-side critical section + running after the kfree() on line 14 of + remove_gp_synchronous() might + later run do_something_gp() and find the + newly deleted struct foo. +
  3. If the task invoking synchronize_rcu() remains + on a given CPU, then that CPU is guaranteed to execute a full + memory barrier sometime during the execution of + synchronize_rcu(). + This guarantee ensures that the kfree() on + line 14 of remove_gp_synchronous() really does + execute after the removal on line 11. +
  4. If the task invoking synchronize_rcu() migrates + among a group of CPUs during that invocation, then each of the + CPUs in that group is guaranteed to execute a full memory barrier + sometime during the execution of synchronize_rcu(). + This guarantee also ensures that the kfree() on + line 14 of remove_gp_synchronous() really does + execute after the removal on + line 11, but also in the case where the thread executing the + synchronize_rcu() migrates in the meantime. +
+ +

@@QQ@@ +Given that multiple CPUs can start RCU read-side critical sections +at any time without any ordering whatsoever, how can RCU possibly tell whether +or not a given RCU read-side critical section starts before a +given instance of synchronize_rcu()? +

@@QQA@@ +If RCU cannot tell whether or not a given +RCU read-side critical section starts before a +given instance of synchronize_rcu(), +then it must assume that the RCU read-side critical section +started first. +In other words, a given instance of synchronize_rcu() +can avoid waiting on a given RCU read-side critical section only +if it can prove that synchronize_rcu() started first. +

@@QQE@@ + +

@@QQ@@ +The first and second guarantees require unbelievably strict ordering! +Are all these memory barriers really required? +

@@QQA@@ +Yes, they really are required. +To see why the first guarantee is required, consider the following +sequence of events: + +

    +
  1. CPU 1: rcu_read_lock() +
  2. CPU 1: q = rcu_dereference(gp); + /* Very likely to return p. */ +
  3. CPU 0: list_del_rcu(p); +
  4. CPU 0: synchronize_rcu() starts. +
  5. CPU 1: do_something_with(q->a); + /* No smp_mb(), so might happen after kfree(). */ +
  6. CPU 1: rcu_read_unlock() +
  7. CPU 0: synchronize_rcu() returns. +
  8. CPU 0: kfree(p); +
+ +

+Therefore, there absolutely must be a full memory barrier between the +end of the RCU read-side critical section and the end of the +grace period. + +

+The sequence of events demonstrating the necessity of the second rule +is roughly similar: + +

    +
  1. CPU 0: list_del_rcu(p); +
  2. CPU 0: synchronize_rcu() starts. +
  3. CPU 1: rcu_read_lock() +
  4. CPU 1: q = rcu_dereference(gp); + /* Might return p if no memory barrier. */ +
  5. CPU 0: synchronize_rcu() returns. +
  6. CPU 0: kfree(p); +
  7. CPU 1: do_something_with(q->a); /* Boom!!! */ +
  8. CPU 1: rcu_read_unlock() +
+ +

+And similarly, without a memory barrier between the beginning of the +grace period and the beginning of the RCU read-side critical section, +CPU 1 might end up accessing the freelist. + +

+The “as if” rule of course applies, so that any implementation +that acts as if the appropriate memory barriers were in place is a +correct implementation. +That said, it is much easier to fool yourself into believing that you have +adhered to the as-if rule than it is to actually adhere to it! +

@@QQE@@ + +

+Note that these memory-barrier requirements do not replace the fundamental +RCU requirement that a grace period wait for all pre-existing readers. +On the contrary, the memory barriers called out in this section must operate in +such a way as to enforce this fundamental requirement. +Of course, different implementations enforce this requirement in different +ways, but enforce it they must. + +

RCU Primitives Guaranteed to Execute Unconditionally

+ +

+The common-case RCU primitives are unconditional. +They are invoked, they do their job, and they return, with no possibility +of error, and no need to retry. +This is a key RCU design philosophy. + +

+However, this philosophy is pragmatic rather than pigheaded. +If someone comes up with a good justification for a particular conditional +RCU primitive, it might well be implemented and added. +After all, this guarantee was reverse-engineered, not premeditated. +The unconditional nature of the RCU primitives was initially an +accident of implementation, and later experience with synchronization +primitives with conditional primitives caused me to elevate this +accident to a guarantee. +Therefore, the justification for adding a conditional primitive to +RCU would need to be based on detailed and compelling use cases. + +

Guaranteed Read-to-Write Upgrade

+ +

+As far as RCU is concerned, it is always possible to carry out an +update within an RCU read-side critical section. +For example, that RCU read-side critical section might search for +a given data element, and then might acquire the update-side +spinlock in order to update that element, all while remaining +in that RCU read-side critical section. +Of course, it is necessary to exit the RCU read-side critical section +before invoking synchronize_rcu(); however, this +inconvenience can be avoided through use of the +call_rcu() and kfree_rcu() API members +described later in this document. + +

@@QQ@@ +But how does the upgrade-to-write operation exclude other readers? +

@@QQA@@ +It doesn't, just like normal RCU updates, which also do not exclude +RCU readers. +

@@QQE@@ + +

+This guarantee allows lookup code to be shared between read-side +and update-side code, and was premeditated, appearing in the earliest +DYNIX/ptx RCU documentation. + +

Fundamental Non-Requirements

+ +

+RCU provides extremely lightweight readers, and its read-side guarantees, +though quite useful, are correspondingly lightweight. +It is therefore all too easy to assume that RCU is guaranteeing more +than it really is. +Of course, the list of things that RCU does not guarantee is infinitely +long; however, the following sections list a few non-guarantees that +have caused confusion. +Except where otherwise noted, these non-guarantees were premeditated. + +

    +
  1. + Readers Impose Minimal Ordering +
  2. + Readers Do Not Exclude Updaters +
  3. + Updaters Only Wait For Old Readers +
  4. + Grace Periods Don't Partition Read-Side Critical Sections +
  5. + Read-Side Critical Sections Don't Partition Grace Periods +
  6. + Disabling Preemption Does Not Block Grace Periods +
+ +

Readers Impose Minimal Ordering

+ +

+Reader-side markers such as rcu_read_lock() and +rcu_read_unlock() provide absolutely no ordering guarantees +except through their interaction with the grace-period APIs such as +synchronize_rcu(). +To see this, consider the following pair of threads: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(x, 1);
+ 5   rcu_read_unlock();
+ 6   rcu_read_lock();
+ 7   WRITE_ONCE(y, 1);
+ 8   rcu_read_unlock();
+ 9 }
+10
+11 void thread1(void)
+12 {
+13   rcu_read_lock();
+14   r1 = READ_ONCE(y);
+15   rcu_read_unlock();
+16   rcu_read_lock();
+17   r2 = READ_ONCE(x);
+18   rcu_read_unlock();
+19 }
+
+
+ +

+After thread0() and thread1() execute +concurrently, it is quite possible to have + +

+
+(r1 == 1 && r2 == 0)
+
+
+ +(that is, y appears to have been assigned before x), +which would not be possible if rcu_read_lock() and +rcu_read_unlock() had much in the way of ordering +properties. +But they do not, so the CPU is within its rights +to do significant reordering. +This is by design: Any significant ordering constraints would slow down +these fast-path APIs. + +

@@QQ@@ +Can't the compiler also reorder this code? +

@@QQA@@ +No, the volatile casts in READ_ONCE() and +WRITE_ONCE() prevent the compiler from reordering in +this particular case. +

@@QQE@@ + +

Readers Do Not Exclude Updaters

+ +

+Neither rcu_read_lock() nor rcu_read_unlock() +exclude updates. +All they do is to prevent grace periods from ending. +The following example illustrates this: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   r1 = READ_ONCE(y);
+ 5   if (r1) {
+ 6     do_something_with_nonzero_x();
+ 7     r2 = READ_ONCE(x);
+ 8     WARN_ON(!r2); /* BUG!!! */
+ 9   }
+10   rcu_read_unlock();
+11 }
+12
+13 void thread1(void)
+14 {
+15   spin_lock(&my_lock);
+16   WRITE_ONCE(x, 1);
+17   WRITE_ONCE(y, 1);
+18   spin_unlock(&my_lock);
+19 }
+
+
+ +

+If the thread0() function's rcu_read_lock() +excluded the thread1() function's update, +the WARN_ON() could never fire. +But the fact is that rcu_read_lock() does not exclude +much of anything aside from subsequent grace periods, of which +thread1() has none, so the +WARN_ON() can and does fire. + +

Updaters Only Wait For Old Readers

+ +

+It might be tempting to assume that after synchronize_rcu() +completes, there are no readers executing. +This temptation must be avoided because +new readers can start immediately after synchronize_rcu() +starts, and synchronize_rcu() is under no +obligation to wait for these new readers. + +

@@QQ@@ +Suppose that synchronize_rcu() did wait until all readers had completed. +Would the updater be able to rely on this? +

@@QQA@@ +No. +Even if synchronize_rcu() were to wait until +all readers had completed, a new reader might start immediately after +synchronize_rcu() completed. +Therefore, the code following +synchronize_rcu() cannot rely on there being no readers +in any case. +

@@QQE@@ + +

+Grace Periods Don't Partition Read-Side Critical Sections

+ +

+It is tempting to assume that if any part of one RCU read-side critical +section precedes a given grace period, and if any part of another RCU +read-side critical section follows that same grace period, then all of +the first RCU read-side critical section must precede all of the second. +However, this just isn't the case: A single grace period does not +partition the set of RCU read-side critical sections. +An example of this situation can be illustrated as follows, where +a, b, and c are initially all zero: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(a, 1);
+ 5   WRITE_ONCE(b, 1);
+ 6   rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11   r1 = READ_ONCE(a);
+12   synchronize_rcu();
+13   WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18   rcu_read_lock();
+19   r2 = READ_ONCE(b);
+20   r3 = READ_ONCE(c);
+21   rcu_read_unlock();
+22 }
+
+
+ +

+It turns out that the outcome: + +

+
+(r1 == 1 && r2 == 0 && r3 == 1)
+
+
+ +is entirely possible. +The following figure shows how this can happen, with each circled +QS indicating the point at which RCU recorded a +quiescent state for each thread, that is, a state in which +RCU knows that the thread cannot be in the midst of an RCU read-side +critical section that started before the current grace period: + +

GPpartitionReaders1.svg

+ +

+If it is necessary to partition RCU read-side critical sections in this +manner, it is necessary to use two grace periods, where the first +grace period is known to end before the second grace period starts: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(a, 1);
+ 5   WRITE_ONCE(b, 1);
+ 6   rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11   r1 = READ_ONCE(a);
+12   synchronize_rcu();
+13   WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18   r2 = READ_ONCE(c);
+19   synchronize_rcu();
+20   WRITE_ONCE(d, 1);
+21 }
+22
+23 void thread3(void)
+24 {
+25   rcu_read_lock();
+26   r3 = READ_ONCE(b);
+27   r4 = READ_ONCE(d);
+28   rcu_read_unlock();
+29 }
+
+
+ +

+Here, if (r1 == 1), then +thread0()'s write to b must happen +before the end of thread1()'s grace period. +If in addition (r4 == 1), then +thread3()'s read from b must happen +after the beginning of thread2()'s grace period. +If it is also the case that (r2 == 1), then the +end of thread1()'s grace period must precede the +beginning of thread2()'s grace period. +This means that the two RCU read-side critical sections cannot overlap, +guaranteeing that (r3 == 1). +As a result, the outcome: + +

+
+(r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1)
+
+
+ +cannot happen. + +

+This non-requirement was also non-premeditated, but became apparent +when studying RCU's interaction with memory ordering. + +

+Read-Side Critical Sections Don't Partition Grace Periods

+ +

+It is also tempting to assume that if an RCU read-side critical section +happens between a pair of grace periods, then those grace periods cannot +overlap. +However, this temptation leads nowhere good, as can be illustrated by +the following, with all variables initially zero: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(a, 1);
+ 5   WRITE_ONCE(b, 1);
+ 6   rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11   r1 = READ_ONCE(a);
+12   synchronize_rcu();
+13   WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18   rcu_read_lock();
+19   WRITE_ONCE(d, 1);
+20   r2 = READ_ONCE(c);
+21   rcu_read_unlock();
+22 }
+23
+24 void thread3(void)
+25 {
+26   r3 = READ_ONCE(d);
+27   synchronize_rcu();
+28   WRITE_ONCE(e, 1);
+29 }
+30
+31 void thread4(void)
+32 {
+33   rcu_read_lock();
+34   r4 = READ_ONCE(b);
+35   r5 = READ_ONCE(e);
+36   rcu_read_unlock();
+37 }
+
+
+ +

+In this case, the outcome: + +

+
+(r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1)
+
+
+ +is entirely possible, as illustrated below: + +

ReadersPartitionGP1.svg

+ +

+Again, an RCU read-side critical section can overlap almost all of a +given grace period, just so long as it does not overlap the entire +grace period. +As a result, an RCU read-side critical section cannot partition a pair +of RCU grace periods. + +

@@QQ@@ +How long a sequence of grace periods, each separated by an RCU read-side +critical section, would be required to partition the RCU read-side +critical sections at the beginning and end of the chain? +

@@QQA@@ +In theory, an infinite number. +In practice, an unknown number that is sensitive to both implementation +details and timing considerations. +Therefore, even in practice, RCU users must abide by the theoretical rather +than the practical answer. +

@@QQE@@ + +

+Disabling Preemption Does Not Block Grace Periods

+ +

+There was a time when disabling preemption on any given CPU would block +subsequent grace periods. +However, this was an accident of implementation and is not a requirement. +And in the current Linux-kernel implementation, disabling preemption +on a given CPU in fact does not block grace periods, as Oleg Nesterov +demonstrated. + +

+If you need a preempt-disable region to block grace periods, you need to add +rcu_read_lock() and rcu_read_unlock(), for example +as follows: + +

+
+ 1 preempt_disable();
+ 2 rcu_read_lock();
+ 3 do_something();
+ 4 rcu_read_unlock();
+ 5 preempt_enable();
+ 6
+ 7 /* Spinlocks implicitly disable preemption. */
+ 8 spin_lock(&mylock);
+ 9 rcu_read_lock();
+10 do_something();
+11 rcu_read_unlock();
+12 spin_unlock(&mylock);
+
+
+ +

+In theory, you could enter the RCU read-side critical section first, +but it is more efficient to keep the entire RCU read-side critical +section contained in the preempt-disable region as shown above. +Of course, RCU read-side critical sections that extend outside of +preempt-disable regions will work correctly, but such critical sections +can be preempted, which forces rcu_read_unlock() to do +more work. +And no, this is not an invitation to enclose all of your RCU +read-side critical sections within preempt-disable regions, because +doing so would degrade real-time response. + +

+This non-requirement appeared with preemptible RCU. +If you need a grace period that waits on non-preemptible code regions, use +RCU-sched. + +

Parallelism Facts of Life

+ +

+These parallelism facts of life are by no means specific to RCU, but +the RCU implementation must abide by them. +They therefore bear repeating: + +

    +
  1. Any CPU or task may be delayed at any time, + and any attempts to avoid these delays by disabling + preemption, interrupts, or whatever are completely futile. + This is most obvious in preemptible user-level + environments and in virtualized environments (where + a given guest OS's VCPUs can be preempted at any time by + the underlying hypervisor), but can also happen in bare-metal + environments due to ECC errors, NMIs, and other hardware + events. + Although a delay of more than about 20 seconds can result + in splats, the RCU implementation is obligated to use + algorithms that can tolerate extremely long delays, but where + “extremely long” is not long enough to allow + wrap-around when incrementing a 64-bit counter. +
  2. Both the compiler and the CPU can reorder memory accesses. + Where it matters, RCU must use compiler directives and + memory-barrier instructions to preserve ordering. +
  3. Conflicting writes to memory locations in any given cache line + will result in expensive cache misses. + Greater numbers of concurrent writes and more-frequent + concurrent writes will result in more dramatic slowdowns. + RCU is therefore obligated to use algorithms that have + sufficient locality to avoid significant performance and + scalability problems. +
  4. As a rough rule of thumb, only one CPU's worth of processing + may be carried out under the protection of any given exclusive + lock. + RCU must therefore use scalable locking designs. +
  5. Counters are finite, especially on 32-bit systems. + RCU's use of counters must therefore tolerate counter wrap, + or be designed such that counter wrap would take way more + time than a single system is likely to run. + An uptime of ten years is quite possible, a runtime + of a century much less so. + As an example of the latter, RCU's dyntick-idle nesting counter + allows 54 bits for interrupt nesting level (this counter + is 64 bits even on a 32-bit system). + Overflowing this counter requires 2^54 + half-interrupts on a given CPU without that CPU ever going idle. + If a half-interrupt happened every microsecond, it would take + 570 years of runtime to overflow this counter, which is currently + believed to be an acceptably long time. +
  6. Linux systems can have thousands of CPUs running a single + Linux kernel in a single shared-memory environment. + RCU must therefore pay close attention to high-end scalability. +
+ +

+This last parallelism fact of life means that RCU must pay special +attention to the preceding facts of life. +The idea that Linux might scale to systems with thousands of CPUs would +have been met with some skepticism in the 1990s, but these requirements +would otherwise have been unsurprising, even in the early 1990s. + +

Quality-of-Implementation Requirements

+ +

+These sections list quality-of-implementation requirements. +Although an RCU implementation that ignores these requirements could +still be used, it would likely be subject to limitations that would +make it inappropriate for industrial-strength production use. +Classes of quality-of-implementation requirements are as follows: + +

    +
  1. Specialization +
  2. Performance and Scalability +
  3. Composability +
  4. Corner Cases +
+ +

+These classes are covered in the following sections. + +

Specialization

+ +

+RCU is and always has been intended primarily for read-mostly situations, as +illustrated by the following figure. +This means that RCU's read-side primitives are optimized, often at the +expense of its update-side primitives. + +

RCUApplicability.svg

+ +

+This focus on read-mostly situations means that RCU must interoperate +with other synchronization primitives. +For example, the add_gp() and remove_gp_synchronous() +examples discussed earlier use RCU to protect readers and locking to +coordinate updaters. +However, the need extends much farther, requiring that a variety of +synchronization primitives be legal within RCU read-side critical sections, +including spinlocks, sequence locks, atomic operations, reference +counters, and memory barriers. + +

@@QQ@@ +What about sleeping locks? +

@@QQA@@ +These are forbidden within Linux-kernel RCU read-side critical sections +because it is not legal to place a quiescent state (in this case, +voluntary context switch) within an RCU read-side critical section. +However, sleeping locks may be used within userspace RCU read-side critical +sections, and also within Linux-kernel sleepable RCU +(SRCU) +read-side critical sections. +In addition, the -rt patchset turns spinlocks into sleeping locks so +that the corresponding critical sections can be preempted, which +also means that these sleeplockified spinlocks (but not other sleeping locks!) +may be acquired within -rt-Linux-kernel RCU read-side critical sections. + +

+Note that it is legal for a normal RCU read-side critical section +to conditionally acquire a sleeping lock (as in mutex_trylock()), +but only as long as it does not loop indefinitely attempting to +conditionally acquire that sleeping lock. +The key point is that things like mutex_trylock() +either return with the mutex held, or return an error indication if +the mutex was not immediately available. +Either way, mutex_trylock() returns immediately without sleeping. +

@@QQE@@ + +

+It often comes as a surprise that many algorithms do not require a +consistent view of data, but many can function in that mode, +with network routing being the poster child. +Internet routing algorithms take significant time to propagate +updates, so that by the time an update arrives at a given system, +that system has been sending network traffic the wrong way for +a considerable length of time. +Having a few threads continue to send traffic the wrong way for a +few more milliseconds is clearly not a problem: In the worst case, +TCP retransmissions will eventually get the data where it needs to go. +In general, when tracking the state of the universe outside of the +computer, some level of inconsistency must be tolerated due to +speed-of-light delays if nothing else. + +

+Furthermore, uncertainty about external state is inherent in many cases. +For example, a pair of veterinarians might use heartbeat to determine +whether or not a given cat was alive. +But how long should they wait after the last heartbeat to decide that +the cat is in fact dead? +Waiting less than 400 milliseconds makes no sense because this would +mean that a relaxed cat would be considered to cycle between death +and life more than 100 times per minute. +Moreover, just as with human beings, a cat's heart might stop for +some period of time, so the exact wait period is a judgment call. +One of our pair of veterinarians might wait 30 seconds before pronouncing +the cat dead, while the other might insist on waiting a full minute. +The two veterinarians would then disagree on the state of the cat during +the final 30 seconds of the minute following the last heartbeat, as +fancifully illustrated below: + +

2013-08-is-it-dead.png

+ +

+Interestingly enough, this same situation applies to hardware. +When push comes to shove, how do we tell whether or not some +external server has failed? +We send messages to it periodically, and declare it failed if we +don't receive a response within a given period of time. +Policy decisions can usually tolerate short +periods of inconsistency. +The policy was decided some time ago, and is only now being put into +effect, so a few milliseconds of delay is normally inconsequential. + +

+However, there are algorithms that absolutely must see consistent data. +For example, the translation from a user-level SystemV semaphore +ID to the corresponding in-kernel data structure is protected by RCU, +but it is absolutely forbidden to update a semaphore that has just been +removed. +In the Linux kernel, this need for consistency is accommodated by acquiring +spinlocks located in the in-kernel data structure from within +the RCU read-side critical section, and this is indicated by the +green box in the figure above. +Many other techniques may be used, and are in fact used within the +Linux kernel. + +

+In short, RCU is not required to maintain consistency, and other +mechanisms may be used in concert with RCU when consistency is required. +RCU's specialization allows it to do its job extremely well, and its +ability to interoperate with other synchronization mechanisms allows +the right mix of synchronization tools to be used for a given job. + +

Performance and Scalability

+ +

+Energy efficiency is a critical component of performance today, +and Linux-kernel RCU implementations must therefore avoid unnecessarily +awakening idle CPUs. +I cannot claim that this requirement was premeditated. +In fact, I learned of it during a telephone conversation in which I +was given “frank and open” feedback on the importance +of energy efficiency in battery-powered systems and on specific +energy-efficiency shortcomings of the Linux-kernel RCU implementation. +In my experience, the battery-powered embedded community will consider +any unnecessary wakeups to be extremely unfriendly acts. +So much so that mere Linux-kernel-mailing-list posts are +insufficient to vent their ire. + +

+Memory consumption is not particularly important in most +situations, and has become decreasingly +so as memory sizes have expanded and memory +costs have plummeted. +However, as I learned from Matt Mackall's +bloatwatch +efforts, memory footprint is critically important on single-CPU systems with +non-preemptible (CONFIG_PREEMPT=n) kernels, and thus +tiny RCU +was born. +Josh Triplett has since taken over the small-memory banner with his +Linux kernel tinification +project, which resulted in +SRCU +becoming optional for those kernels not needing it. + +

+The remaining performance requirements are, for the most part, +unsurprising. +For example, in keeping with RCU's read-side specialization, +rcu_dereference() should have negligible overhead (for +example, suppression of a few minor compiler optimizations). +Similarly, in non-preemptible environments, rcu_read_lock() and +rcu_read_unlock() should have exactly zero overhead. + +

+In preemptible environments, in the case where the RCU read-side +critical section was not preempted (as will be the case for the +highest-priority real-time process), rcu_read_lock() and +rcu_read_unlock() should have minimal overhead. +In particular, they should not contain atomic read-modify-write +operations, memory-barrier instructions, preemption disabling, +interrupt disabling, or backwards branches. +However, in the case where the RCU read-side critical section was preempted, +rcu_read_unlock() may acquire spinlocks and disable interrupts. +This is why it is better to nest an RCU read-side critical section +within a preempt-disable region than vice versa, at least in cases +where that critical section is short enough to avoid unduly degrading +real-time latencies. + +

+The synchronize_rcu() grace-period-wait primitive is +optimized for throughput. +It may therefore incur several milliseconds of latency in addition to +the duration of the longest RCU read-side critical section. +On the other hand, multiple concurrent invocations of +synchronize_rcu() are required to use batching optimizations +so that they can be satisfied by a single underlying grace-period-wait +operation. +For example, in the Linux kernel, it is not unusual for a single +grace-period-wait operation to serve more than +1,000 separate invocations +of synchronize_rcu(), thus amortizing the per-invocation +overhead down to nearly zero. +However, the grace-period optimization is also required to avoid +measurable degradation of real-time scheduling and interrupt latencies. + +

+In some cases, the multi-millisecond synchronize_rcu() +latencies are unacceptable. +In these cases, synchronize_rcu_expedited() may be used +instead, reducing the grace-period latency down to a few tens of +microseconds on small systems, at least in cases where the RCU read-side +critical sections are short. +There are currently no special latency requirements for +synchronize_rcu_expedited() on large systems, but, +consistent with the empirical nature of the RCU specification, +that is subject to change. +However, there most definitely are scalability requirements: +A storm of synchronize_rcu_expedited() invocations on 4096 +CPUs should at least make reasonable forward progress. +In return for its shorter latencies, synchronize_rcu_expedited() +is permitted to impose modest degradation of real-time latency +on non-idle online CPUs. +That said, it will likely be necessary to take further steps to reduce this +degradation, hopefully to roughly that of a scheduling-clock interrupt. + +

+There are a number of situations where even +synchronize_rcu_expedited()'s reduced grace-period +latency is unacceptable. +In these situations, the asynchronous call_rcu() can be +used in place of synchronize_rcu() as follows: + +

+
+ 1 struct foo {
+ 2   int a;
+ 3   int b;
+ 4   struct rcu_head rh;
+ 5 };
+ 6
+ 7 static void remove_gp_cb(struct rcu_head *rhp)
+ 8 {
+ 9   struct foo *p = container_of(rhp, struct foo, rh);
+10
+11   kfree(p);
+12 }
+13
+14 bool remove_gp_asynchronous(void)
+15 {
+16   struct foo *p;
+17
+18   spin_lock(&gp_lock);
+19   p = rcu_access_pointer(gp);
+20   if (!p) {
+21     spin_unlock(&gp_lock);
+22     return false;
+23   }
+24   rcu_assign_pointer(gp, NULL);
+25   call_rcu(&p->rh, remove_gp_cb);
+26   spin_unlock(&gp_lock);
+27   return true;
+28 }
+
+
+ +

+A definition of struct foo is finally needed, and appears +on lines 1-5. +The function remove_gp_cb() is passed to call_rcu() +on line 25, and will be invoked after the end of a subsequent +grace period. +This gets the same effect as remove_gp_synchronous(), +but without forcing the updater to wait for a grace period to elapse. +The call_rcu() function may be used in a number of +situations where neither synchronize_rcu() nor +synchronize_rcu_expedited() would be legal, +including within preempt-disable code, local_bh_disable() code, +interrupt-disable code, and interrupt handlers. +However, even call_rcu() is illegal within NMI handlers. +The callback function (remove_gp_cb() in this case) will be +executed within softirq (software interrupt) environment within the +Linux kernel, +either within a real softirq handler or under the protection +of local_bh_disable(). +In both the Linux kernel and in userspace, it is bad practice to +write an RCU callback function that takes too long. +Long-running operations should be relegated to separate threads or +(in the Linux kernel) workqueues. + +

@@QQ@@ +Why does line 19 use rcu_access_pointer()? +After all, call_rcu() on line 25 stores into the +structure, which would interact badly with concurrent insertions. +Doesn't this mean that rcu_dereference() is required? +

@@QQA@@ +Presumably the ->gp_lock acquired on line 18 excludes +any changes, including any insertions that rcu_dereference() +would protect against. +Therefore, any insertions will be delayed until after ->gp_lock +is released on line 26, which in turn means that +rcu_access_pointer() suffices. +

@@QQE@@ + +

+However, all that remove_gp_cb() is doing is +invoking kfree() on the data element. +This is a common idiom, and is supported by kfree_rcu(), +which allows “fire and forget” operation as shown below: + +

+
+ 1 struct foo {
+ 2   int a;
+ 3   int b;
+ 4   struct rcu_head rh;
+ 5 };
+ 6
+ 7 bool remove_gp_faf(void)
+ 8 {
+ 9   struct foo *p;
+10
+11   spin_lock(&gp_lock);
+12   p = rcu_dereference(gp);
+13   if (!p) {
+14     spin_unlock(&gp_lock);
+15     return false;
+16   }
+17   rcu_assign_pointer(gp, NULL);
+18   kfree_rcu(p, rh);
+19   spin_unlock(&gp_lock);
+20   return true;
+21 }
+
+
+ +

+Note that remove_gp_faf() simply invokes +kfree_rcu() and proceeds, without any need to pay any +further attention to the subsequent grace period and kfree(). +It is permissible to invoke kfree_rcu() from the same +environments as for call_rcu(). +Interestingly enough, DYNIX/ptx had the equivalents of +call_rcu() and kfree_rcu(), but not +synchronize_rcu(). +This was due to the fact that RCU was not heavily used within DYNIX/ptx, +so the very few places that needed something like +synchronize_rcu() simply open-coded it. + +

@@QQ@@ +Earlier it was claimed that call_rcu() and +kfree_rcu() allowed updaters to avoid being blocked +by readers. +But how can that be correct, given that the invocation of the callback +and the freeing of the memory (respectively) must still wait for +a grace period to elapse? +

@@QQA@@ +We could define things this way, but keep in mind that this sort of +definition would say that updates in garbage-collected languages +cannot complete until the next time the garbage collector runs, +which does not seem at all reasonable. +The key point is that in most cases, an updater using either +call_rcu() or kfree_rcu() can proceed to the +next update as soon as it has invoked call_rcu() or +kfree_rcu(), without having to wait for a subsequent +grace period. +

@@QQE@@ + +

+But what if the updater must wait for the completion of code to be +executed after the end of the grace period, but has other tasks +that can be carried out in the meantime? +The polling-style get_state_synchronize_rcu() and +cond_synchronize_rcu() functions may be used for this +purpose, as shown below: + +

+
+ 1 bool remove_gp_poll(void)
+ 2 {
+ 3   struct foo *p;
+ 4   unsigned long s;
+ 5
+ 6   spin_lock(&gp_lock);
+ 7   p = rcu_access_pointer(gp);
+ 8   if (!p) {
+ 9     spin_unlock(&gp_lock);
+10     return false;
+11   }
+12   rcu_assign_pointer(gp, NULL);
+13   spin_unlock(&gp_lock);
+14   s = get_state_synchronize_rcu();
+15   do_something_while_waiting();
+16   cond_synchronize_rcu(s);
+17   kfree(p);
+18   return true;
+19 }
+
+
+ +

+On line 14, get_state_synchronize_rcu() obtains a +“cookie” from RCU, +then line 15 carries out other tasks, +and finally, line 16 returns immediately if a grace period has +elapsed in the meantime, but otherwise waits as required. +The need for get_state_synchronize_rcu() and +cond_synchronize_rcu() has appeared quite recently, +so it is too early to tell whether they will stand the test of time. + +

+RCU thus provides a range of tools to allow updaters to strike the +required tradeoff between latency, flexibility and CPU overhead. + +

Composability

+ +

+Composability has received much attention in recent years, perhaps in part +due to the collision of multicore hardware with object-oriented techniques +designed in single-threaded environments for single-threaded use. +And in theory, RCU read-side critical sections may be composed, and in +fact may be nested arbitrarily deeply. +In practice, as with all real-world implementations of composable +constructs, there are limitations. + +

+Implementations of RCU for which rcu_read_lock() +and rcu_read_unlock() generate no code, such as +Linux-kernel RCU when CONFIG_PREEMPT=n, can be +nested arbitrarily deeply. +After all, there is no overhead. +Except that if all these instances of rcu_read_lock() +and rcu_read_unlock() are visible to the compiler, +compilation will eventually fail due to exhausting memory, +mass storage, or user patience, whichever comes first. +If the nesting is not visible to the compiler, as is the case with +mutually recursive functions each in its own translation unit, +stack overflow will result. +If the nesting takes the form of loops, either the control variable +will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. +Nevertheless, this class of RCU implementations is one +of the most composable constructs in existence. + +

+RCU implementations that explicitly track nesting depth +are limited by the nesting-depth counter. +For example, the Linux kernel's preemptible RCU limits nesting to +INT_MAX. +This should suffice for almost all practical purposes. +That said, a consecutive pair of RCU read-side critical sections +between which there is an operation that waits for a grace period +cannot be enclosed in another RCU read-side critical section. +This is because it is not legal to wait for a grace period within +an RCU read-side critical section: To do so would result either +in deadlock or +in RCU implicitly splitting the enclosing RCU read-side critical +section, neither of which is conducive to a long-lived and prosperous +kernel. + +

+It is worth noting that RCU is not alone in limiting composability. +For example, many transactional-memory implementations prohibit +composing a pair of transactions separated by an irrevocable +operation (for example, a network receive operation). +For another example, lock-based critical sections can be composed +surprisingly freely, but only if deadlock is avoided. + +

+In short, although RCU read-side critical sections are highly composable, +care is required in some situations, just as is the case for any other +composable synchronization mechanism. + +

Corner Cases

+ +

+A given RCU workload might have an endless and intense stream of +RCU read-side critical sections, perhaps even so intense that there +was never a point in time during which there was not at least one +RCU read-side critical section in flight. +RCU cannot allow this situation to block grace periods: As long as +all the RCU read-side critical sections are finite, grace periods +must also be finite. + +

+That said, preemptible RCU implementations could potentially result +in RCU read-side critical sections being preempted for long durations, +which has the effect of creating a long-duration RCU read-side +critical section. +This situation can arise only in heavily loaded systems, but systems using +real-time priorities are of course more vulnerable. +Therefore, RCU priority boosting is provided to help deal with this +case. +That said, the exact requirements on RCU priority boosting will likely +evolve as more experience accumulates. + +

+Other workloads might have very high update rates. +Although one can argue that such workloads should instead use +something other than RCU, the fact remains that RCU must +handle such workloads gracefully. +This requirement is another factor driving batching of grace periods, +but it is also the driving force behind the checks for large numbers +of queued RCU callbacks in the call_rcu() code path. +Finally, high update rates should not delay RCU read-side critical +sections, although some read-side delays can occur when using +synchronize_rcu_expedited(), courtesy of this function's use +of try_stop_cpus(). +(In the future, synchronize_rcu_expedited() will be +converted to use lighter-weight inter-processor interrupts (IPIs), +but this will still disturb readers, though to a much smaller degree.) + +

+Although all three of these corner cases were understood in the early +1990s, a simple user-level test consisting of close(open(path)) +in a tight loop +in the early 2000s suddenly provided a much deeper appreciation of the +high-update-rate corner case. +This test also motivated addition of some RCU code to react to high update +rates, for example, if a given CPU finds itself with more than 10,000 +RCU callbacks queued, it will cause RCU to take evasive action by +more aggressively starting grace periods and more aggressively forcing +completion of grace-period processing. +This evasive action causes the grace period to complete more quickly, +but at the cost of restricting RCU's batching optimizations, thus +increasing the CPU overhead incurred by that grace period. + +

+Software-Engineering Requirements

+ +

+Between Murphy's Law and “To err is human”, it is necessary to +guard against mishaps and misuse: + +

    +
  1. It is all too easy to forget to use rcu_read_lock() + everywhere that it is needed, so kernels built with + CONFIG_PROVE_RCU=y will splat if + rcu_dereference() is used outside of an + RCU read-side critical section. + Update-side code can use rcu_dereference_protected(), + which takes a + lockdep expression + to indicate what is providing the protection. + If the indicated protection is not provided, a lockdep splat + is emitted. + +

    + Code shared between readers and updaters can use + rcu_dereference_check(), which also takes a + lockdep expression, and emits a lockdep splat if neither + rcu_read_lock() nor the indicated protection + is in place. + In addition, rcu_dereference_raw() is used in those + (hopefully rare) cases where the required protection cannot + be easily described. + Finally, rcu_read_lock_held() is provided to + allow a function to verify that it has been invoked within + an RCU read-side critical section. + I was made aware of this set of requirements shortly after Thomas + Gleixner audited a number of RCU uses. +

  2. A given function might wish to check for RCU-related preconditions + upon entry, before using any other RCU API. + The rcu_lockdep_assert() does this job, + asserting the expression in kernels having lockdep enabled + and doing nothing otherwise. +
  3. It is also easy to forget to use rcu_assign_pointer() + and rcu_dereference(), perhaps (incorrectly) + substituting a simple assignment. + To catch this sort of error, a given RCU-protected pointer may be + tagged with __rcu, after which running sparse + with CONFIG_SPARSE_RCU_POINTER=y will complain + about simple-assignment accesses to that pointer. + Arnd Bergmann made me aware of this requirement, and also + supplied the needed + patch series. +
  4. Kernels built with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y + will splat if a data element is passed to call_rcu() + twice in a row, without a grace period in between. + (This error is similar to a double free.) + The corresponding rcu_head structures that are + dynamically allocated are automatically tracked, but + rcu_head structures allocated on the stack + must be initialized with init_rcu_head_on_stack() + and cleaned up with destroy_rcu_head_on_stack(). + Similarly, statically allocated non-stack rcu_head + structures must be initialized with init_rcu_head() + and cleaned up with destroy_rcu_head(). + Mathieu Desnoyers made me aware of this requirement, and also + supplied the needed + patch. +
  5. An infinite loop in an RCU read-side critical section will + eventually trigger an RCU CPU stall warning splat, with + the duration of “eventually” being controlled by the + RCU_CPU_STALL_TIMEOUT Kconfig option, or, + alternatively, by the + rcupdate.rcu_cpu_stall_timeout boot/sysfs + parameter. + However, RCU is not obligated to produce this splat + unless there is a grace period waiting on that particular + RCU read-side critical section. +

    + Some extreme workloads might intentionally delay + RCU grace periods, and systems running those workloads can + be booted with rcupdate.rcu_cpu_stall_suppress + to suppress the splats. + This kernel parameter may also be set via sysfs. + Furthermore, RCU CPU stall warnings are counter-productive + during sysrq dumps and during panics. + RCU therefore supplies the rcu_sysrq_start() and + rcu_sysrq_end() API members to be called before + and after long sysrq dumps. + RCU also supplies the rcu_panic() notifier that is + automatically invoked at the beginning of a panic to suppress + further RCU CPU stall warnings. + +

    + This requirement made itself known in the early 1990s, pretty + much the first time that it was necessary to debug a CPU stall. + That said, the initial implementation in DYNIX/ptx was quite + generic in comparison with that of Linux. +

  6. Although it would be very good to detect pointers leaking out + of RCU read-side critical sections, there is currently no + good way of doing this. + One complication is the need to distinguish between pointers + leaking and pointers that have been handed off from RCU to + some other synchronization mechanism, for example, reference + counting. +
  7. In kernels built with CONFIG_RCU_TRACE=y, RCU-related + information is provided via both debugfs and event tracing. +
  8. Open-coded use of rcu_assign_pointer() and + rcu_dereference() to create typical linked + data structures can be surprisingly error-prone. + Therefore, RCU-protected + linked lists + and, more recently, RCU-protected + hash tables + are available. + Many other special-purpose RCU-protected data structures are + available in the Linux kernel and the userspace RCU library. +
  9. Some linked structures are created at compile time, but still + require __rcu checking. + The RCU_POINTER_INITIALIZER() macro serves this + purpose. +
  10. It is not necessary to use rcu_assign_pointer() + when creating linked structures that are to be published via + a single external pointer. + The RCU_INIT_POINTER() macro is provided for + this task and also for assigning NULL pointers + at runtime. +
+ +

+This is not a hard-and-fast list: RCU's diagnostic capabilities will +continue to be guided by the number and type of usage bugs found +in real-world RCU usage. + +

Linux Kernel Complications

+ +

+The Linux kernel provides an interesting environment for all kinds of +software, including RCU. +Some of the relevant points of interest are as follows: + +

    +
  1. Configuration. +
  2. Firmware Interface. +
  3. Early Boot. +
  4. + Interrupts and non-maskable interrupts (NMIs). +
  5. Loadable Modules. +
  6. Hotplug CPU. +
  7. Scheduler and RCU. +
  8. Tracing and RCU. +
  9. Energy Efficiency. +
  10. Memory Efficiency. +
  11. + Performance, Scalability, Response Time, and Reliability. +
+ +

+This list is probably incomplete, but it does give a feel for the +most notable Linux-kernel complications. +Each of the following sections covers one of the above topics. + +

Configuration

+ +

+RCU's goal is automatic configuration, so that almost nobody +needs to worry about RCU's Kconfig options. +And for almost all users, RCU does in fact work well +“out of the box.” + +

+However, there are specialized use cases that are handled by +kernel boot parameters and Kconfig options. +Unfortunately, the Kconfig system will explicitly ask users +about new Kconfig options, which requires almost all of them +be hidden behind a CONFIG_RCU_EXPERT Kconfig option. + +

+This all should be quite obvious, but the fact remains that +Linus Torvalds recently had to +remind +me of this requirement. + +

Firmware Interface

+ +

+In many cases, the kernel obtains information about the system from the +firmware, and sometimes things are lost in translation. +Or the translation is accurate, but the original message is bogus. + +

+For example, some systems' firmware overreports the number of CPUs, +sometimes by a large factor. +If RCU naively believed the firmware, as it used to do, +it would create too many per-CPU kthreads. +Although the resulting system will still run correctly, the extra +kthreads needlessly consume memory and can cause confusion +when they show up in ps listings. + +

+RCU must therefore wait for a given CPU to actually come online before +it can allow itself to believe that the CPU actually exists. +The resulting “ghost CPUs” (which are never going to +come online) cause a number of +interesting complications. + +

Early Boot

+ +

+The Linux kernel's boot sequence is an interesting process, +and RCU is used early, even before rcu_init() +is invoked. +In fact, a number of RCU's primitives can be used as soon as the +initial task's task_struct is available and the +boot CPU's per-CPU variables are set up. +The read-side primitives (rcu_read_lock(), +rcu_read_unlock(), rcu_dereference(), +and rcu_access_pointer()) will operate normally very early on, +as will rcu_assign_pointer(). + +

+Although call_rcu() may be invoked at any +time during boot, callbacks are not guaranteed to be invoked until after +the scheduler is fully up and running. +This delay in callback invocation is due to the fact that RCU does not +invoke callbacks until it is fully initialized, and this full initialization +cannot occur until after the scheduler has initialized itself to the +point where RCU can spawn and run its kthreads. +In theory, it would be possible to invoke callbacks earlier, +however, this is not a panacea because there would be severe restrictions +on what operations those callbacks could invoke. + +

+Perhaps surprisingly, synchronize_rcu(), +synchronize_rcu_bh() +(discussed below), +and +synchronize_sched() +will all operate normally +during very early boot, the reason being that there is only one CPU +and preemption is disabled. +This means that the call to synchronize_rcu() (or friends) +itself is a quiescent +state and thus a grace period, so the early-boot implementation can +be a no-op. + +

+Both synchronize_rcu_bh() and synchronize_sched() +continue to operate normally through the remainder of boot, courtesy +of the fact that preemption is disabled across their RCU read-side +critical sections and also courtesy of the fact that there is still +only one CPU. +However, once the scheduler starts initializing, preemption is enabled. +There is still only a single CPU, but the fact that preemption is enabled +means that the no-op implementation of synchronize_rcu() no +longer works in CONFIG_PREEMPT=y kernels. +Therefore, as soon as the scheduler starts initializing, the early-boot +fastpath is disabled. +This means that synchronize_rcu() switches to its runtime +mode of operation where it posts callbacks, which in turn means that +any call to synchronize_rcu() will block until the corresponding +callback is invoked. +Unfortunately, the callback cannot be invoked until RCU's runtime +grace-period machinery is up and running, which cannot happen until +the scheduler has initialized itself sufficiently to allow RCU's +kthreads to be spawned. +Therefore, invoking synchronize_rcu() during scheduler +initialization can result in deadlock. + +

@@QQ@@ +So what happens with synchronize_rcu() during +scheduler initialization for CONFIG_PREEMPT=n +kernels? +

@@QQA@@ +In CONFIG_PREEMPT=n kernels, synchronize_rcu() +maps directly to synchronize_sched(). +Therefore, synchronize_rcu() works normally throughout +boot in CONFIG_PREEMPT=n kernels. +However, your code must also work in CONFIG_PREEMPT=y kernels, +so it is still necessary to avoid invoking synchronize_rcu() +during scheduler initialization. +

@@QQE@@ + +

+I learned of these boot-time requirements as a result of a series of +system hangs. + +

Interrupts and NMIs

+ +

+The Linux kernel has interrupts, and RCU read-side critical sections are +legal within interrupt handlers and within interrupt-disabled regions +of code, as are invocations of call_rcu(). + +

+Some Linux-kernel architectures can enter an interrupt handler from +non-idle process context, and then just never leave it, instead stealthily +transitioning back to process context. +This trick is sometimes used to invoke system calls from inside the kernel. +These “half-interrupts” mean that RCU has to be very careful +about how it counts interrupt nesting levels. +I learned of this requirement the hard way during a rewrite +of RCU's dyntick-idle code. + +

+The Linux kernel has non-maskable interrupts (NMIs), and +RCU read-side critical sections are legal within NMI handlers. +Thankfully, RCU update-side primitives, including +call_rcu(), are prohibited within NMI handlers. + +

+The name notwithstanding, some Linux-kernel architectures +can have nested NMIs, which RCU must handle correctly. +Andy Lutomirski +surprised me +with this requirement; +he also kindly surprised me with +an algorithm +that meets this requirement. + +

Loadable Modules

+ +

+The Linux kernel has loadable modules, and these modules can +also be unloaded. +After a given module has been unloaded, any attempt to call +one of its functions results in a segmentation fault. +The module-unload functions must therefore cancel any +delayed calls to loadable-module functions, for example, +any outstanding mod_timer() must be dealt with +via del_timer_sync() or similar. + +

+Unfortunately, there is no way to cancel an RCU callback; +once you invoke call_rcu(), the callback function is +going to eventually be invoked, unless the system goes down first. +Because it is normally considered socially irresponsible to crash the system +in response to a module unload request, we need some other way +to deal with in-flight RCU callbacks. + +

+RCU therefore provides +rcu_barrier(), +which waits until all in-flight RCU callbacks have been invoked. +If a module uses call_rcu(), its exit function should therefore +prevent any future invocation of call_rcu(), then invoke +rcu_barrier(). +In theory, the underlying module-unload code could invoke +rcu_barrier() unconditionally, but in practice this would +incur unacceptable latencies. + +

+Nikita Danilov noted this requirement for an analogous filesystem-unmount +situation, and Dipankar Sarma incorporated rcu_barrier() into RCU. +The need for rcu_barrier() for module unloading became +apparent later. + +

Hotplug CPU

+ +

+The Linux kernel supports CPU hotplug, which means that CPUs +can come and go. +It is of course illegal to use any RCU API member from an offline CPU. +This requirement was present from day one in DYNIX/ptx, but +on the other hand, the Linux kernel's CPU-hotplug implementation +is “interesting.” + +

+The Linux-kernel CPU-hotplug implementation has notifiers that +are used to allow the various kernel subsystems (including RCU) +to respond appropriately to a given CPU-hotplug operation. +Most RCU operations may be invoked from CPU-hotplug notifiers, +including even normal synchronous grace-period operations +such as synchronize_rcu(). +However, expedited grace-period operations such as +synchronize_rcu_expedited() are not supported, +due to the fact that current implementations block CPU-hotplug +operations, which could result in deadlock. + +

+In addition, all-callback-wait operations such as +rcu_barrier() are also not supported, due to the +fact that there are phases of CPU-hotplug operations where +the outgoing CPU's callbacks will not be invoked until after +the CPU-hotplug operation ends, which could also result in deadlock. + +

Scheduler and RCU

+ +

+RCU depends on the scheduler, and the scheduler uses RCU to +protect some of its data structures. +This means the scheduler is forbidden from acquiring +the runqueue locks and the priority-inheritance locks +in the middle of an outermost RCU read-side critical section unless either +(1) it releases them before exiting that same +RCU read-side critical section, or +(2) interrupts are disabled across +that entire RCU read-side critical section. +This same prohibition also applies (recursively!) to any lock that is acquired +while holding any lock to which this prohibition applies. +Adhering to this rule prevents preemptible RCU from invoking +rcu_read_unlock_special() while either runqueue or +priority-inheritance locks are held, thus avoiding deadlock. + +

+Prior to v4.4, it was only necessary to disable preemption across +RCU read-side critical sections that acquired scheduler locks. +In v4.4, expedited grace periods started using IPIs, and these +IPIs could force a rcu_read_unlock() to take the slowpath. +Therefore, this expedited-grace-period change required disabling of +interrupts, not just preemption. + +

+For RCU's part, the preemptible-RCU rcu_read_unlock() +implementation must be written carefully to avoid similar deadlocks. +In particular, rcu_read_unlock() must tolerate an +interrupt where the interrupt handler invokes both +rcu_read_lock() and rcu_read_unlock(). +This possibility requires rcu_read_unlock() to use +negative nesting levels to avoid destructive recursion via +an interrupt handler's use of RCU. + +

+This pair of mutual scheduler-RCU requirements came as a +complete surprise. + +

+As noted above, RCU makes use of kthreads, and it is necessary to +avoid excessive CPU-time accumulation by these kthreads. +This requirement was no surprise, but RCU's violation of it +when running context-switch-heavy workloads when built with +CONFIG_NO_HZ_FULL=y +did come as a surprise [PDF]. +RCU has made good progress towards meeting this requirement, even +for context-switch-heavy CONFIG_NO_HZ_FULL=y workloads, +but there is room for further improvement. + +

Tracing and RCU

+ +

+It is possible to use tracing on RCU code, but tracing itself +uses RCU. +For this reason, rcu_dereference_raw_notrace() +is provided for use by tracing, which avoids the destructive +recursion that could otherwise ensue. +This API is also used by virtualization in some architectures, +where RCU readers execute in environments in which tracing +cannot be used. +The tracing folks both located the requirement and provided the +needed fix, so this surprise requirement was relatively painless. + +

Energy Efficiency

+ +

+Interrupting idle CPUs is considered socially unacceptable, +especially by people with battery-powered embedded systems. +RCU therefore conserves energy by detecting which CPUs are +idle, including tracking CPUs that have been interrupted from idle. +This is a large part of the energy-efficiency requirement, +so I learned of this via an irate phone call. + +

+Because RCU avoids interrupting idle CPUs, it is illegal to +execute an RCU read-side critical section on an idle CPU. +(Kernels built with CONFIG_PROVE_RCU=y will splat +if you try it.) +The RCU_NONIDLE() macro and _rcuidle +event tracing are provided to work around this restriction. +In addition, rcu_is_watching() may be used to +test whether or not it is currently legal to run RCU read-side +critical sections on this CPU. +I learned of the need for diagnostics on the one hand +and RCU_NONIDLE() on the other while inspecting +idle-loop code. +Steven Rostedt supplied _rcuidle event tracing, +which is used quite heavily in the idle loop. + +

+It is similarly socially unacceptable to interrupt an +nohz_full CPU running in userspace. +RCU must therefore track nohz_full userspace +execution. +And in +CONFIG_NO_HZ_FULL_SYSIDLE=y +kernels, RCU must separately track idle CPUs on the one hand and +CPUs that are either idle or executing in userspace on the other. +In both cases, RCU must be able to sample state at two points in +time, and be able to determine whether or not some other CPU spent +any time idle and/or executing in userspace. + +

+These energy-efficiency requirements have proven quite difficult to +understand and to meet, for example, there have been more than five +clean-sheet rewrites of RCU's energy-efficiency code, the last of +which was finally able to demonstrate +real energy savings running on real hardware [PDF]. +As noted earlier, +I learned of many of these requirements via angry phone calls: +Flaming me on the Linux-kernel mailing list was apparently not +sufficient to fully vent their ire at RCU's energy-efficiency bugs! + +

Memory Efficiency

+ +

+Although small-memory non-realtime systems can simply use Tiny RCU, +code size is only one aspect of memory efficiency. +Another aspect is the size of the rcu_head structure +used by call_rcu() and kfree_rcu(). +Although this structure contains nothing more than a pair of pointers, +it does appear in many RCU-protected data structures, including +some that are size critical. +The page structure is a case in point, as evidenced by +the many occurrences of the union keyword within that structure. + +

+This need for memory efficiency is one reason that RCU uses hand-crafted +singly linked lists to track the rcu_head structures that +are waiting for a grace period to elapse. +It is also the reason why rcu_head structures do not contain +debug information, such as fields tracking the file and line of the +call_rcu() or kfree_rcu() that posted them. +Although this information might appear in debug-only kernel builds at some +point, in the meantime, the ->func field will often provide +the needed debug information. + +

+However, in some cases, the need for memory efficiency leads to even +more extreme measures. +Returning to the page structure, the rcu_head field +shares storage with a great many other structures that are used at +various points in the corresponding page's lifetime. +In order to correctly resolve certain +race conditions, +the Linux kernel's memory-management subsystem needs a particular bit +to remain zero during all phases of grace-period processing, +and that bit happens to map to the bottom bit of the +rcu_head structure's ->next field. +RCU makes this guarantee as long as call_rcu() +is used to post the callback, as opposed to kfree_rcu() +or some future “lazy” +variant of call_rcu() that might one day be created for +energy-efficiency purposes. + +

+Performance, Scalability, Response Time, and Reliability

+ +

+Expanding on the +earlier discussion, +RCU is used heavily by hot code paths in performance-critical +portions of the Linux kernel's networking, security, virtualization, +and scheduling code paths. +RCU must therefore use efficient implementations, especially in its +read-side primitives. +To that end, it would be good if preemptible RCU's implementation +of rcu_read_lock() could be inlined, however, doing +this requires resolving #include issues with the +task_struct structure. + +

+The Linux kernel supports hardware configurations with up to +4096 CPUs, which means that RCU must be extremely scalable. +Algorithms that involve frequent acquisitions of global locks or +frequent atomic operations on global variables simply cannot be +tolerated within the RCU implementation. +RCU therefore makes heavy use of a combining tree based on the +rcu_node structure. +RCU is required to tolerate all CPUs continuously invoking any +combination of RCU's runtime primitives with minimal per-operation +overhead. +In fact, in many cases, increasing load must decrease the +per-operation overhead, witness the batching optimizations for +synchronize_rcu(), call_rcu(), +synchronize_rcu_expedited(), and rcu_barrier(). +As a general rule, RCU must cheerfully accept whatever the +rest of the Linux kernel decides to throw at it. + +

+The Linux kernel is used for real-time workloads, especially +in conjunction with the +-rt patchset. +The real-time-latency response requirements are such that the +traditional approach of disabling preemption across RCU +read-side critical sections is inappropriate. +Kernels built with CONFIG_PREEMPT=y therefore +use an RCU implementation that allows RCU read-side critical +sections to be preempted. +This requirement made its presence known after users made it +clear that an earlier +real-time patch +did not meet their needs, in conjunction with some +RCU issues +encountered by a very early version of the -rt patchset. + +

+In addition, RCU must make do with a sub-100-microsecond real-time latency +budget. +In fact, on smaller systems with the -rt patchset, the Linux kernel +provides sub-20-microsecond real-time latencies for the whole kernel, +including RCU. +RCU's scalability and latency must therefore be sufficient for +these sorts of configurations. +To my surprise, the sub-100-microsecond real-time latency budget + +applies to even the largest systems [PDF], +up to and including systems with 4096 CPUs. +This real-time requirement motivated the grace-period kthread, which +also simplified handling of a number of race conditions. + +

+Finally, RCU's status as a synchronization primitive means that +any RCU failure can result in arbitrary memory corruption that can be +extremely difficult to debug. +This means that RCU must be extremely reliable, which in +practice also means that RCU must have an aggressive stress-test +suite. +This stress-test suite is called rcutorture. + +

+Although the need for rcutorture was no surprise, +the current immense popularity of the Linux kernel is posing +interesting—and perhaps unprecedented—validation +challenges. +To see this, keep in mind that there are well over one billion +instances of the Linux kernel running today, given Android +smartphones, Linux-powered televisions, and servers. +This number can be expected to increase sharply with the advent of +the celebrated Internet of Things. + +

+Suppose that RCU contains a race condition that manifests on average +once per million years of runtime. +This bug will be occurring about three times per day across +the installed base. +RCU could simply hide behind hardware error rates, given that no one +should really expect their smartphone to last for a million years. +However, anyone taking too much comfort from this thought should +consider the fact that in most jurisdictions, a successful multi-year +test of a given mechanism, which might include a Linux kernel, +suffices for a number of types of safety-critical certifications. +In fact, rumor has it that the Linux kernel is already being used +in production for safety-critical applications. +I don't know about you, but I would feel quite bad if a bug in RCU +killed someone. +Which might explain my recent focus on validation and verification. + +

Other RCU Flavors

+ +

+One of the more surprising things about RCU is that there are now +no fewer than five flavors, or API families. +In addition, the primary flavor that has been the sole focus up to +this point has two different implementations, non-preemptible and +preemptible. +The other four flavors are listed below, with requirements for each +described in a separate section. + +

    +
  1. Bottom-Half Flavor +
  2. Sched Flavor +
  3. Sleepable RCU +
  4. Tasks RCU +
+ +

Bottom-Half Flavor

+ +

+The softirq-disable (AKA “bottom-half”, +hence the “_bh” abbreviations) +flavor of RCU, or RCU-bh, was developed by +Dipankar Sarma to provide a flavor of RCU that could withstand the +network-based denial-of-service attacks researched by Robert +Olsson. +These attacks placed so much networking load on the system +that some of the CPUs never exited softirq execution, +which in turn prevented those CPUs from ever executing a context switch, +which, in the RCU implementation of that time, prevented grace periods +from ever ending. +The result was an out-of-memory condition and a system hang. + +

+The solution was the creation of RCU-bh, which does +local_bh_disable() +across its read-side critical sections, and which uses the transition +from one type of softirq processing to another as a quiescent state +in addition to context switch, idle, user mode, and offline. +This means that RCU-bh grace periods can complete even when some of +the CPUs execute in softirq indefinitely, thus allowing algorithms +based on RCU-bh to withstand network-based denial-of-service attacks. + +

+Because +rcu_read_lock_bh() and rcu_read_unlock_bh() +disable and re-enable softirq handlers, any attempt to start a softirq +handler during the +RCU-bh read-side critical section will be deferred. +In this case, rcu_read_unlock_bh() +will invoke softirq processing, which can take considerable time. +One can of course argue that this softirq overhead should be associated +with the code following the RCU-bh read-side critical section rather +than rcu_read_unlock_bh(), but the fact +is that most profiling tools cannot be expected to make this sort +of fine distinction. +For example, suppose that a three-millisecond-long RCU-bh read-side +critical section executes during a time of heavy networking load. +There will very likely be an attempt to invoke at least one softirq +handler during that three milliseconds, but any such invocation will +be delayed until the time of the rcu_read_unlock_bh(). +This can of course make it appear at first glance as if +rcu_read_unlock_bh() was executing very slowly. + +

+The +RCU-bh API +includes +rcu_read_lock_bh(), +rcu_read_unlock_bh(), +rcu_dereference_bh(), +rcu_dereference_bh_check(), +synchronize_rcu_bh(), +synchronize_rcu_bh_expedited(), +call_rcu_bh(), +rcu_barrier_bh(), and +rcu_read_lock_bh_held(). + +

Sched Flavor

+ +

+Before preemptible RCU, waiting for an RCU grace period had the +side effect of also waiting for all pre-existing interrupt +and NMI handlers. +However, there are legitimate preemptible-RCU implementations that +do not have this property, given that any point in the code outside +of an RCU read-side critical section can be a quiescent state. +Therefore, RCU-sched was created, which follows “classic” +RCU in that an RCU-sched grace period waits for pre-existing +interrupt and NMI handlers. +In kernels built with CONFIG_PREEMPT=n, the RCU and RCU-sched +APIs have identical implementations, while kernels built with +CONFIG_PREEMPT=y provide a separate implementation for each. + +

+Note well that in CONFIG_PREEMPT=y kernels, +rcu_read_lock_sched() and rcu_read_unlock_sched() +disable and re-enable preemption, respectively. +This means that if there was a preemption attempt during the +RCU-sched read-side critical section, rcu_read_unlock_sched() +will enter the scheduler, with all the latency and overhead entailed. +Just as with rcu_read_unlock_bh(), this can make it look +as if rcu_read_unlock_sched() was executing very slowly. +However, the highest-priority task won't be preempted, so that task +will enjoy low-overhead rcu_read_unlock_sched() invocations. + +

+The +RCU-sched API +includes +rcu_read_lock_sched(), +rcu_read_unlock_sched(), +rcu_read_lock_sched_notrace(), +rcu_read_unlock_sched_notrace(), +rcu_dereference_sched(), +rcu_dereference_sched_check(), +synchronize_sched(), +synchronize_rcu_sched_expedited(), +call_rcu_sched(), +rcu_barrier_sched(), and +rcu_read_lock_sched_held(). +However, anything that disables preemption also marks an RCU-sched +read-side critical section, including +preempt_disable() and preempt_enable(), +local_irq_save() and local_irq_restore(), +and so on. + +

Sleepable RCU

+ +

+For well over a decade, someone saying “I need to block within +an RCU read-side critical section” was a reliable indication +that this someone did not understand RCU. +After all, if you are always blocking in an RCU read-side critical +section, you can probably afford to use a higher-overhead synchronization +mechanism. +However, that changed with the advent of the Linux kernel's notifiers, +whose RCU read-side critical +sections almost never sleep, but sometimes need to. +This resulted in the introduction of +sleepable RCU, +or SRCU. + +

+SRCU allows different domains to be defined, with each such domain +defined by an instance of an srcu_struct structure. +A pointer to this structure must be passed in to each SRCU function, +for example, synchronize_srcu(&ss), where +ss is the srcu_struct structure. +The key benefit of these domains is that a slow SRCU reader in one +domain does not delay an SRCU grace period in some other domain. +That said, one consequence of these domains is that read-side code +must pass a “cookie” from srcu_read_lock() +to srcu_read_unlock(), for example, as follows: + +

+
+ 1 int idx;
+ 2
+ 3 idx = srcu_read_lock(&ss);
+ 4 do_something();
+ 5 srcu_read_unlock(&ss, idx);
+
+
+ +

+As noted above, it is legal to block within SRCU read-side critical sections; +however, with great power comes great responsibility. +If you block forever in one of a given domain's SRCU read-side critical +sections, then that domain's grace periods will also be blocked forever. +Of course, one good way to block forever is to deadlock, which can +happen if any operation in a given domain's SRCU read-side critical +section can block waiting, either directly or indirectly, for that domain's +grace period to elapse. +For example, this results in a self-deadlock: + +

+
+ 1 int idx;
+ 2
+ 3 idx = srcu_read_lock(&ss);
+ 4 do_something();
+ 5 synchronize_srcu(&ss);
+ 6 srcu_read_unlock(&ss, idx);
+
+
+ +

+However, if line 5 acquired a mutex that was held across +a synchronize_srcu() for domain ss, +deadlock would still be possible. +Furthermore, if line 5 acquired a mutex that was held across +a synchronize_srcu() for some other domain ss1, +and if an ss1-domain SRCU read-side critical section +acquired another mutex that was held across an ss-domain +synchronize_srcu(), +deadlock would again be possible. +Such a deadlock cycle could extend across an arbitrarily large number +of different SRCU domains. +Again, with great power comes great responsibility. + +

+Unlike the other RCU flavors, SRCU read-side critical sections can +run on idle and even offline CPUs. +This ability requires that srcu_read_lock() and +srcu_read_unlock() contain memory barriers, which means +that SRCU readers will run a bit slower than would RCU readers. +It also motivates the smp_mb__after_srcu_read_unlock() +API, which, in combination with srcu_read_unlock(), +guarantees a full memory barrier. + +

+The +SRCU API +includes +srcu_read_lock(), +srcu_read_unlock(), +srcu_dereference(), +srcu_dereference_check(), +synchronize_srcu(), +synchronize_srcu_expedited(), +call_srcu(), +srcu_barrier(), and +srcu_read_lock_held(). +It also includes +DEFINE_SRCU(), +DEFINE_STATIC_SRCU(), and +init_srcu_struct() +APIs for defining and initializing srcu_struct structures. + +

Tasks RCU

+ +

+Some forms of tracing use “trampolines” to handle the +binary rewriting required to install different types of probes. +It would be good to be able to free old trampolines, which sounds +like a job for some form of RCU. +However, because it is necessary to be able to install a trace +anywhere in the code, it is not possible to use read-side markers +such as rcu_read_lock() and rcu_read_unlock(). +In addition, it does not work to have these markers in the trampoline +itself, because there would need to be instructions following +rcu_read_unlock(). +Although synchronize_rcu() would guarantee that execution +reached the rcu_read_unlock(), it would not be able to +guarantee that execution had completely left the trampoline. + +

+The solution, in the form of +Tasks RCU, +is to have implicit +read-side critical sections that are delimited by voluntary context +switches, that is, calls to schedule(), +cond_resched_rcu_qs(), and +synchronize_rcu_tasks(). +In addition, transitions to and from userspace execution also delimit +tasks-RCU read-side critical sections. + +

+The tasks-RCU API is quite compact, consisting only of +call_rcu_tasks(), +synchronize_rcu_tasks(), and +rcu_barrier_tasks(). + +

Possible Future Changes

+ +

+One of the tricks that RCU uses to attain update-side scalability is +to increase grace-period latency with increasing numbers of CPUs. +If this becomes a serious problem, it will be necessary to rework the +grace-period state machine so as to avoid the need for the additional +latency. + +

+Expedited grace periods scan the CPUs, so their latency and overhead +increase with increasing numbers of CPUs. +If this becomes a serious problem on large systems, it will be necessary +to do some redesign to avoid this scalability problem. + +

+RCU disables CPU hotplug in a few places, perhaps most notably in the +expedited grace-period and rcu_barrier() operations. +If there is a strong reason to use expedited grace periods in CPU-hotplug +notifiers, it will be necessary to avoid disabling CPU hotplug. +This would introduce some complexity, so there had better be a very +good reason. + +

+The tradeoff between grace-period latency on the one hand and interruptions +of other CPUs on the other hand may need to be re-examined. +The desire is of course for zero grace-period latency as well as zero +interprocessor interrupts undertaken during an expedited grace period +operation. +While this ideal is unlikely to be achievable, it is quite possible that +further improvements can be made. + +

+The multiprocessor implementations of RCU use a combining tree that +groups CPUs so as to reduce lock contention and increase cache locality. +However, this combining tree does not spread its memory across NUMA +nodes nor does it align the CPU groups with hardware features such +as sockets or cores. +Such spreading and alignment is currently believed to be unnecessary +because the hotpath read-side primitives do not access the combining +tree, nor does call_rcu() in the common case. +If you believe that your architecture needs such spreading and alignment, +then your architecture should also benefit from the +rcutree.rcu_fanout_leaf boot parameter, which can be set +to the number of CPUs in a socket, NUMA node, or whatever. +If the number of CPUs is too large, use a fraction of the number of +CPUs. +If the number of CPUs is a large prime number, well, that certainly +is an “interesting” architectural choice! +More flexible arrangements might be considered, but only if +rcutree.rcu_fanout_leaf has proven inadequate, and only +if the inadequacy has been demonstrated by a carefully run and +realistic system-level workload. + +

+Please note that arrangements that require RCU to remap CPU numbers will +require extremely good demonstration of need and full exploration of +alternatives. + +

+There is an embarrassingly large number of flavors of RCU, and this +number has been increasing over time. +Perhaps it will be possible to combine some at some future date. + +

+RCU's various kthreads are reasonably recent additions. +It is quite likely that adjustments will be required to more gracefully +handle extreme loads. +It might also be necessary to be able to relate CPU utilization by +RCU's kthreads and softirq handlers to the code that instigated this +CPU utilization. +For example, RCU callback overhead might be charged back to the +originating call_rcu() instance, though probably not +in production kernels. + +

Summary

+ +

+This document has presented more than two decades' worth of RCU +requirements. +Given that the requirements keep changing, this will not be the last +word on this subject, but at least it serves to get an important +subset of the requirements set forth. + +

Acknowledgments

+ +I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, +Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and +Andy Lutomirski for their help in rendering +this article human readable, and to Michelle Rankin for her support +of this effort. +Other contributions are acknowledged in the Linux kernel's git archive. +The cartoon is copyright (c) 2013 by Melissa Broussard, +and is provided +under the terms of the Creative Commons Attribution-Share Alike 3.0 +United States license. + +

@@QQAL@@ + + diff --git a/Documentation/RCU/Design/htmlqqz.sh b/Documentation/RCU/Design/htmlqqz.sh new file mode 100755 index 000000000..d354f0695 --- /dev/null +++ b/Documentation/RCU/Design/htmlqqz.sh @@ -0,0 +1,108 @@ +#!/bin/sh +# +# Usage: sh htmlqqz.sh file +# +# Extracts and converts quick quizzes in a proto-HTML document file.htmlx. +# Commands, all of which must be on a line by themselves: +# +# "

@@QQ@@": Start of a quick quiz. +# "

@@QQA@@": Start of a quick-quiz answer. +# "

@@QQE@@": End of a quick-quiz answer, and thus of the quick quiz. +# "

@@QQAL@@": Place to put quick-quiz answer list. +# +# Places the result in file.html. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (c) 2013 Paul E. McKenney, IBM Corporation. + +fn=$1 +if test ! -r $fn.htmlx +then + echo "Error: $fn.htmlx unreadable." + exit 1 +fi + +echo "" > $fn.html +echo "" >> $fn.html +awk < $fn.htmlx >> $fn.html ' + +state == "" && $1 != "

@@QQ@@" && $1 != "

@@QQAL@@" { + print $0; + if ($0 ~ /^

@@QQ/) + print "Bad Quick Quiz command: " NR " (expected

@@QQ@@ or

@@QQAL@@)." > "/dev/stderr" + next; +} + +state == "" && $1 == "

@@QQ@@" { + qqn++; + qqlineno = NR; + haveqq = 1; + state = "qq"; + print "

Quick Quiz " qqn ":" + next; +} + +state == "qq" && $1 != "

@@QQA@@" { + qq[qqn] = qq[qqn] $0 "\n"; + print $0 + if ($0 ~ /^

@@QQ/) + print "Bad Quick Quiz command: " NR ". (expected

@@QQA@@)" > "/dev/stderr" + next; +} + +state == "qq" && $1 == "

@@QQA@@" { + state = "qqa"; + print "
Answer" + next; +} + +state == "qqa" && $1 != "

@@QQE@@" { + qqa[qqn] = qqa[qqn] $0 "\n"; + if ($0 ~ /^

@@QQ/) + print "Bad Quick Quiz command: " NR " (expected

@@QQE@@)." > "/dev/stderr" + next; +} + +state == "qqa" && $1 == "

@@QQE@@" { + state = ""; + next; +} + +state == "" && $1 == "

@@QQAL@@" { + haveqq = ""; + print "

" + print "Answers to Quick Quizzes

" + print ""; + for (i = 1; i <= qqn; i++) { + print "" + print "

Quick Quiz " i ":" + print qq[i]; + print ""; + print "

Answer:" + print qqa[i]; + print ""; + print "

Back to Quick Quiz " i "." + print ""; + } + next; +} + +END { + if (state != "") + print "Unterminated Quick Quiz: " qqlineno "." > "/dev/stderr" + else if (haveqq) + print "Missing \"

@@QQAL@@\", no Quick Quiz." > "/dev/stderr" +}' diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c index f40578026..7785fb5eb 100644 --- a/Documentation/accounting/getdelays.c +++ b/Documentation/accounting/getdelays.c @@ -375,7 +375,8 @@ int main(int argc, char *argv[]) } } - if ((nl_sd = create_nl_socket(NETLINK_GENERIC)) < 0) + nl_sd = create_nl_socket(NETLINK_GENERIC); + if (nl_sd < 0) err(1, "error creating Netlink socket\n"); diff --git a/Documentation/arm/Marvell/README b/Documentation/arm/Marvell/README index 18a775d10..ae89b67d8 100644 --- a/Documentation/arm/Marvell/README +++ b/Documentation/arm/Marvell/README @@ -233,29 +233,30 @@ MMP/MMP2 family (communication processor) Linux kernel mach directory: arch/arm/mach-mmp Linux kernel plat directory: arch/arm/plat-pxa -Berlin family (Digital Entertainment) +Berlin family (Multimedia Solutions) ------------------------------------- Flavors: - 88DE3005, Armada 1500-mini + 88DE3005, Armada 1500 Mini Design name: BG2CD Core: ARM Cortex-A9, PL310 L2CC - Homepage: http://www.marvell.com/digital-entertainment/armada-1500-mini/ + Homepage: http://www.marvell.com/multimedia-solutions/armada-1500-mini/ + 88DE3006, Armada 1500 Mini Plus + Design name: BG2CDP + Core: Dual Core ARM Cortex-A7 + Homepage: http://www.marvell.com/multimedia-solutions/armada-1500-mini-plus/ 88DE3100, Armada 1500 Design name: BG2 Core: Marvell PJ4B (ARMv7), Tauros3 L2CC - Homepage: http://www.marvell.com/digital-entertainment/armada-1500/ - Product Brief: http://www.marvell.com/digital-entertainment/armada-1500/assets/Marvell-ARMADA-1500-Product-Brief.pdf + Product Brief: http://www.marvell.com/multimedia-solutions/armada-1500/assets/Marvell-ARMADA-1500-Product-Brief.pdf 88DE3114, Armada 1500 Pro - Design name: BG2-Q + Design name: BG2Q Core: Quad Core ARM Cortex-A9, PL310 L2CC - Homepage: http://www.marvell.com/digital-entertainment/armada-1500-pro/ - Product Brief: 
http://www.marvell.com/digital-entertainment/armada-1500-pro/assets/Marvell_ARMADA_1500_PRO-01_product_brief.pdf 88DE???? Design name: BG3 Core: ARM Cortex-A15, CA15 integrated L2CC - Homepage: http://www.marvell.com/digital-entertainment/ + Homepage: http://www.marvell.com/multimedia-solutions/ Directory: arch/arm/mach-berlin Comments: diff --git a/Documentation/arm/pxa/mfp.txt b/Documentation/arm/pxa/mfp.txt index a179e5bc0..0b7cab978 100644 --- a/Documentation/arm/pxa/mfp.txt +++ b/Documentation/arm/pxa/mfp.txt @@ -49,7 +49,7 @@ to this new MFP mechanism, here are several key points: internal controllers like PWM, SSP and UART, with 128 internal signals which can be routed to external through one or more MFPs (e.g. GPIO<0> can be routed through either MFP_PIN_GPIO0 as well as MFP_PIN_GPIO0_2, - see arch/arm/mach-pxa/mach/include/mfp-pxa300.h) + see arch/arm/mach-pxa/mfp-pxa300.h) 2. Alternate function configuration is removed from this GPIO controller, the remaining functions are pure GPIO-specific, i.e. @@ -76,11 +76,11 @@ For board code writers, here are some guidelines: 1. include ONE of the following header files in your .c: - - #include - - #include - - #include - - #include - - #include + - #include "mfp-pxa25x.h" + - #include "mfp-pxa27x.h" + - #include "mfp-pxa300.h" + - #include "mfp-pxa320.h" + - #include "mfp-pxa930.h" NOTE: only one file in your .c, depending on the processors used, because pin configuration definitions may conflict in these file (i.e. @@ -203,20 +203,20 @@ make them effective there-after. 1. Unified pin definitions - enum constants for all configurable pins 2. 
processor-neutral bit definitions for a possible MFP configuration - - arch/arm/mach-pxa/include/mach/mfp-pxa3xx.h + - arch/arm/mach-pxa/mfp-pxa3xx.h for PXA3xx specific MFPR register bit definitions and PXA3xx common pin configurations - - arch/arm/mach-pxa/include/mach/mfp-pxa2xx.h + - arch/arm/mach-pxa/mfp-pxa2xx.h for PXA2xx specific definitions and PXA25x/PXA27x common pin configurations - - arch/arm/mach-pxa/include/mach/mfp-pxa25x.h - arch/arm/mach-pxa/include/mach/mfp-pxa27x.h - arch/arm/mach-pxa/include/mach/mfp-pxa300.h - arch/arm/mach-pxa/include/mach/mfp-pxa320.h - arch/arm/mach-pxa/include/mach/mfp-pxa930.h + - arch/arm/mach-pxa/mfp-pxa25x.h + arch/arm/mach-pxa/mfp-pxa27x.h + arch/arm/mach-pxa/mfp-pxa300.h + arch/arm/mach-pxa/mfp-pxa320.h + arch/arm/mach-pxa/mfp-pxa930.h for processor specific definitions diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt new file mode 100644 index 000000000..58b71ddf9 --- /dev/null +++ b/Documentation/arm64/silicon-errata.txt @@ -0,0 +1,58 @@ + Silicon Errata and Software Workarounds + ======================================= + +Author: Will Deacon +Date : 27 November 2015 + +It is an unfortunate fact of life that hardware is often produced with +so-called "errata", which can cause it to deviate from the architecture +under specific circumstances. For hardware produced by ARM, these +errata are broadly classified into the following categories: + + Category A: A critical error without a viable workaround. + Category B: A significant or critical error with an acceptable + workaround. + Category C: A minor error that is not expected to occur under normal + operation. + +For more information, consult one of the "Software Developers Errata +Notice" documents available on infocenter.arm.com (registration +required). + +As far as Linux is concerned, Category B errata may require some special +treatment in the operating system. 
For example, avoiding a particular +sequence of code, or configuring the processor in a particular way. A +less common situation may require similar actions in order to declassify +a Category A erratum into a Category C erratum. These are collectively +known as "software workarounds" and are only required in the minority of +cases (e.g. those cases that both require a non-secure workaround *and* +can be triggered by Linux). + +For software workarounds that may adversely impact systems unaffected by +the erratum in question, a Kconfig entry is added under "Kernel +Features" -> "ARM errata workarounds via the alternatives framework". +These are enabled by default and patched in at runtime when an affected +CPU is detected. For less-intrusive workarounds, a Kconfig option is not +available and the code is structured (preferably with a comment) in such +a way that the erratum will not be hit. + +This approach can make it slightly onerous to determine exactly which +errata are worked around in an arbitrary kernel source tree, so this +file acts as a registry of software workarounds in the Linux Kernel and +will be updated when new workarounds are committed and backported to +stable kernels. 
+ +| Implementor | Component | Erratum ID | Kconfig | ++----------------+-----------------+-----------------+-------------------------+ +| ARM | Cortex-A53 | #826319 | ARM64_ERRATUM_826319 | +| ARM | Cortex-A53 | #827319 | ARM64_ERRATUM_827319 | +| ARM | Cortex-A53 | #824069 | ARM64_ERRATUM_824069 | +| ARM | Cortex-A53 | #819472 | ARM64_ERRATUM_819472 | +| ARM | Cortex-A53 | #845719 | ARM64_ERRATUM_845719 | +| ARM | Cortex-A53 | #843419 | ARM64_ERRATUM_843419 | +| ARM | Cortex-A57 | #832075 | ARM64_ERRATUM_832075 | +| ARM | Cortex-A57 | #852523 | N/A | +| ARM | Cortex-A57 | #834220 | ARM64_ERRATUM_834220 | +| | | | | +| Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 | +| Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 | diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt index f3bc72945..1e4f835a6 100644 --- a/Documentation/block/cfq-iosched.txt +++ b/Documentation/block/cfq-iosched.txt @@ -81,14 +81,13 @@ on higher end storage. Default value for this parameter is 8ms. -latency -------- -This parameter is used to enable/disable the latency mode of the CFQ -scheduler. If latency mode (called low_latency) is enabled, CFQ tries -to recompute the slice time for each process based on the target_latency set -for the system. This favors fairness over throughput. Disabling low -latency (setting it to 0) ignores target latency, allowing each process in the -system to get a full time slice. +low_latency +----------- +This parameter is used to enable/disable the low latency mode of the CFQ +scheduler. If enabled, CFQ tries to recompute the slice time for each process +based on the target_latency set for the system. This favors fairness over +throughput. Disabling low latency (setting it to 0) ignores target latency, +allowing each process in the system to get a full time slice. By default low latency mode is enabled. 
diff --git a/Documentation/cgroup-v1/00-INDEX b/Documentation/cgroup-v1/00-INDEX new file mode 100644 index 000000000..6ad425f7c --- /dev/null +++ b/Documentation/cgroup-v1/00-INDEX @@ -0,0 +1,28 @@ +00-INDEX + - this file +blkio-controller.txt + - Description for Block IO Controller, implementation and usage details. +cgroups.txt + - Control Groups definition, implementation details, examples and API. +cpuacct.txt + - CPU Accounting Controller; account CPU usage for groups of tasks. +cpusets.txt + - documents the cpusets feature; assign CPUs and Mem to a set of tasks. +devices.txt + - Device Whitelist Controller; description, interface and security. +freezer-subsystem.txt + - checkpointing; rationale to not use signals, interface. +hugetlb.txt + - HugeTLB Controller implementation and usage details. +memcg_test.txt + - Memory Resource Controller; implementation details. +memory.txt + - Memory Resource Controller; design, accounting, interface, testing. +net_cls.txt + - Network classifier cgroups details and usages. +net_prio.txt + - Network priority cgroups details and usages. +pids.txt + - Process number cgroups details and usages. +unified-hierarchy.txt + - Description of the new/next cgroup interface. diff --git a/Documentation/cgroup-v1/blkio-controller.txt b/Documentation/cgroup-v1/blkio-controller.txt new file mode 100644 index 000000000..673dc34d3 --- /dev/null +++ b/Documentation/cgroup-v1/blkio-controller.txt @@ -0,0 +1,375 @@ + Block IO Controller + =================== +Overview +======== +cgroup subsys "blkio" implements the block io controller. There seems to be +a need of various kinds of IO control policies (like proportional BW, max BW) +both at leaf nodes as well as at intermediate nodes in a storage hierarchy. +Plan is to use the same cgroup based management interface for blkio controller +and based on user options switch IO policies in the background. + +Currently two IO control policies are implemented. 
First one is proportional +weight time based division of disk policy. It is implemented in CFQ. Hence +this policy takes effect only on leaf nodes when CFQ is being used. The second +one is throttling policy which can be used to specify upper IO rate limits +on devices. This policy is implemented in generic block layer and can be +used on leaf nodes as well as higher level logical devices like device mapper. + +HOWTO +===== +Proportional Weight division of bandwidth +----------------------------------------- +You can do a very simple testing of running two dd threads in two different +cgroups. Here is what you can do. + +- Enable Block IO controller + CONFIG_BLK_CGROUP=y + +- Enable group scheduling in CFQ + CONFIG_CFQ_GROUP_IOSCHED=y + +- Compile and boot into kernel and mount IO controller (blkio); see + cgroups.txt, Why are cgroups needed?. + + mount -t tmpfs cgroup_root /sys/fs/cgroup + mkdir /sys/fs/cgroup/blkio + mount -t cgroup -o blkio none /sys/fs/cgroup/blkio + +- Create two cgroups + mkdir -p /sys/fs/cgroup/blkio/test1/ /sys/fs/cgroup/blkio/test2 + +- Set weights of group test1 and test2 + echo 1000 > /sys/fs/cgroup/blkio/test1/blkio.weight + echo 500 > /sys/fs/cgroup/blkio/test2/blkio.weight + +- Create two same size files (say 512MB each) on same disk (file1, file2) and + launch two dd threads in different cgroup to read those files. + + sync + echo 3 > /proc/sys/vm/drop_caches + + dd if=/mnt/sdb/zerofile1 of=/dev/null & + echo $! > /sys/fs/cgroup/blkio/test1/tasks + cat /sys/fs/cgroup/blkio/test1/tasks + + dd if=/mnt/sdb/zerofile2 of=/dev/null & + echo $! > /sys/fs/cgroup/blkio/test2/tasks + cat /sys/fs/cgroup/blkio/test2/tasks + +- At macro level, first dd should finish first. To get more precise data, keep + on looking at (with the help of script), at blkio.disk_time and + blkio.disk_sectors files of both test1 and test2 groups. 
This will tell how + much disk time (in milliseconds), each group got and how many sectors each + group dispatched to the disk. We provide fairness in terms of disk time, so + ideally blkio.disk_time of cgroups should be in proportion to the weight. + +Throttling/Upper Limit policy +----------------------------- +- Enable Block IO controller + CONFIG_BLK_CGROUP=y + +- Enable throttling in block layer + CONFIG_BLK_DEV_THROTTLING=y + +- Mount blkio controller (see cgroups.txt, Why are cgroups needed?) + mount -t cgroup -o blkio none /sys/fs/cgroup/blkio + +- Specify a bandwidth rate on a particular device for root group. The format + for policy is "dev_maj:dev_minor bytes_per_second". + + echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device + + Above will put a limit of 1MB/second on reads happening for root group + on device having major/minor number 8:16. + +- Run dd to read a file and see if rate is throttled to 1MB/s or not. + + # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 + 1024+0 records in + 1024+0 records out + 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s + + Limits for writes can be put using blkio.throttle.write_bps_device file. + +Hierarchical Cgroups +==================== + +Both CFQ and throttling implement hierarchy support; however, +throttling's hierarchy support is enabled iff "sane_behavior" is +enabled from cgroup side, which currently is a development option and +not publicly available. + +Suppose somebody created a hierarchy as follows. + + root + / \ + test1 test2 + | + test3 + +CFQ by default and throttling with "sane_behavior" will handle the +hierarchy correctly. For details on CFQ hierarchy support, refer to +Documentation/block/cfq-iosched.txt. For throttling, all limits apply +to the whole subtree while all statistics are local to the IOs +directly generated by tasks in that cgroup. 
+ +Throttling without "sane_behavior" enabled from cgroup side will +practically treat all groups at same level as if it looks like the +following. + + pivot + / / \ \ + root test1 test2 test3 + +Various user visible config options +=================================== +CONFIG_BLK_CGROUP + - Block IO controller. + +CONFIG_DEBUG_BLK_CGROUP + - Debug help. Right now some additional stats file show up in cgroup + if this option is enabled. + +CONFIG_CFQ_GROUP_IOSCHED + - Enables group scheduling in CFQ. Currently only 1 level of group + creation is allowed. + +CONFIG_BLK_DEV_THROTTLING + - Enable block device throttling support in block layer. + +Details of cgroup files +======================= +Proportional weight policy files +-------------------------------- +- blkio.weight + - Specifies per cgroup weight. This is default weight of the group + on all the devices until and unless overridden by per device rule. + (See blkio.weight_device). + Currently allowed range of weights is from 10 to 1000. + +- blkio.weight_device + - One can specify per cgroup per device rules using this interface. + These rules override the default value of group weight as specified + by blkio.weight. + + Following is the format. + + # echo dev_maj:dev_minor weight > blkio.weight_device + Configure weight=300 on /dev/sdb (8:16) in this cgroup + # echo 8:16 300 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:16 300 + + Configure weight=500 on /dev/sda (8:0) in this cgroup + # echo 8:0 500 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:0 500 + 8:16 300 + + Remove specific weight for /dev/sda in this cgroup + # echo 8:0 0 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:16 300 + +- blkio.leaf_weight[_device] + - Equivalents of blkio.weight[_device] for the purpose of + deciding how much weight tasks in the given cgroup has while + competing with the cgroup's child cgroups. For details, + please refer to Documentation/block/cfq-iosched.txt. 
+ +- blkio.time + - disk time allocated to cgroup per device in milliseconds. First + two fields specify the major and minor number of the device and + third field specifies the disk time allocated to group in + milliseconds. + +- blkio.sectors + - number of sectors transferred to/from disk by the group. First + two fields specify the major and minor number of the device and + third field specifies the number of sectors transferred by the + group to/from the device. + +- blkio.io_service_bytes + - Number of bytes transferred to/from the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of bytes. + +- blkio.io_serviced + - Number of IOs (bio) issued to the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of IOs. + +- blkio.io_service_time + - Total amount of time between request dispatch and request completion + for the IOs done by this cgroup. This is in nanoseconds to make it + meaningful for flash devices too. For devices with queue depth of 1, + this time represents the actual service time. When queue_depth > 1, + that is no longer true as requests may be served out of order. This + may cause the service time for a given IO to include the service time + of multiple IOs when served out of order which may result in total + io_service_time > actual time elapsed. This time is further divided by + the type of operation - read or write, sync or async. First two fields + specify the major and minor number of the device, third field + specifies the operation type and the fourth field specifies the + io_service_time in ns. 
+ +- blkio.io_wait_time + - Total amount of time the IOs for this cgroup spent waiting in the + scheduler queues for service. This can be greater than the total time + elapsed since it is cumulative io_wait_time for all IOs. It is not a + measure of total time the cgroup spent waiting but rather a measure of + the wait_time for its individual IOs. For devices with queue_depth > 1 + this metric does not include the time spent waiting for service once + the IO is dispatched to the device but till it actually gets serviced + (there might be a time lag here due to re-ordering of requests by the + device). This is in nanoseconds to make it meaningful for flash + devices too. This time is further divided by the type of operation - + read or write, sync or async. First two fields specify the major and + minor number of the device, third field specifies the operation type + and the fourth field specifies the io_wait_time in ns. + +- blkio.io_merged + - Total number of bios/requests merged into requests belonging to this + cgroup. This is further divided by the type of operation - read or + write, sync or async. + +- blkio.io_queued + - Total number of requests queued up at any given instant for this + cgroup. This is further divided by the type of operation - read or + write, sync or async. + +- blkio.avg_queue_size + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + The average queue size for this cgroup over the entire time of this + cgroup's existence. Queue size samples are taken each time one of the + queues of this cgroup gets a timeslice. + +- blkio.group_wait_time + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + This is the amount of time the cgroup had to wait since it became busy + (i.e., went from 0 to 1 request queued) to get a timeslice for one of + its queues. This is different from the io_wait_time which is the + cumulative total of the amount of time spent by each IO in that cgroup + waiting in the scheduler queue. 
This is in nanoseconds. If this is + read when the cgroup is in a waiting (for timeslice) state, the stat + will only report the group_wait_time accumulated till the last time it + got a timeslice and will not include the current delta. + +- blkio.empty_time + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + This is the amount of time a cgroup spends without any pending + requests when not being served, i.e., it does not include any time + spent idling for one of the queues of the cgroup. This is in + nanoseconds. If this is read when the cgroup is in an empty state, + the stat will only report the empty_time accumulated till the last + time it had a pending request and will not include the current delta. + +- blkio.idle_time + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + This is the amount of time spent by the IO scheduler idling for a + given cgroup in anticipation of a better request than the existing ones + from other queues/cgroups. This is in nanoseconds. If this is read + when the cgroup is in an idling state, the stat will only report the + idle_time accumulated till the last idle period and will not include + the current delta. + +- blkio.dequeue + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This + gives the statistics about how many times a group was dequeued + from the service tree of the device. First two fields specify the major + and minor number of the device and third field specifies the number + of times a group was dequeued from a particular device. + +- blkio.*_recursive + - Recursive version of various stats. These files show the + same information as their non-recursive counterparts but + include stats from all the descendant cgroups. + +Throttling/Upper limit policy files +----------------------------------- +- blkio.throttle.read_bps_device + - Specifies upper limit on READ rate from the device. IO rate is + specified in bytes per second. Rules are per device. Following is + the format. 
+ + echo "dev_maj:dev_minor rate_bytes_per_second" > /cgrp/blkio.throttle.read_bps_device + +- blkio.throttle.write_bps_device + - Specifies upper limit on WRITE rate to the device. IO rate is + specified in bytes per second. Rules are per device. Following is + the format. + + echo "dev_maj:dev_minor rate_bytes_per_second" > /cgrp/blkio.throttle.write_bps_device + +- blkio.throttle.read_iops_device + - Specifies upper limit on READ rate from the device. IO rate is + specified in IO per second. Rules are per device. Following is + the format. + + echo "dev_maj:dev_minor rate_io_per_second" > /cgrp/blkio.throttle.read_iops_device + +- blkio.throttle.write_iops_device + - Specifies upper limit on WRITE rate to the device. IO rate is + specified in IO per second. Rules are per device. Following is + the format. + + echo "dev_maj:dev_minor rate_io_per_second" > /cgrp/blkio.throttle.write_iops_device + +Note: If both BW and IOPS rules are specified for a device, then IO is + subjected to both the constraints. + +- blkio.throttle.io_serviced + - Number of IOs (bio) issued to the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of IOs. + +- blkio.throttle.io_service_bytes + - Number of bytes transferred to/from the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of bytes. + +Common files among various policies +----------------------------------- +- blkio.reset_stats + - Writing an int to this file will result in resetting all the stats + for that cgroup. + +CFQ sysfs tunable +================= +/sys/block/{disk}/queue/iosched/slice_idle +------------------------------------------ +On faster hardware CFQ can be slow, especially with sequential workload. 
+This happens because CFQ idles on a single queue and single queue might not +drive deeper request queue depths to keep the storage busy. In such scenarios +one can try setting slice_idle=0 and that would switch CFQ to IOPS +(IO operations per second) mode on NCQ supporting hardware. + +That means CFQ will not idle between cfq queues of a cfq group and hence be +able to drive higher queue depth and achieve better throughput. That also +means that cfq provides fairness among groups in terms of IOPS and not in +terms of disk time. + +/sys/block/{disk}/queue/iosched/group_idle +------------------------------------------ +If one disables idling on individual cfq queues and cfq service trees by +setting slice_idle=0, group_idle kicks in. That means CFQ will still idle +on the group in an attempt to provide fairness among groups. + +By default group_idle is same as slice_idle and does not do anything if +slice_idle is enabled. + +One can experience an overall throughput drop if one has created multiple +groups and put applications in those groups which are not driving enough +IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle +on individual groups and throughput should improve. diff --git a/Documentation/cgroup-v1/cgroups.txt b/Documentation/cgroup-v1/cgroups.txt new file mode 100644 index 000000000..c6256ae98 --- /dev/null +++ b/Documentation/cgroup-v1/cgroups.txt @@ -0,0 +1,682 @@ + CGROUPS + ------- + +Written by Paul Menage based on +Documentation/cgroups/cpusets.txt + +Original copyright statements from cpusets.txt: +Portions Copyright (C) 2004 BULL SA. +Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. +Modified by Paul Jackson +Modified by Christoph Lameter + +CONTENTS: +========= + +1. Control Groups + 1.1 What are cgroups ? + 1.2 Why are cgroups needed ? + 1.3 How are cgroups implemented ? + 1.4 What does notify_on_release do ? + 1.5 What does clone_children do ? + 1.6 How do I use cgroups ? +2. 
Usage Examples and Syntax + 2.1 Basic Usage + 2.2 Attaching processes + 2.3 Mounting hierarchies by name +3. Kernel API + 3.1 Overview + 3.2 Synchronization + 3.3 Subsystem API +4. Extended attributes usage +5. Questions + +1. Control Groups +================= + +1.1 What are cgroups ? +---------------------- + +Control Groups provide a mechanism for aggregating/partitioning sets of +tasks, and all their future children, into hierarchical groups with +specialized behaviour. + +Definitions: + +A *cgroup* associates a set of tasks with a set of parameters for one +or more subsystems. + +A *subsystem* is a module that makes use of the task grouping +facilities provided by cgroups to treat groups of tasks in +particular ways. A subsystem is typically a "resource controller" that +schedules a resource or applies per-cgroup limits, but it may be +anything that wants to act on a group of processes, e.g. a +virtualization subsystem. + +A *hierarchy* is a set of cgroups arranged in a tree, such that +every task in the system is in exactly one of the cgroups in the +hierarchy, and a set of subsystems; each subsystem has system-specific +state attached to each cgroup in the hierarchy. Each hierarchy has +an instance of the cgroup virtual filesystem associated with it. + +At any one time there may be multiple active hierarchies of task +cgroups. Each hierarchy is a partition of all tasks in the system. + +User-level code may create and destroy cgroups by name in an +instance of the cgroup virtual file system, specify and query to +which cgroup a task is assigned, and list the task PIDs assigned to +a cgroup. Those creations and assignments only affect the hierarchy +associated with that instance of the cgroup file system. + +On their own, the only use for cgroups is for simple job +tracking. 
The intention is that other subsystems hook into the generic +cgroup support to provide new attributes for cgroups, such as +accounting/limiting the resources which processes in a cgroup can +access. For example, cpusets (see Documentation/cgroups/cpusets.txt) allow +you to associate a set of CPUs and a set of memory nodes with the +tasks in each cgroup. + +1.2 Why are cgroups needed ? +---------------------------- + +There are multiple efforts to provide process aggregations in the +Linux kernel, mainly for resource-tracking purposes. Such efforts +include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server +namespaces. These all require the basic notion of a +grouping/partitioning of processes, with newly forked processes ending +up in the same group (cgroup) as their parent process. + +The kernel cgroup patch provides the minimum essential kernel +mechanisms required to efficiently implement such groups. It has +minimal impact on the system fast paths, and provides hooks for +specific subsystems such as cpusets to provide additional behaviour as +desired. + +Multiple hierarchy support is provided to allow for situations where +the division of tasks into cgroups is distinctly different for +different subsystems - having parallel hierarchies allows each +hierarchy to be a natural division of tasks, without having to handle +complex combinations of tasks that would be present if several +unrelated subsystems needed to be forced into the same tree of +cgroups. + +At one extreme, each resource controller or subsystem could be in a +separate hierarchy; at the other extreme, all subsystems +would be attached to the same hierarchy. + +As an example of a scenario (originally proposed by vatsa@in.ibm.com) +that can benefit from multiple hierarchies, consider a large +university server with various users - students, professors, system +tasks etc. 
The resource planning for this server could be along the +following lines: + + CPU : "Top cpuset" + / \ + CPUSet1 CPUSet2 + | | + (Professors) (Students) + + In addition (system tasks) are attached to topcpuset (so + that they can run anywhere) with a limit of 20% + + Memory : Professors (50%), Students (30%), system (20%) + + Disk : Professors (50%), Students (30%), system (20%) + + Network : WWW browsing (20%), Network File System (60%), others (20%) + / \ + Professors (15%) students (5%) + +Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes +into the NFS network class. + +At the same time Firefox/Lynx will share an appropriate CPU/Memory class +depending on who launched it (prof/student). + +With the ability to classify tasks differently for different resources +(by putting those resource subsystems in different hierarchies), +the admin can easily set up a script which receives exec notifications +and depending on who is launching the browser he can + + # echo browser_pid > /sys/fs/cgroup/{restype}/{userclass}/tasks + +With only a single hierarchy, he now would potentially have to create +a separate cgroup for every browser launched and associate it with +appropriate network and other resource class. This may lead to +proliferation of such cgroups. + +Also let's say that the administrator would like to give enhanced network +access temporarily to a student's browser (since it is night and the user +wants to do online gaming :)) OR give one of the student's simulation +apps enhanced CPU power. + +With the ability to write PIDs directly to resource classes, it's just a +matter of: + + # echo pid > /sys/fs/cgroup/network/{new_class}/tasks + (after some time) + # echo pid > /sys/fs/cgroup/network/{orig_class}/tasks + +Without this ability, the administrator would have to split the cgroup into +multiple separate ones and then associate the new cgroups with the +new resource classes. + + + +1.3 How are cgroups implemented ? 
+--------------------------------- + +Control Groups extends the kernel as follows: + + - Each task in the system has a reference-counted pointer to a + css_set. + + - A css_set contains a set of reference-counted pointers to + cgroup_subsys_state objects, one for each cgroup subsystem + registered in the system. There is no direct link from a task to + the cgroup of which it's a member in each hierarchy, but this + can be determined by following pointers through the + cgroup_subsys_state objects. This is because accessing the + subsystem state is something that's expected to happen frequently + and in performance-critical code, whereas operations that require a + task's actual cgroup assignments (in particular, moving between + cgroups) are less common. A linked list runs through the cg_list + field of each task_struct using the css_set, anchored at + css_set->tasks. + + - A cgroup hierarchy filesystem can be mounted for browsing and + manipulation from user space. + + - You can list all the tasks (by PID) attached to any cgroup. + +The implementation of cgroups requires a few, simple hooks +into the rest of the kernel, none in performance-critical paths: + + - in init/main.c, to initialize the root cgroups and initial + css_set at system boot. + + - in fork and exit, to attach and detach a task from its css_set. + +In addition, a new file system of type "cgroup" may be mounted, to +enable browsing and modifying the cgroups presently known to the +kernel. When mounting a cgroup hierarchy, you may specify a +comma-separated list of subsystems to mount as the filesystem mount +options. By default, mounting the cgroup filesystem attempts to +mount a hierarchy containing all registered subsystems. + +If an active hierarchy with exactly the same set of subsystems already +exists, it will be reused for the new mount. If no existing hierarchy +matches, and any of the requested subsystems are in use in an existing +hierarchy, the mount will fail with -EBUSY. 
Otherwise, a new hierarchy +is activated, associated with the requested subsystems. + +It's not currently possible to bind a new subsystem to an active +cgroup hierarchy, or to unbind a subsystem from an active cgroup +hierarchy. This may be possible in future, but is fraught with nasty +error-recovery issues. + +When a cgroup filesystem is unmounted, if there are any +child cgroups created below the top-level cgroup, that hierarchy +will remain active even though unmounted; if there are no +child cgroups then the hierarchy will be deactivated. + +No new system calls are added for cgroups - all support for +querying and modifying cgroups is via this cgroup file system. + +Each task under /proc has an added file named 'cgroup' displaying, +for each active hierarchy, the subsystem names and the cgroup name +as the path relative to the root of the cgroup file system. + +Each cgroup is represented by a directory in the cgroup file system +containing the following files describing that cgroup: + + - tasks: list of tasks (by PID) attached to that cgroup. This list + is not guaranteed to be sorted. Writing a thread ID into this file + moves the thread into this cgroup. + - cgroup.procs: list of thread group IDs in the cgroup. This list is + not guaranteed to be sorted or free of duplicate TGIDs, and userspace + should sort/uniquify the list if this property is required. + Writing a thread group ID into this file moves all threads in that + group into this cgroup. + - notify_on_release flag: run the release agent on exit? + - release_agent: the path to use for release notifications (this file + exists in the top cgroup only) + +Other subsystems such as cpusets may add additional files in each +cgroup dir. + +New cgroups are created using the mkdir system call or shell +command. The properties of a cgroup, such as its flags, are +modified by writing to the appropriate file in that cgroups +directory, as listed above. 
+ +The named hierarchical structure of nested cgroups allows partitioning +a large system into nested, dynamically changeable, "soft-partitions". + +The attachment of each task, automatically inherited at fork by any +children of that task, to a cgroup allows organizing the work load +on a system into related sets of tasks. A task may be re-attached to +any other cgroup, if allowed by the permissions on the necessary +cgroup file system directories. + +When a task is moved from one cgroup to another, it gets a new +css_set pointer - if there's an already existing css_set with the +desired collection of cgroups then that group is reused, otherwise a new +css_set is allocated. The appropriate existing css_set is located by +looking into a hash table. + +To allow access from a cgroup to the css_sets (and hence tasks) +that comprise it, a set of cg_cgroup_link objects form a lattice; +each cg_cgroup_link is linked into a list of cg_cgroup_links for +a single cgroup on its cgrp_link_list field, and a list of +cg_cgroup_links for a single css_set on its cg_link_list. + +Thus the set of tasks in a cgroup can be listed by iterating over +each css_set that references the cgroup, and sub-iterating over +each css_set's task set. + +The use of a Linux virtual file system (vfs) to represent the +cgroup hierarchy provides for a familiar permission and name space +for cgroups, with a minimum of additional kernel code. + +1.4 What does notify_on_release do ? +------------------------------------ + +If the notify_on_release flag is enabled (1) in a cgroup, then +whenever the last task in the cgroup leaves (exits or attaches to +some other cgroup) and the last child cgroup of that cgroup +is removed, then the kernel runs the command specified by the contents +of the "release_agent" file in that hierarchy's root directory, +supplying the pathname (relative to the mount point of the cgroup +file system) of the abandoned cgroup. This enables automatic +removal of abandoned cgroups. 
The default value of +notify_on_release in the root cgroup at system boot is disabled +(0). The default value of other cgroups at creation is the current +value of their parents' notify_on_release settings. The default value of +a cgroup hierarchy's release_agent path is empty. + +1.5 What does clone_children do ? +--------------------------------- + +This flag only affects the cpuset controller. If the clone_children +flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its +configuration from the parent during initialization. + +1.6 How do I use cgroups ? +-------------------------- + +To start a new job that is to be contained within a cgroup, using +the "cpuset" cgroup subsystem, the steps are something like: + + 1) mount -t tmpfs cgroup_root /sys/fs/cgroup + 2) mkdir /sys/fs/cgroup/cpuset + 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset + 4) Create the new cgroup by doing mkdir's and write's (or echo's) in + the /sys/fs/cgroup/cpuset virtual file system. + 5) Start a task that will be the "founding father" of the new job. + 6) Attach that task to the new cgroup by writing its PID to the + /sys/fs/cgroup/cpuset tasks file for that cgroup. + 7) fork, exec or clone the job tasks from this founding father task. + +For example, the following sequence of commands will setup a cgroup +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, +and then start a subshell 'sh' in that cgroup: + + mount -t tmpfs cgroup_root /sys/fs/cgroup + mkdir /sys/fs/cgroup/cpuset + mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset + cd /sys/fs/cgroup/cpuset + mkdir Charlie + cd Charlie + /bin/echo 2-3 > cpuset.cpus + /bin/echo 1 > cpuset.mems + /bin/echo $$ > tasks + sh + # The subshell 'sh' is now running in cgroup Charlie + # The next line should display '/Charlie' + cat /proc/self/cgroup + +2. 
Usage Examples and Syntax +============================ + +2.1 Basic Usage +--------------- + +Creating, modifying, using cgroups can be done through the cgroup +virtual filesystem. + +To mount a cgroup hierarchy with all available subsystems, type: +# mount -t cgroup xxx /sys/fs/cgroup + +The "xxx" is not interpreted by the cgroup code, but will appear in +/proc/mounts so may be any useful identifying string that you like. + +Note: Some subsystems do not work without some user input first. For instance, +if cpusets are enabled the user will have to populate the cpus and mems files +for each new cgroup created before that group can be used. + +As explained in section `1.2 Why are cgroups needed?' you should create +different hierarchies of cgroups for each single resource or group of +resources you want to control. Therefore, you should mount a tmpfs on +/sys/fs/cgroup and create directories for each cgroup resource or resource +group. + +# mount -t tmpfs cgroup_root /sys/fs/cgroup +# mkdir /sys/fs/cgroup/rg1 + +To mount a cgroup hierarchy with just the cpuset and memory +subsystems, type: +# mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1 + +While remounting cgroups is currently supported, it is not recommended +to use it. Remounting allows changing bound subsystems and +release_agent. Rebinding is hardly useful as it only works when the +hierarchy is empty and release_agent itself should be replaced with +conventional fsnotify. The support for remounting will be removed in +the future. + +To specify a hierarchy's release_agent: +# mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \ + xxx /sys/fs/cgroup/rg1 + +Note that specifying 'release_agent' more than once will return failure. + +Note that changing the set of subsystems is currently only supported +when the hierarchy consists of a single (root) cgroup.
Supporting +the ability to arbitrarily bind/unbind subsystems from an existing +cgroup hierarchy is intended to be implemented in the future. + +Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the +tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1 +is the cgroup that holds the whole system. + +If you want to change the value of release_agent: +# echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent + +It can also be changed via remount. + +If you want to create a new cgroup under /sys/fs/cgroup/rg1: +# cd /sys/fs/cgroup/rg1 +# mkdir my_cgroup + +Now you want to do something with this cgroup. +# cd my_cgroup + +In this directory you can find several files: +# ls +cgroup.procs notify_on_release tasks +(plus whatever files added by the attached subsystems) + +Now attach your shell to this cgroup: +# /bin/echo $$ > tasks + +You can also create cgroups inside your cgroup by using mkdir in this +directory. +# mkdir my_sub_cs + +To remove a cgroup, just use rmdir: +# rmdir my_sub_cs + +This will fail if the cgroup is in use (has cgroups inside, or +has processes attached, or is held alive by other subsystem-specific +reference). + +2.2 Attaching processes +----------------------- + +# /bin/echo PID > tasks + +Note that it is PID, not PIDs. You can only attach ONE task at a time. +If you have several tasks to attach, you have to do it one after another: + +# /bin/echo PID1 > tasks +# /bin/echo PID2 > tasks + ... +# /bin/echo PIDn > tasks + +You can attach the current shell task by echoing 0: + +# echo 0 > tasks + +You can use the cgroup.procs file instead of the tasks file to move all +threads in a threadgroup at once. Echoing the PID of any task in a +threadgroup to cgroup.procs causes all tasks in that threadgroup to be +attached to the cgroup. Writing 0 to cgroup.procs moves all tasks +in the writing task's threadgroup. 
+ +Note: Since every task is always a member of exactly one cgroup in each +mounted hierarchy, to remove a task from its current cgroup you must +move it into a new cgroup (possibly the root cgroup) by writing to the +new cgroup's tasks file. + +Note: Due to some restrictions enforced by some cgroup subsystems, moving +a process to another cgroup can fail. + +2.3 Mounting hierarchies by name +-------------------------------- + +Passing the name= option when mounting a cgroups hierarchy +associates the given name with the hierarchy. This can be used when +mounting a pre-existing hierarchy, in order to refer to it by name +rather than by its set of active subsystems. Each hierarchy is either +nameless, or has a unique name. + +The name should match [\w.-]+ + +When passing a name= option for a new hierarchy, you need to +specify subsystems manually; the legacy behaviour of mounting all +subsystems when none are explicitly specified is not supported when +you give a subsystem a name. + +The name of the subsystem appears as part of the hierarchy description +in /proc/mounts and /proc/<pid>/cgroups. + + +3. Kernel API +============= + +3.1 Overview +------------ + +Each kernel subsystem that wants to hook into the generic cgroup +system needs to create a cgroup_subsys object. This contains +various methods, which are callbacks from the cgroup system, along +with a subsystem ID which will be assigned by the cgroup system. + +Other fields in the cgroup_subsys object include: + +- subsys_id: a unique array index for the subsystem, indicating which + entry in cgroup->subsys[] this subsystem should be managing. + +- name: should be initialized to a unique subsystem name. Should be + no longer than MAX_CGROUP_TYPE_NAMELEN. + +- early_init: indicate if the subsystem needs early initialization + at system boot.
+ +Each cgroup object created by the system has an array of pointers, +indexed by subsystem ID; this pointer is entirely managed by the +subsystem; the generic cgroup code will never touch this pointer. + +3.2 Synchronization +------------------- + +There is a global mutex, cgroup_mutex, used by the cgroup +system. This should be taken by anything that wants to modify a +cgroup. It may also be taken to prevent cgroups from being +modified, but more specific locks may be more appropriate in that +situation. + +See kernel/cgroup.c for more details. + +Subsystems can take/release the cgroup_mutex via the functions +cgroup_lock()/cgroup_unlock(). + +Accessing a task's cgroup pointer may be done in the following ways: +- while holding cgroup_mutex +- while holding the task's alloc_lock (via task_lock()) +- inside an rcu_read_lock() section via rcu_dereference() + +3.3 Subsystem API +----------------- + +Each subsystem should: + +- add an entry in linux/cgroup_subsys.h +- define a cgroup_subsys object called <name>_subsys + +If a subsystem can be compiled as a module, it should also have in its +module initcall a call to cgroup_load_subsys(), and in its exitcall a +call to cgroup_unload_subsys(). It should also set its <name>_subsys.module = +THIS_MODULE in its .c file. + +Each subsystem may export the following methods. The only mandatory +methods are css_alloc/free. Any others that are null are presumed to +be successful no-ops. + +struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp) +(cgroup_mutex held by caller) + +Called to allocate a subsystem state object for a cgroup. The +subsystem should allocate its subsystem state object for the passed +cgroup, returning a pointer to the new object on success or an +ERR_PTR() value. On success, the subsystem pointer should point to +a structure of type cgroup_subsys_state (typically embedded in a +larger subsystem-specific object), which will be initialized by the +cgroup system.
Note that this will be called at initialization to +create the root subsystem state for this subsystem; this case can be +identified by the passed cgroup object having a NULL parent (since +it's the root of the hierarchy) and may be an appropriate place for +initialization code. + +int css_online(struct cgroup *cgrp) +(cgroup_mutex held by caller) + +Called after @cgrp successfully completed all allocations and made +visible to cgroup_for_each_child/descendant_*() iterators. The +subsystem may choose to fail creation by returning -errno. This +callback can be used to implement reliable state sharing and +propagation along the hierarchy. See the comment on +cgroup_for_each_descendant_pre() for details. + +void css_offline(struct cgroup *cgrp); +(cgroup_mutex held by caller) + +This is the counterpart of css_online() and called iff css_online() +has succeeded on @cgrp. This signifies the beginning of the end of +@cgrp. @cgrp is being removed and the subsystem should start dropping +all references it's holding on @cgrp. When all references are dropped, +cgroup removal will proceed to the next step - css_free(). After this +callback, @cgrp should be considered dead to the subsystem. + +void css_free(struct cgroup *cgrp) +(cgroup_mutex held by caller) + +The cgroup system is about to free @cgrp; the subsystem should free +its subsystem state object. By the time this method is called, @cgrp +is completely unused; @cgrp->parent is still valid. (Note - can also +be called for a newly-created cgroup if an error occurs after this +subsystem's create() method has been called for the new cgroup). + +int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +(cgroup_mutex held by caller) + +Called prior to moving one or more tasks into a cgroup; if the +subsystem returns an error, this will abort the attach operation. +@tset contains the tasks to be attached and is guaranteed to have at +least one task in it. 
+ +If there are multiple tasks in the taskset, then: + - it's guaranteed that all are from the same thread group + - @tset contains all tasks from the thread group whether or not + they're switching cgroups + - the first task is the leader + +Each @tset entry also contains the task's old cgroup and tasks which +aren't switching cgroup can be skipped easily using the +cgroup_taskset_for_each() iterator. Note that this isn't called on a +fork. If this method returns 0 (success) then this should remain valid +while the caller holds cgroup_mutex and it is ensured that either +attach() or cancel_attach() will be called in future. + +void css_reset(struct cgroup_subsys_state *css) +(cgroup_mutex held by caller) + +An optional operation which should restore @css's configuration to the +initial state. This is currently only used on the unified hierarchy +when a subsystem is disabled on a cgroup through +"cgroup.subtree_control" but should remain enabled because other +subsystems depend on it. cgroup core makes such a css invisible by +removing the associated interface files and invokes this callback so +that the hidden subsystem can return to the initial neutral state. +This prevents unexpected resource control from a hidden css and +ensures that the configuration is in the initial state when it is made +visible again later. + +void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +(cgroup_mutex held by caller) + +Called when a task attach operation has failed after can_attach() has succeeded. +A subsystem whose can_attach() has some side-effects should provide this +function, so that the subsystem can implement a rollback. If not, not necessary. +This will be called only about subsystems whose can_attach() operation have +succeeded. The parameters are identical to can_attach(). 
+ +void attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +(cgroup_mutex held by caller) + +Called after the task has been attached to the cgroup, to allow any +post-attachment activity that requires memory allocations or blocking. +The parameters are identical to can_attach(). + +void fork(struct task_struct *task) + +Called when a task is forked into a cgroup. + +void exit(struct task_struct *task) + +Called during task exit. + +void free(struct task_struct *task) + +Called when the task_struct is freed. + +void bind(struct cgroup *root) +(cgroup_mutex held by caller) + +Called when a cgroup subsystem is rebound to a different hierarchy +and root cgroup. Currently this will only involve movement between +the default hierarchy (which never has sub-cgroups) and a hierarchy +that is being created/destroyed (and hence has no sub-cgroups). + +4. Extended attribute usage +=========================== + +cgroup filesystem supports certain types of extended attributes in its +directories and files. The current supported types are: + - Trusted (XATTR_TRUSTED) + - Security (XATTR_SECURITY) + +Both require CAP_SYS_ADMIN capability to set. + +Like in tmpfs, the extended attributes in cgroup filesystem are stored +using kernel memory and it's advised to keep the usage at minimum. This +is the reason why user defined extended attributes are not supported, since +any user can do it and there's no limit in the value size. + +The current known users for this feature are SELinux to limit cgroup usage +in containers and systemd for assorted meta data like main PID in a cgroup +(systemd creates a cgroup per service). + +5. Questions +============ + +Q: what's up with this '/bin/echo' ? +A: bash's builtin 'echo' command does not check calls to write() against + errors. If you use it in the cgroup file system, you won't be + able to tell whether a command succeeded or failed. + +Q: When I attach processes, only the first of the line gets really attached ! 
+A: We can only return one error code per call to write(). So you should also + put only ONE PID. + diff --git a/Documentation/cgroup-v1/cpuacct.txt b/Documentation/cgroup-v1/cpuacct.txt new file mode 100644 index 000000000..9d73cc0ca --- /dev/null +++ b/Documentation/cgroup-v1/cpuacct.txt @@ -0,0 +1,49 @@ +CPU Accounting Controller +------------------------- + +The CPU accounting controller is used to group tasks using cgroups and +account the CPU usage of these groups of tasks. + +The CPU accounting controller supports multi-hierarchy groups. An accounting +group accumulates the CPU usage of all of its child groups and the tasks +directly present in its group. + +Accounting groups can be created by first mounting the cgroup filesystem. + +# mount -t cgroup -ocpuacct none /sys/fs/cgroup + +With the above step, the initial or the parent accounting group becomes +visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in +the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. +/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained +by this group which is essentially the CPU time obtained by all the tasks +in the system. + +New accounting groups can be created under the parent group /sys/fs/cgroup. + +# cd /sys/fs/cgroup +# mkdir g1 +# echo $$ > g1/tasks + +The above steps create a new group g1 and move the current shell +process (bash) into it. CPU time consumed by this bash and its children +can be obtained from g1/cpuacct.usage and the same is accumulated in +/sys/fs/cgroup/cpuacct.usage also. + +cpuacct.stat file lists a few statistics which further divide the +CPU time obtained by the cgroup into user and system times. Currently +the following statistics are supported: + +user: Time spent by tasks of the cgroup in user mode. +system: Time spent by tasks of the cgroup in kernel mode. + +user and system are in USER_HZ unit. + +cpuacct controller uses percpu_counter interface to collect user and +system times. 
This has two side effects: + +- It is theoretically possible to see wrong values for user and system times. + This is because percpu_counter_read() on 32bit systems isn't safe + against concurrent writes. +- It is possible to see slightly outdated values for user and system times + due to the batch processing nature of percpu_counter. diff --git a/Documentation/cgroup-v1/cpusets.txt b/Documentation/cgroup-v1/cpusets.txt new file mode 100644 index 000000000..fdf7dff3f --- /dev/null +++ b/Documentation/cgroup-v1/cpusets.txt @@ -0,0 +1,839 @@ + CPUSETS + ------- + +Copyright (C) 2004 BULL SA. +Written by Simon.Derr@bull.net + +Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. +Modified by Paul Jackson +Modified by Christoph Lameter +Modified by Paul Menage +Modified by Hidetoshi Seto + +CONTENTS: +========= + +1. Cpusets + 1.1 What are cpusets ? + 1.2 Why are cpusets needed ? + 1.3 How are cpusets implemented ? + 1.4 What are exclusive cpusets ? + 1.5 What is memory_pressure ? + 1.6 What is memory spread ? + 1.7 What is sched_load_balance ? + 1.8 What is sched_relax_domain_level ? + 1.9 How do I use cpusets ? +2. Usage Examples and Syntax + 2.1 Basic Usage + 2.2 Adding/removing cpus + 2.3 Setting flags + 2.4 Attaching processes +3. Questions +4. Contact + +1. Cpusets +========== + +1.1 What are cpusets ? +---------------------- + +Cpusets provide a mechanism for assigning a set of CPUs and Memory +Nodes to a set of tasks. In this document "Memory Node" refers to +an on-line node that contains memory. + +Cpusets constrain the CPU and Memory placement of tasks to only +the resources within a task's current cpuset. They form a nested +hierarchy visible in a virtual file system. These are the essential +hooks, beyond what is already present, required to manage dynamic +job placement on large systems. + +Cpusets use the generic cgroup subsystem described in +Documentation/cgroups/cgroups.txt. 
+ +Requests by a task, using the sched_setaffinity(2) system call to +include CPUs in its CPU affinity mask, and using the mbind(2) and +set_mempolicy(2) system calls to include Memory Nodes in its memory +policy, are both filtered through that task's cpuset, filtering out any +CPUs or Memory Nodes not in that cpuset. The scheduler will not +schedule a task on a CPU that is not allowed in its cpus_allowed +vector, and the kernel page allocator will not allocate a page on a +node that is not allowed in the requesting task's mems_allowed vector. + +User level code may create and destroy cpusets by name in the cgroup +virtual file system, manage the attributes and permissions of these +cpusets and which CPUs and Memory Nodes are assigned to each cpuset, +specify and query to which cpuset a task is assigned, and list the +task pids assigned to a cpuset. + + +1.2 Why are cpusets needed ? +---------------------------- + +The management of large computer systems, with many processors (CPUs), +complex memory cache hierarchies and multiple Memory Nodes having +non-uniform access times (NUMA) presents additional challenges for +the efficient scheduling and memory placement of processes. + +Frequently more modest sized systems can be operated with adequate +efficiency just by letting the operating system automatically share +the available CPU and Memory resources amongst the requesting tasks. + +But larger systems, which benefit more from careful processor and +memory placement to reduce memory access times and contention, +and which typically represent a larger investment for the customer, +can benefit from explicitly placing jobs on properly sized subsets of +the system. + +This can be especially valuable on: + + * Web Servers running multiple instances of the same web application, + * Servers running different applications (for instance, a web server + and a database), or + * NUMA systems running large HPC applications with demanding + performance characteristics. 
+ +These subsets, or "soft partitions" must be able to be dynamically +adjusted, as the job mix changes, without impacting other concurrently +executing jobs. The location of the running jobs pages may also be moved +when the memory locations are changed. + +The kernel cpuset patch provides the minimum essential kernel +mechanisms required to efficiently implement such subsets. It +leverages existing CPU and Memory Placement facilities in the Linux +kernel to avoid any additional impact on the critical scheduler or +memory allocator code. + + +1.3 How are cpusets implemented ? +--------------------------------- + +Cpusets provide a Linux kernel mechanism to constrain which CPUs and +Memory Nodes are used by a process or set of processes. + +The Linux kernel already has a pair of mechanisms to specify on which +CPUs a task may be scheduled (sched_setaffinity) and on which Memory +Nodes it may obtain memory (mbind, set_mempolicy). + +Cpusets extends these two mechanisms as follows: + + - Cpusets are sets of allowed CPUs and Memory Nodes, known to the + kernel. + - Each task in the system is attached to a cpuset, via a pointer + in the task structure to a reference counted cgroup structure. + - Calls to sched_setaffinity are filtered to just those CPUs + allowed in that task's cpuset. + - Calls to mbind and set_mempolicy are filtered to just + those Memory Nodes allowed in that task's cpuset. + - The root cpuset contains all the systems CPUs and Memory + Nodes. + - For any cpuset, one can define child cpusets containing a subset + of the parents CPU and Memory Node resources. + - The hierarchy of cpusets can be mounted at /dev/cpuset, for + browsing and manipulation from user space. + - A cpuset may be marked exclusive, which ensures that no other + cpuset (except direct ancestors and descendants) may contain + any overlapping CPUs or Memory Nodes. + - You can list all the tasks (by pid) attached to any cpuset. 
+ +The implementation of cpusets requires a few, simple hooks +into the rest of the kernel, none in performance critical paths: + + - in init/main.c, to initialize the root cpuset at system boot. + - in fork and exit, to attach and detach a task from its cpuset. + - in sched_setaffinity, to mask the requested CPUs by what's + allowed in that task's cpuset. + - in sched.c migrate_live_tasks(), to keep migrating tasks within + the CPUs allowed by their cpuset, if possible. + - in the mbind and set_mempolicy system calls, to mask the requested + Memory Nodes by what's allowed in that task's cpuset. + - in page_alloc.c, to restrict memory to allowed nodes. + - in vmscan.c, to restrict page recovery to the current cpuset. + +You should mount the "cgroup" filesystem type in order to enable +browsing and modifying the cpusets presently known to the kernel. No +new system calls are added for cpusets - all support for querying and +modifying cpusets is via this cpuset file system. + +The /proc/<pid>/status file for each task has four added lines, +displaying the task's cpus_allowed (on which CPUs it may be scheduled) +and mems_allowed (on which Memory Nodes it may obtain memory), +in the two formats seen in the following example: + + Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff + Cpus_allowed_list: 0-127 + Mems_allowed: ffffffff,ffffffff + Mems_allowed_list: 0-63 + +Each cpuset is represented by a directory in the cgroup file system +containing (on top of the standard cgroup files) the following +files describing that cpuset: + + - cpuset.cpus: list of CPUs in that cpuset + - cpuset.mems: list of Memory Nodes in that cpuset + - cpuset.memory_migrate flag: if set, move pages to cpusets nodes + - cpuset.cpu_exclusive flag: is cpu placement exclusive? + - cpuset.mem_exclusive flag: is memory placement exclusive?
+ - cpuset.mem_hardwall flag: is memory allocation hardwalled + - cpuset.memory_pressure: measure of how much paging pressure in cpuset + - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes + - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes + - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset + - cpuset.sched_relax_domain_level: the searching range when migrating tasks + +In addition, only the root cpuset has the following file: + - cpuset.memory_pressure_enabled flag: compute memory_pressure? + +New cpusets are created using the mkdir system call or shell +command. The properties of a cpuset, such as its flags, allowed +CPUs and Memory Nodes, and attached tasks, are modified by writing +to the appropriate file in that cpusets directory, as listed above. + +The named hierarchical structure of nested cpusets allows partitioning +a large system into nested, dynamically changeable, "soft-partitions". + +The attachment of each task, automatically inherited at fork by any +children of that task, to a cpuset allows organizing the work load +on a system into related sets of tasks such that each set is constrained +to using the CPUs and Memory Nodes of a particular cpuset. A task +may be re-attached to any other cpuset, if allowed by the permissions +on the necessary cpuset file system directories. + +Such management of a system "in the large" integrates smoothly with +the detailed placement done on individual tasks and memory regions +using the sched_setaffinity, mbind and set_mempolicy system calls. + +The following rules apply to each cpuset: + + - Its CPUs and Memory Nodes must be a subset of its parents. + - It can't be marked exclusive unless its parent is. + - If its cpu or memory is exclusive, they may not overlap any sibling. 
+ +These rules, and the natural hierarchy of cpusets, enable efficient +enforcement of the exclusive guarantee, without having to scan all +cpusets every time any of them change to ensure nothing overlaps an +exclusive cpuset. Also, the use of a Linux virtual file system (vfs) +to represent the cpuset hierarchy provides for a familiar permission +and name space for cpusets, with a minimum of additional kernel code. + +The cpus and mems files in the root (top_cpuset) cpuset are +read-only. The cpus file automatically tracks the value of +cpu_online_mask using a CPU hotplug notifier, and the mems file +automatically tracks the value of node_states[N_MEMORY]--i.e., +nodes with memory--using the cpuset_track_online_nodes() hook. + + +1.4 What are exclusive cpusets ? +-------------------------------- + +If a cpuset is cpu or mem exclusive, no other cpuset, other than +a direct ancestor or descendant, may share any of the same CPUs or +Memory Nodes. + +A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled", +i.e. it restricts kernel allocations for page, buffer and other data +commonly shared by the kernel across multiple users. All cpusets, +whether hardwalled or not, restrict allocations of memory for user +space. This enables configuring a system so that several independent +jobs can share common kernel data, such as file system pages, while +isolating each job's user allocation in its own cpuset. To do this, +construct a large mem_exclusive cpuset to hold all the jobs, and +construct child, non-mem_exclusive cpusets for each individual job. +Only a small amount of typical kernel memory, such as requests from +interrupt handlers, is allowed to be taken outside even a +mem_exclusive cpuset. + + +1.5 What is memory_pressure ?
+----------------------------- +The memory_pressure of a cpuset provides a simple per-cpuset metric +of the rate that the tasks in a cpuset are attempting to free up in +use memory on the nodes of the cpuset to satisfy additional memory +requests. + +This enables batch managers monitoring jobs running in dedicated +cpusets to efficiently detect what level of memory pressure that job +is causing. + +This is useful both on tightly managed systems running a wide mix of +submitted jobs, which may choose to terminate or re-prioritize jobs that +are trying to use more memory than allowed on the nodes assigned to them, +and with tightly coupled, long running, massively parallel scientific +computing jobs that will dramatically fail to meet required performance +goals if they start to use more memory than allowed to them. + +This mechanism provides a very economical way for the batch manager +to monitor a cpuset for signs of memory pressure. It's up to the +batch manager or other user code to decide what to do about it and +take action. + +==> Unless this feature is enabled by writing "1" to the special file + /dev/cpuset/memory_pressure_enabled, the hook in the rebalance + code of __alloc_pages() for this metric reduces to simply noticing + that the cpuset_memory_pressure_enabled flag is zero. So only + systems that enable this feature will compute the metric. + +Why a per-cpuset, running average: + + Because this meter is per-cpuset, rather than per-task or mm, + the system load imposed by a batch scheduler monitoring this + metric is sharply reduced on large systems, because a scan of + the tasklist can be avoided on each set of queries. + + Because this meter is a running average, instead of an accumulating + counter, a batch scheduler can detect memory pressure with a + single read, instead of having to read and accumulate results + for a period of time. 
+ + Because this meter is per-cpuset rather than per-task or mm, + the batch scheduler can obtain the key information, memory + pressure in a cpuset, with a single read, rather than having to + query and accumulate results over all the (dynamically changing) + set of tasks in the cpuset. + +A per-cpuset simple digital filter (requires a spinlock and 3 words +of data per-cpuset) is kept, and updated by any task attached to that +cpuset, if it enters the synchronous (direct) page reclaim code. + +A per-cpuset file provides an integer number representing the recent +(half-life of 10 seconds) rate of direct page reclaims caused by +the tasks in the cpuset, in units of reclaims attempted per second, +times 1000. + + +1.6 What is memory spread ? +--------------------------- +There are two boolean flag files per cpuset that control where the +kernel allocates pages for the file system buffers and related in +kernel data structures. They are called 'cpuset.memory_spread_page' and +'cpuset.memory_spread_slab'. + +If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then +the kernel will spread the file system buffers (page cache) evenly +over all the nodes that the faulting task is allowed to use, instead +of preferring to put those pages on the node where the task is running. + +If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set, +then the kernel will spread some file system related slab caches, +such as for inodes and dentries evenly over all the nodes that the +faulting task is allowed to use, instead of preferring to put those +pages on the node where the task is running. + +The setting of these flags does not affect anonymous data segment or +stack segment pages of a task. 
+ +By default, both kinds of memory spreading are off, and memory +pages are allocated on the node local to where the task is running, +except perhaps as modified by the task's NUMA mempolicy or cpuset +configuration, so long as sufficient free memory pages are available. + +When new cpusets are created, they inherit the memory spread settings +of their parent. + +Setting memory spreading causes allocations for the affected page +or slab caches to ignore the task's NUMA mempolicy and be spread +instead. Tasks using mbind() or set_mempolicy() calls to set NUMA +mempolicies will not notice any change in these calls as a result of +their containing task's memory spread settings. If memory spreading +is turned off, then the currently specified NUMA mempolicy once again +applies to memory page allocations. + +Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag +files. By default they contain "0", meaning that the feature is off +for that cpuset. If a "1" is written to that file, then that turns +the named feature on. + +The implementation is simple. + +Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag +PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently +joins that cpuset. The page allocation calls for the page cache +is modified to perform an inline check for this PFA_SPREAD_PAGE task +flag, and if set, a call to a new routine cpuset_mem_spread_node() +returns the node to prefer for the allocation. + +Similarly, setting 'cpuset.memory_spread_slab' turns on the flag +PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate +pages from the node returned by cpuset_mem_spread_node(). + +The cpuset_mem_spread_node() routine is also simple. It uses the +value of a per-task rotor cpuset_mem_spread_rotor to select the next +node in the current task's mems_allowed to prefer for the allocation. + +This memory placement policy is also known (in other contexts) as +round-robin or interleave. 
+ +This policy can provide substantial improvements for jobs that need +to place thread local data on the corresponding node, but that need +to access large file system data sets that need to be spread across +the several nodes in the jobs cpuset in order to fit. Without this +policy, especially for jobs that might have one thread reading in the +data set, the memory allocation across the nodes in the jobs cpuset +can become very uneven. + +1.7 What is sched_load_balance ? +-------------------------------- + +The kernel scheduler (kernel/sched/core.c) automatically load balances +tasks. If one CPU is underutilized, kernel code running on that +CPU will look for tasks on other more overloaded CPUs and move those +tasks to itself, within the constraints of such placement mechanisms +as cpusets and sched_setaffinity. + +The algorithmic cost of load balancing and its impact on key shared +kernel data structures such as the task list increases more than +linearly with the number of CPUs being balanced. So the scheduler +has support to partition the systems CPUs into a number of sched +domains such that it only load balances within each sched domain. +Each sched domain covers some subset of the CPUs in the system; +no two sched domains overlap; some CPUs might not be in any sched +domain and hence won't be load balanced. + +Put simply, it costs less to balance between two smaller sched domains +than one big one, but doing so means that overloads in one of the +two domains won't be load balanced to the other one. + +By default, there is one sched domain covering all CPUs, including those +marked isolated using the kernel boot time "isolcpus=" argument. However, +the isolated CPUs will not participate in load balancing, and will not +have tasks running on them unless explicitly assigned. + +This default load balancing across all CPUs is not well suited for +the following two situations: + 1) On large systems, load balancing across many CPUs is expensive. 
+ If the system is managed using cpusets to place independent jobs + on separate sets of CPUs, full load balancing is unnecessary. + 2) Systems supporting realtime on some CPUs need to minimize + system overhead on those CPUs, including avoiding task load + balancing if that is not needed. + +When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default +setting), it requests that all the CPUs in that cpuset's allowed 'cpuset.cpus' +be contained in a single sched domain, ensuring that load balancing +can move a task (not otherwise pinned, as by sched_setaffinity) +from any CPU in that cpuset to any other. + +When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the +scheduler will avoid load balancing across the CPUs in that cpuset, +--except-- in so far as is necessary because some overlapping cpuset +has "sched_load_balance" enabled. + +So, for example, if the top cpuset has the flag "cpuset.sched_load_balance" +enabled, then the scheduler will have one sched domain covering all +CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other +cpusets won't matter, as we're already fully load balancing. + +Therefore in the above two situations, the top cpuset flag +"cpuset.sched_load_balance" should be disabled, and only some of the smaller, +child cpusets have this flag enabled. + +When doing this, you don't usually want to leave any unpinned tasks in +the top cpuset that might use non-trivial amounts of CPU, as such tasks +may be artificially constrained to some subset of CPUs, depending on +the particulars of this flag setting in descendant cpusets. Even if +such a task could use spare CPU cycles in some other CPUs, the kernel +scheduler might not consider the possibility of load balancing that +task to that underused CPU. + +Of course, tasks pinned to a particular CPU can be left in a cpuset +that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere +else anyway.
+ +There is an impedance mismatch here, between cpusets and sched domains. +Cpusets are hierarchical and nest. Sched domains are flat; they don't +overlap and each CPU is in at most one sched domain. + +It is necessary for sched domains to be flat because load balancing +across partially overlapping sets of CPUs would risk unstable dynamics +that would be beyond our understanding. So if each of two partially +overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we +form a single sched domain that is a superset of both. We won't move +a task to a CPU outside its cpuset, but the scheduler load balancing +code might waste some compute cycles considering that possibility. + +This mismatch is why there is not a simple one-to-one relation +between which cpusets have the flag "cpuset.sched_load_balance" enabled, +and the sched domain configuration. If a cpuset enables the flag, it +will get balancing across all its CPUs, but if it disables the flag, +it will only be assured of no load balancing if no other overlapping +cpuset enables the flag. + +If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only +one of them has this flag enabled, then the other may find its +tasks only partially load balanced, just on the overlapping CPUs. +This is just the general case of the top_cpuset example given a few +paragraphs above. In the general case, as in the top cpuset case, +don't leave tasks that might use non-trivial amounts of CPU in +such partially load balanced cpusets, as they may be artificially +constrained to some subset of the CPUs allowed to them, for lack of +load balancing to the other CPUs. + +CPUs in "cpuset.isolcpus" were excluded from load balancing by the +isolcpus= kernel boot option, and will never be load balanced regardless +of the value of "cpuset.sched_load_balance" in any cpuset. + +1.7.1 sched_load_balance implementation details. 
+------------------------------------------------ + +The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary +to most cpuset flags.) When enabled for a cpuset, the kernel will +ensure that it can load balance across all the CPUs in that cpuset +(makes sure that all the CPUs in the cpus_allowed of that cpuset are +in the same sched domain.) + +If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled, +then they will be (must be) both in the same sched domain. + +If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled, +then by the above that means there is a single sched domain covering +the whole system, regardless of any other cpuset settings. + +The kernel commits to user space that it will avoid load balancing +where it can. It will pick as fine a granularity partition of sched +domains as it can while still providing load balancing for any set +of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled. + +The internal kernel cpuset to scheduler interface passes from the +cpuset code to the scheduler code a partition of the load balanced +CPUs in the system. This partition is a set of subsets (represented +as an array of struct cpumask) of CPUs, pairwise disjoint, that cover +all the CPUs that must be load balanced. + +The cpuset code builds a new such partition and passes it to the +scheduler sched domain setup code, to have the sched domains rebuilt +as necessary, whenever: + - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes, + - or CPUs come or go from a cpuset with this flag enabled, + - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs + and with this flag enabled changes, + - or a cpuset with non-empty CPUs and with this flag enabled is removed, + - or a cpu is offlined/onlined. + +This partition exactly defines what sched domains the scheduler should +setup - one sched domain for each element (struct cpumask) in the +partition. 
+ +The scheduler remembers the currently active sched domain partitions. +When the scheduler routine partition_sched_domains() is invoked from +the cpuset code to update these sched domains, it compares the new +partition requested with the current, and updates its sched domains, +removing the old and adding the new, for each change. + + +1.8 What is sched_relax_domain_level ? +-------------------------------------- + +In a sched domain, the scheduler migrates tasks in 2 ways: periodic load +balance on tick, and at the time of some schedule events. + +When a task is woken up, the scheduler tries to move the task to an idle CPU. +For example, if a task A running on CPU X activates another task B +on the same CPU X, and if CPU Y is X's sibling and is idle, +then the scheduler migrates task B to CPU Y so that task B can start on +CPU Y without waiting for task A on CPU X. + +And if a CPU runs out of tasks in its runqueue, the CPU tries to pull +extra tasks from other busy CPUs to help them before it is going to +be idle. + +Of course, because it takes some searching cost to find movable tasks and/or +idle CPUs, the scheduler might not search all CPUs in the domain +every time. In fact, in some architectures, the searching ranges on +events are limited to the same socket or node where the CPU is located, +while the load balance on tick searches all. + +For example, assume CPU Z is relatively far from CPU X. Even if CPU Z +is idle while CPU X and the siblings are busy, the scheduler can't migrate +woken task B from X to Z since it is out of its searching range. +As a result, task B on CPU X needs to wait for task A or wait for load balance +on the next tick. For some applications in special situations, waiting +1 tick may be too long. + +The 'cpuset.sched_relax_domain_level' file allows you to request changing +this searching range as you like. This file takes an int value which +indicates the size of the searching range in levels, ideally as follows; +otherwise the initial value of -1 indicates that the cpuset has no request.
+ + -1 : no request. use system default or follow request of others. + 0 : no search. + 1 : search siblings (hyperthreads in a core). + 2 : search cores in a package. + 3 : search cpus in a node [= system wide on non-NUMA system] + 4 : search nodes in a chunk of node [on NUMA system] + 5 : search system wide [on NUMA system] + +The system default is architecture dependent. The system default +can be changed using the relax_domain_level= boot parameter. + +This file is per-cpuset and affects the sched domain to which the cpuset +belongs. Therefore if the flag 'cpuset.sched_load_balance' of a cpuset +is disabled, then 'cpuset.sched_relax_domain_level' has no effect since +there is no sched domain belonging to the cpuset. + +If multiple cpusets are overlapping and hence they form a single sched +domain, the largest value among those is used. Be careful, if one +requests 0 and others are -1 then 0 is used. + +Note that modifying this file will have both good and bad effects, +and whether it is acceptable or not depends on your situation. +Don't modify this file if you are not sure. + +If your situation is: + - The migration costs between each cpu can be assumed considerably + small (for you) due to your special application's behavior or + special hardware support for CPU cache etc. + - The searching cost doesn't have an impact (for you) or you can make + the searching cost small enough by managing cpuset to compact etc. + - The latency is required even if it sacrifices cache hit rate etc. +then increasing 'sched_relax_domain_level' would benefit you. + + +1.9 How do I use cpusets ? +-------------------------- + +In order to minimize the impact of cpusets on critical kernel +code, such as the scheduler, and due to the fact that the kernel +does not support one task updating the memory placement of another +task directly, the impact on a task of changing its cpuset CPU +or Memory Node placement, or of changing to which cpuset a task +is attached, is subtle.
+ +If a cpuset has its Memory Nodes modified, then for each task attached +to that cpuset, the next time that the kernel attempts to allocate +a page of memory for that task, the kernel will notice the change +in the task's cpuset, and update its per-task memory placement to +remain within the new cpusets memory placement. If the task was using +mempolicy MPOL_BIND, and the nodes to which it was bound overlap with +its new cpuset, then the task will continue to use whatever subset +of MPOL_BIND nodes are still allowed in the new cpuset. If the task +was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed +in the new cpuset, then the task will be essentially treated as if it +was MPOL_BIND bound to the new cpuset (even though its NUMA placement, +as queried by get_mempolicy(), doesn't change). If a task is moved +from one cpuset to another, then the kernel will adjust the task's +memory placement, as above, the next time that the kernel attempts +to allocate a page of memory for that task. + +If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset +will have its allowed CPU placement changed immediately. Similarly, +if a task's pid is written to another cpusets 'cpuset.tasks' file, then its +allowed CPU placement is changed immediately. If such a task had been +bound to some subset of its cpuset using the sched_setaffinity() call, +the task will be allowed to run on any CPU allowed in its new cpuset, +negating the effect of the prior sched_setaffinity() call. + +In summary, the memory placement of a task whose cpuset is changed is +updated by the kernel, on the next allocation of a page for that task, +and the processor placement is updated immediately. + +Normally, once a page is allocated (given a physical page +of main memory) then that page stays on whatever node it +was allocated, so long as it remains allocated, even if the +cpusets memory placement policy 'cpuset.mems' subsequently changes. 
+If the cpuset flag file 'cpuset.memory_migrate' is set true, then when +tasks are attached to that cpuset, any pages that task had +allocated to it on nodes in its previous cpuset are migrated +to the task's new cpuset. The relative placement of the page within +the cpuset is preserved during these migration operations if possible. +For example if the page was on the second valid node of the prior cpuset +then the page will be placed on the second valid node of the new cpuset. + +Also if 'cpuset.memory_migrate' is set true, then if that cpuset's +'cpuset.mems' file is modified, pages allocated to tasks in that +cpuset, that were on nodes in the previous setting of 'cpuset.mems', +will be moved to nodes in the new setting of 'mems.' +Pages that were not in the task's prior cpuset, or in the cpuset's +prior 'cpuset.mems' setting, will not be moved. + +There is an exception to the above. If hotplug functionality is used +to remove all the CPUs that are currently assigned to a cpuset, +then all the tasks in that cpuset will be moved to the nearest ancestor +with non-empty cpus. But the moving of some (or all) tasks might fail if +cpuset is bound with another cgroup subsystem which has some restrictions +on task attaching. In this failing case, those tasks will stay +in the original cpuset, and the kernel will automatically update +their cpus_allowed to allow all online CPUs. When memory hotplug +functionality for removing Memory Nodes is available, a similar exception +is expected to apply there as well. In general, the kernel prefers to +violate cpuset placement, over starving a task that has had all +its allowed CPUs or Memory Nodes taken offline. + +There is a second exception to the above. GFP_ATOMIC requests are +kernel internal allocations that must be satisfied, immediately. +The kernel may drop some request, in rare cases even panic, if a +GFP_ATOMIC alloc fails. 
If the request cannot be satisfied within +the current task's cpuset, then we relax the cpuset, and look for +memory anywhere we can find it. It's better to violate the cpuset +than stress the kernel. + +To start a new job that is to be contained within a cpuset, the steps are: + + 1) mkdir /sys/fs/cgroup/cpuset + 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset + 3) Create the new cpuset by doing mkdir's and write's (or echo's) in + the /sys/fs/cgroup/cpuset virtual file system. + 4) Start a task that will be the "founding father" of the new job. + 5) Attach that task to the new cpuset by writing its pid to the + /sys/fs/cgroup/cpuset tasks file for that cpuset. + 6) fork, exec or clone the job tasks from this founding father task. + +For example, the following sequence of commands will setup a cpuset +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, +and then start a subshell 'sh' in that cpuset: + + mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset + cd /sys/fs/cgroup/cpuset + mkdir Charlie + cd Charlie + /bin/echo 2-3 > cpuset.cpus + /bin/echo 1 > cpuset.mems + /bin/echo $$ > tasks + sh + # The subshell 'sh' is now running in cpuset Charlie + # The next line should display '/Charlie' + cat /proc/self/cpuset + +There are ways to query or modify cpusets: + - via the cpuset file system directly, using the various cd, mkdir, echo, + cat, rmdir commands from the shell, or their equivalent from C. + - via the C library libcpuset. + - via the C library libcgroup. + (http://sourceforge.net/projects/libcg/) + - via the python application cset. + (http://code.google.com/p/cpuset/) + +The sched_setaffinity calls can also be done at the shell prompt using +SGI's runon or Robert Love's taskset. The mbind and set_mempolicy +calls can be done at the shell prompt using the numactl command +(part of Andi Kleen's numa package). + +2. 
Usage Examples and Syntax +============================ + +2.1 Basic Usage +--------------- + +Creating, modifying, using the cpusets can be done through the cpuset +virtual filesystem. + +To mount it, type: +# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset + +Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the +tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset +is the cpuset that holds the whole system. + +If you want to create a new cpuset under /sys/fs/cgroup/cpuset: +# cd /sys/fs/cgroup/cpuset +# mkdir my_cpuset + +Now you want to do something with this cpuset. +# cd my_cpuset + +In this directory you can find several files: +# ls +cgroup.clone_children cpuset.memory_pressure +cgroup.event_control cpuset.memory_spread_page +cgroup.procs cpuset.memory_spread_slab +cpuset.cpu_exclusive cpuset.mems +cpuset.cpus cpuset.sched_load_balance +cpuset.mem_exclusive cpuset.sched_relax_domain_level +cpuset.mem_hardwall notify_on_release +cpuset.memory_migrate tasks + +Reading them will give you information about the state of this cpuset: +the CPUs and Memory Nodes it can use, the processes that are using +it, its properties. By writing to these files you can manipulate +the cpuset. + +Set some flags: +# /bin/echo 1 > cpuset.cpu_exclusive + +Add some cpus: +# /bin/echo 0-7 > cpuset.cpus + +Add some mems: +# /bin/echo 0-7 > cpuset.mems + +Now attach your shell to this cpuset: +# /bin/echo $$ > tasks + +You can also create cpusets inside your cpuset by using mkdir in this +directory. +# mkdir my_sub_cs + +To remove a cpuset, just use rmdir: +# rmdir my_sub_cs +This will fail if the cpuset is in use (has cpusets inside, or has +processes attached). + +Note that for legacy reasons, the "cpuset" filesystem exists as a +wrapper around the cgroup filesystem. 
+ +The command + +mount -t cpuset X /sys/fs/cgroup/cpuset + +is equivalent to + +mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset +echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent + +2.2 Adding/removing cpus +------------------------ + +This is the syntax to use when writing in the cpus or mems files +in cpuset directories: + +# /bin/echo 1-4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 +# /bin/echo 1,2,3,4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 + +To add a CPU to a cpuset, write the new list of CPUs including the +CPU to be added. To add 6 to the above cpuset: + +# /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6 + +Similarly to remove a CPU from a cpuset, write the new list of CPUs +without the CPU to be removed. + +To remove all the CPUs: + +# /bin/echo "" > cpuset.cpus -> clear cpus list + +2.3 Setting flags +----------------- + +The syntax is very simple: + +# /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive' +# /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive' + +2.4 Attaching processes +----------------------- + +# /bin/echo PID > tasks + +Note that it is PID, not PIDs. You can only attach ONE task at a time. +If you have several tasks to attach, you have to do it one after another: + +# /bin/echo PID1 > tasks +# /bin/echo PID2 > tasks + ... +# /bin/echo PIDn > tasks + + +3. Questions +============ + +Q: what's up with this '/bin/echo' ? +A: bash's builtin 'echo' command does not check calls to write() against + errors. If you use it in the cpuset file system, you won't be + able to tell whether a command succeeded or failed. + +Q: When I attach processes, only the first of the line gets really attached ! +A: We can only return one error code per call to write(). So you should also + put only ONE pid. + +4. 
Contact +========== + +Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/cgroup-v1/devices.txt b/Documentation/cgroup-v1/devices.txt new file mode 100644 index 000000000..3c1095ca0 --- /dev/null +++ b/Documentation/cgroup-v1/devices.txt @@ -0,0 +1,116 @@ +Device Whitelist Controller + +1. Description: + +Implement a cgroup to track and enforce open and mknod restrictions +on device files. A device cgroup associates a device access +whitelist with each cgroup. A whitelist entry has 4 fields. +'type' is a (all), c (char), or b (block). 'all' means it applies +to all types and all major and minor numbers. Major and minor are +either an integer or * for all. Access is a composition of r +(read), w (write), and m (mknod). + +The root device cgroup starts with rwm to 'all'. A child device +cgroup gets a copy of the parent. Administrators can then remove +devices from the whitelist or add new entries. A child cgroup can +never receive a device access which is denied by its parent. + +2. User Interface + +An entry is added using devices.allow, and removed using +devices.deny. For instance + + echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow + +allows cgroup 1 to read and mknod the device usually known as +/dev/null. Doing + + echo a > /sys/fs/cgroup/1/devices.deny + +will remove the default 'a *:* rwm' entry. Doing + + echo a > /sys/fs/cgroup/1/devices.allow + +will add the 'a *:* rwm' entry to the whitelist. + +3. Security + +Any task can move itself between cgroups. This clearly won't +suffice, but we can decide the best way to adequately restrict +movement as people get some experience with this. We may just want +to require CAP_SYS_ADMIN, which at least is a separate bit from +CAP_MKNOD. We may want to just refuse moving to a cgroup which +isn't a descendant of the current one. Or we may want to use +CAP_MAC_ADMIN, since we really are trying to lock down root. + +CAP_SYS_ADMIN is needed to modify the whitelist or move another +task to a new cgroup. 
(Again we'll probably want to change that). + +A cgroup may not be granted more permissions than the cgroup's +parent has. + +4. Hierarchy + +device cgroups maintain hierarchy by making sure a cgroup never has more +access permissions than its parent. Every time an entry is written to +a cgroup's devices.deny file, all its children will have that entry removed +from their whitelist and all the locally set whitelist entries will be +re-evaluated. In case one of the locally set whitelist entries would provide +more access than the cgroup's parent, it'll be removed from the whitelist. + +Example: + A + / \ + B + + group behavior exceptions + A allow "b 8:* rwm", "c 116:1 rw" + B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm" + +If a device is denied in group A: + # echo "c 116:* r" > A/devices.deny +it'll propagate down and after revalidating B's entries, the whitelist entry +"c 116:2 rwm" will be removed: + + group whitelist entries denied devices + A all "b 8:* rwm", "c 116:* rw" + B "c 1:3 rwm", "b 3:* rwm" all the rest + +In case parent's exceptions change and local exceptions are not allowed +anymore, they'll be deleted. + +Notice that new whitelist entries will not be propagated: + A + / \ + B + + group whitelist entries denied devices + A "c 1:3 rwm", "c 1:5 r" all the rest + B "c 1:3 rwm", "c 1:5 r" all the rest + +when adding "c *:3 rwm": + # echo "c *:3 rwm" >A/devices.allow + +the result: + group whitelist entries denied devices + A "c *:3 rwm", "c 1:5 r" all the rest + B "c 1:3 rwm", "c 1:5 r" all the rest + +but now it'll be possible to add new entries to B: + # echo "c 2:3 rwm" >B/devices.allow + # echo "c 50:3 r" >B/devices.allow +or even + # echo "c *:3 rwm" >B/devices.allow + +Allowing or denying all by writing 'a' to devices.allow or devices.deny will +not be possible once the device cgroups has children. + +4.1 Hierarchy (internal implementation) + +device cgroups is implemented internally using a behavior (ALLOW, DENY) and a +list of exceptions. 
The internal state is controlled using the same user +interface to preserve compatibility with the previous whitelist-only +implementation. Removal or addition of exceptions that will reduce the access +to devices will be propagated down the hierarchy. +For every propagated exception, the effective rules will be re-evaluated based +on current parent's access rules. diff --git a/Documentation/cgroup-v1/freezer-subsystem.txt b/Documentation/cgroup-v1/freezer-subsystem.txt new file mode 100644 index 000000000..e831cb2b8 --- /dev/null +++ b/Documentation/cgroup-v1/freezer-subsystem.txt @@ -0,0 +1,123 @@ +The cgroup freezer is useful to batch job management systems which start +and stop sets of tasks in order to schedule the resources of a machine +according to the desires of a system administrator. This sort of program +is often used on HPC clusters to schedule access to the cluster as a +whole. The cgroup freezer uses cgroups to describe the set of tasks to +be started/stopped by the batch job management system. It also provides +a means to start and stop the tasks composing the job. + +The cgroup freezer will also be useful for checkpointing running groups +of tasks. The freezer allows the checkpoint code to obtain a consistent +image of the tasks by attempting to force the tasks in a cgroup into a +quiescent state. Once the tasks are quiescent another task can +walk /proc or invoke a kernel interface to gather information about the +quiesced tasks. Checkpointed tasks can be restarted later should a +recoverable error occur. This also allows the checkpointed tasks to be +migrated between nodes in a cluster by copying the gathered information +to another node and restarting the tasks there. + +Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping +and resuming tasks in userspace. Both of these signals are observable +from within the tasks we wish to freeze.
While SIGSTOP cannot be caught, +blocked, or ignored, it can be seen by waiting or ptracing parent tasks. +SIGCONT is especially unsuitable since it can be caught by the task. Any +programs designed to watch for SIGSTOP and SIGCONT could be broken by +attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can +demonstrate this problem using nested bash shells: + + $ echo $$ + 16644 + $ bash + $ echo $$ + 16690 + + From a second, unrelated bash shell: + $ kill -SIGSTOP 16690 + $ kill -SIGCONT 16690 + + <at this point 16690 exits and causes 16644 to exit too> + +This happens because bash can observe both signals and choose how it +responds to them. + +Another example of a program which catches and responds to these +signals is gdb. In fact any program designed to use ptrace is likely to +have a problem with this method of stopping and resuming tasks. + +In contrast, the cgroup freezer uses the kernel freezer code to +prevent the freeze/unfreeze cycle from becoming visible to the tasks +being frozen. This allows the bash example above and gdb to run as +expected. + +The cgroup freezer is hierarchical. Freezing a cgroup freezes all +tasks belonging to the cgroup and all its descendant cgroups. Each +cgroup has its own state (self-state) and the state inherited from the +parent (parent-state). Iff both states are THAWED, the cgroup is +THAWED. + +The following cgroupfs files are created by cgroup freezer. + +* freezer.state: Read-write. + + When read, returns the effective state of the cgroup - "THAWED", + "FREEZING" or "FROZEN". This is the combined self and parent-states. + If any is freezing, the cgroup is freezing (FREEZING or FROZEN). + + FREEZING cgroup transitions into FROZEN state when all tasks + belonging to the cgroup and its descendants become frozen. Note that + a cgroup reverts to FREEZING from FROZEN after a new task is added + to the cgroup or one of its descendant cgroups until the new task is + frozen. + + When written, sets the self-state of the cgroup.
Two values are + allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup, + if not already freezing, enters FREEZING state along with all its + descendant cgroups. + + If THAWED is written, the self-state of the cgroup is changed to + THAWED. Note that the effective state may not change to THAWED if + the parent-state is still freezing. If a cgroup's effective state + becomes THAWED, all its descendants which are freezing because of + the cgroup also leave the freezing state. + +* freezer.self_freezing: Read only. + + Shows the self-state. 0 if the self-state is THAWED; otherwise, 1. + This value is 1 iff the last write to freezer.state was "FROZEN". + +* freezer.parent_freezing: Read only. + + Shows the parent-state. 0 if none of the cgroup's ancestors is + frozen; otherwise, 1. + +The root cgroup is non-freezable and the above interface files don't +exist. + +* Examples of usage : + + # mkdir /sys/fs/cgroup/freezer + # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer + # mkdir /sys/fs/cgroup/freezer/0 + # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks + +to get status of the freezer subsystem : + + # cat /sys/fs/cgroup/freezer/0/freezer.state + THAWED + +to freeze all tasks in the container : + + # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state + # cat /sys/fs/cgroup/freezer/0/freezer.state + FREEZING + # cat /sys/fs/cgroup/freezer/0/freezer.state + FROZEN + +to unfreeze all tasks in the container : + + # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state + # cat /sys/fs/cgroup/freezer/0/freezer.state + THAWED + +This is the basic mechanism which should do the right thing for user space task +in a simple scenario. 
diff --git a/Documentation/cgroup-v1/hugetlb.txt b/Documentation/cgroup-v1/hugetlb.txt new file mode 100644 index 000000000..106245c3a --- /dev/null +++ b/Documentation/cgroup-v1/hugetlb.txt @@ -0,0 +1,45 @@ +HugeTLB Controller +------------------- + +The HugeTLB controller allows limiting the HugeTLB usage per control group and +enforces the controller limit during page fault. Since HugeTLB doesn't +support page reclaim, enforcing the limit at page fault time implies that +the application will get a SIGBUS signal if it tries to access HugeTLB pages +beyond its limit. This requires the application to know beforehand how many +HugeTLB pages it would require for its use. + +HugeTLB controller can be created by first mounting the cgroup filesystem. + +# mount -t cgroup -o hugetlb none /sys/fs/cgroup + +With the above step, the initial or the parent HugeTLB group becomes +visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in +the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. + +New groups can be created under the parent group /sys/fs/cgroup. + +# cd /sys/fs/cgroup +# mkdir g1 +# echo $$ > g1/tasks + +The above steps create a new group g1 and move the current shell +process (bash) into it.
+ +Brief summary of control files + + hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage + hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded + hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb + hugetlb.<hugepagesize>.failcnt # show the number of allocation failures due to HugeTLB limit + +For a system supporting two hugepage sizes (16M and 16G) the control +files include: + +hugetlb.16GB.limit_in_bytes +hugetlb.16GB.max_usage_in_bytes +hugetlb.16GB.usage_in_bytes +hugetlb.16GB.failcnt +hugetlb.16MB.limit_in_bytes +hugetlb.16MB.max_usage_in_bytes +hugetlb.16MB.usage_in_bytes +hugetlb.16MB.failcnt diff --git a/Documentation/cgroup-v1/memcg_test.txt b/Documentation/cgroup-v1/memcg_test.txt new file mode 100644 index 000000000..8870b0212 --- /dev/null +++ b/Documentation/cgroup-v1/memcg_test.txt @@ -0,0 +1,280 @@ +Memory Resource Controller(Memcg) Implementation Memo. +Last Updated: 2010/2 +Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34). + +Because VM is getting complex (one of the reasons is memcg...), memcg's behavior +is complex. This is a document for memcg's internal behavior. +Please note that implementation details can be changed. + +(*) Topics on API should be in Documentation/cgroup-v1/memory.txt + +0. How to record usage ? + 2 objects are used. + + page_cgroup ....an object per page. + Allocated at boot or memory hotplug. Freed at memory hot removal. + + swap_cgroup ... an entry per swp_entry. + Allocated at swapon(). Freed at swapoff(). + + The page_cgroup has USED bit and double count against a page_cgroup never + occurs. swap_cgroup is used only when a charged page is swapped-out. + +1. Charge + + a page/swp_entry may be charged (usage += PAGE_SIZE) at + + mem_cgroup_try_charge() + +2. Uncharge + a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by + + mem_cgroup_uncharge() + Called when a page's refcount goes down to 0.
+ + mem_cgroup_uncharge_swap() + Called when swp_entry's refcnt goes down to 0. A charge against swap + disappears. + +3. charge-commit-cancel + Memcg pages are charged in two steps: + mem_cgroup_try_charge() + mem_cgroup_commit_charge() or mem_cgroup_cancel_charge() + + At try_charge(), there are no flags to say "this page is charged". + at this point, usage += PAGE_SIZE. + + At commit(), the page is associated with the memcg. + + At cancel(), simply usage -= PAGE_SIZE. + +Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. + +4. Anonymous + Anonymous page is newly allocated at + - page fault into MAP_ANONYMOUS mapping. + - Copy-On-Write. + + 4.1 Swap-in. + At swap-in, the page is taken from swap-cache. There are 2 cases. + + (a) If the SwapCache is newly allocated and read, it has no charges. + (b) If the SwapCache has been mapped by processes, it has been + charged already. + + 4.2 Swap-out. + At swap-out, typical state transition is below. + + (a) add to swap cache. (marked as SwapCache) + swp_entry's refcnt += 1. + (b) fully unmapped. + swp_entry's refcnt += # of ptes. + (c) write back to swap. + (d) delete from swap cache. (remove from SwapCache) + swp_entry's refcnt -= 1. + + + Finally, at task exit, + (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. + +5. Page Cache + Page Cache is charged at + - add_to_page_cache_locked(). + + The logic is very clear. (About migration, see below) + Note: __remove_from_page_cache() is called by remove_from_page_cache() + and __remove_mapping(). + +6. Shmem(tmpfs) Page Cache + The best way to understand shmem's page state transition is to read + mm/shmem.c. + But brief explanation of the behavior of memcg around shmem will be + helpful to understand the logic. + + Shmem's page (just leaf page, not direct/indirect block) can be on + - radix-tree of shmem's inode. + - SwapCache. + - Both on radix-tree and SwapCache. This happens at swap-in + and swap-out, + + It's charged when... 
+ - A new page is added to shmem's radix-tree. + - A swp page is read. (move a charge from swap_cgroup to page_cgroup) + +7. Page Migration + + mem_cgroup_migrate() + +8. LRU + Each memcg has its own private LRU. Now, its handling is under global + VM's control (means that it's handled under global zone->lru_lock). + Almost all routines around memcg's LRU is called by global LRU's + list management functions under zone->lru_lock(). + + A special function is mem_cgroup_isolate_pages(). This scans + memcg's private LRU and call __isolate_lru_page() to extract a page + from LRU. + (By __isolate_lru_page(), the page is removed from both of global and + private LRU.) + + +9. Typical Tests. + + Tests for racy cases. + + 9.1 Small limit to memcg. + When you do test to do racy case, it's good test to set memcg's limit + to be very small rather than GB. Many races found in the test under + xKB or xxMB limits. + (Memory behavior under GB and Memory behavior under MB shows very + different situation.) + + 9.2 Shmem + Historically, memcg's shmem handling was poor and we saw some amount + of troubles here. This is because shmem is page-cache but can be + SwapCache. Test with shmem/tmpfs is always good test. + + 9.3 Migration + For NUMA, migration is an another special case. To do easy test, cpuset + is useful. Following is a sample script to do migration. + + mount -t cgroup -o cpuset none /opt/cpuset + + mkdir /opt/cpuset/01 + echo 1 > /opt/cpuset/01/cpuset.cpus + echo 0 > /opt/cpuset/01/cpuset.mems + echo 1 > /opt/cpuset/01/cpuset.memory_migrate + mkdir /opt/cpuset/02 + echo 1 > /opt/cpuset/02/cpuset.cpus + echo 1 > /opt/cpuset/02/cpuset.mems + echo 1 > /opt/cpuset/02/cpuset.memory_migrate + + In above set, when you moves a task from 01 to 02, page migration to + node 0 to node 1 will occur. Following is a script to migrate all + under cpuset. 
+ -- + move_task() + { + for pid in $1 + do + /bin/echo $pid >$2/tasks 2>/dev/null + echo -n $pid + echo -n " " + done + echo END + } + + G1_TASK=`cat ${G1}/tasks` + G2_TASK=`cat ${G2}/tasks` + move_task "${G1_TASK}" ${G2} & + -- + 9.4 Memory hotplug. + memory hotplug test is one of good test. + to offline memory, do following. + # echo offline > /sys/devices/system/memory/memoryXXX/state + (XXX is the place of memory) + This is an easy way to test page migration, too. + + 9.5 mkdir/rmdir + When using hierarchy, mkdir/rmdir test should be done. + Use tests like the following. + + echo 1 >/opt/cgroup/01/memory/use_hierarchy + mkdir /opt/cgroup/01/child_a + mkdir /opt/cgroup/01/child_b + + set limit to 01. + add limit to 01/child_b + run jobs under child_a and child_b + + create/delete following groups at random while jobs are running. + /opt/cgroup/01/child_a/child_aa + /opt/cgroup/01/child_b/child_bb + /opt/cgroup/01/child_c + + running new jobs in new group is also good. + + 9.6 Mount with other subsystems. + Mounting with other subsystems is a good test because there is a + race and lock dependency with other cgroup subsystems. + + example) + # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices + + and do task move, mkdir, rmdir etc...under this. + + 9.7 swapoff. + Besides management of swap is one of complicated parts of memcg, + call path of swap-in at swapoff is not same as usual swap-in path.. + It's worth to be tested explicitly. + + For example, test like following is good. + (Shell-A) + # mount -t cgroup none /cgroup -o memory + # mkdir /cgroup/test + # echo 40M > /cgroup/test/memory.limit_in_bytes + # echo 0 > /cgroup/test/tasks + Run malloc(100M) program under this. You'll see 60M of swaps. + (Shell-B) + # move all tasks in /cgroup/test to /cgroup + # /sbin/swapoff -a + # rmdir /cgroup/test + # kill malloc task. + + Of course, tmpfs v.s. swapoff test should be tested, too. 
+ + 9.8 OOM-Killer + Out-of-memory caused by memcg's limit will kill tasks under + the memcg. When hierarchy is used, a task under hierarchy + will be killed by the kernel. + In this case, panic_on_oom shouldn't be invoked and tasks + in other groups shouldn't be killed. + + It's not difficult to cause OOM under memcg as following. + Case A) when you can swapoff + #swapoff -a + #echo 50M > /memory.limit_in_bytes + run 51M of malloc + + Case B) when you use mem+swap limitation. + #echo 50M > memory.limit_in_bytes + #echo 50M > memory.memsw.limit_in_bytes + run 51M of malloc + + 9.9 Move charges at task migration + Charges associated with a task can be moved along with task migration. + + (Shell-A) + #mkdir /cgroup/A + #echo $$ >/cgroup/A/tasks + run some programs which uses some amount of memory in /cgroup/A. + + (Shell-B) + #mkdir /cgroup/B + #echo 1 >/cgroup/B/memory.move_charge_at_immigrate + #echo "pid of the program running in group A" >/cgroup/B/tasks + + You can see charges have been moved by reading *.usage_in_bytes or + memory.stat of both A and B. + See 8.2 of Documentation/cgroups/memory.txt to see what value should be + written to move_charge_at_immigrate. + + 9.10 Memory thresholds + Memory controller implements memory thresholds using cgroups notification + API. You can use tools/cgroup/cgroup_event_listener.c to test it. + + (Shell-A) Create cgroup and run event listener + # mkdir /cgroup/A + # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M + + (Shell-B) Add task to cgroup and try to allocate and free memory + # echo $$ >/cgroup/A/tasks + # a="$(dd if=/dev/zero bs=1M count=10)" + # a= + + You will see message from cgroup_event_listener every time you cross + the thresholds. + + Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds. + + It's good idea to test root cgroup as well. 
diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt new file mode 100644 index 000000000..ff71e16cc --- /dev/null +++ b/Documentation/cgroup-v1/memory.txt @@ -0,0 +1,876 @@ +Memory Resource Controller + +NOTE: This document is hopelessly outdated and it asks for a complete + rewrite. It still contains a useful information so we are keeping it + here but make sure to check the current code if you need a deeper + understanding. + +NOTE: The Memory Resource Controller has generically been referred to as the + memory controller in this document. Do not confuse memory controller + used here with the memory controller that is used in hardware. + +(For editors) +In this document: + When we mention a cgroup (cgroupfs's directory) with memory controller, + we call it "memory cgroup". When you see git-log and source code, you'll + see patch's title and function names tend to use "memcg". + In this document, we avoid using it. + +Benefits and Purpose of the memory controller + +The memory controller isolates the memory behaviour of a group of tasks +from the rest of the system. The article on LWN [12] mentions some probable +uses of the memory controller. The memory controller can be used to + +a. Isolate an application or a group of applications + Memory-hungry applications can be isolated and limited to a smaller + amount of memory. +b. Create a cgroup with a limited amount of memory; this can be used + as a good alternative to booting with mem=XXXX. +c. Virtualization solutions can control the amount of memory they want + to assign to a virtual machine instance. +d. A CD/DVD burner could control the amount of memory used by the + rest of the system to ensure that burning does not fail due to lack + of available memory. +e. There are several other use cases; find one or use the controller just + for fun (to learn and hack on the VM subsystem). 
+ +Current Status: linux-2.6.34-mmotm(development version of 2010/April) + +Features: + - accounting anonymous pages, file caches, swap caches usage and limiting them. + - pages are linked to per-memcg LRU exclusively, and there is no global LRU. + - optionally, memory+swap usage can be accounted and limited. + - hierarchical accounting + - soft limit + - moving (recharging) account at moving a task is selectable. + - usage threshold notifier + - memory pressure notifier + - oom-killer disable knob and oom-notifier + - Root cgroup has no limit controls. + + Kernel memory support is a work in progress, and the current version provides + basically functionality. (See Section 2.7) + +Brief summary of control files. + + tasks # attach a task(thread) and show list of threads + cgroup.procs # show list of processes + cgroup.event_control # an interface for event_fd() + memory.usage_in_bytes # show current usage for memory + (See 5.5 for details) + memory.memsw.usage_in_bytes # show current usage for memory+Swap + (See 5.5 for details) + memory.limit_in_bytes # set/show limit of memory usage + memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage + memory.failcnt # show the number of memory usage hits limits + memory.memsw.failcnt # show the number of memory+Swap hits limits + memory.max_usage_in_bytes # show max memory usage recorded + memory.memsw.max_usage_in_bytes # show max memory+Swap usage recorded + memory.soft_limit_in_bytes # set/show soft limit of memory usage + memory.stat # show various statistics + memory.use_hierarchy # set/show hierarchical account enabled + memory.force_empty # trigger forced move charge to parent + memory.pressure_level # set memory pressure notifications + memory.swappiness # set/show swappiness parameter of vmscan + (See sysctl's vm.swappiness) + memory.move_charge_at_immigrate # set/show controls of moving charges + memory.oom_control # set/show oom controls. 
+ memory.numa_stat # show the number of memory usage per numa node + + memory.kmem.limit_in_bytes # set/show hard limit for kernel memory + memory.kmem.usage_in_bytes # show current kernel memory allocation + memory.kmem.failcnt # show the number of kernel memory usage hits limits + memory.kmem.max_usage_in_bytes # show max kernel memory usage recorded + + memory.kmem.tcp.limit_in_bytes # set/show hard limit for tcp buf memory + memory.kmem.tcp.usage_in_bytes # show current tcp buf memory allocation + memory.kmem.tcp.failcnt # show the number of tcp buf memory usage hits limits + memory.kmem.tcp.max_usage_in_bytes # show max tcp buf memory usage recorded + +1. History + +The memory controller has a long history. A request for comments for the memory +controller was posted by Balbir Singh [1]. At the time the RFC was posted +there were several implementations for memory control. The goal of the +RFC was to build consensus and agreement for the minimal features required +for memory control. The first RSS controller was posted by Balbir Singh[2] +in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the +RSS controller. At OLS, at the resource management BoF, everyone suggested +that we handle both page cache and RSS together. Another request was raised +to allow user space handling of OOM. The current memory controller is +at version 6; it combines both mapped (RSS) and unmapped Page +Cache Control [11]. + +2. Memory Control + +Memory is a unique resource in the sense that it is present in a limited +amount. If a task requires a lot of CPU processing, the task can spread +its processing over a period of hours, days, months or years, but with +memory, the same physical memory needs to be reused to accomplish the task. + +The memory controller implementation has been divided into phases. These +are: + +1. Memory controller +2. mlock(2) controller +3. Kernel user memory accounting and slab control +4. 
user mappings length controller + +The memory controller is the first controller developed. + +2.1. Design + +The core of the design is a counter called the page_counter. The +page_counter tracks the current memory usage and limit of the group of +processes associated with the controller. Each cgroup has a memory controller +specific data structure (mem_cgroup) associated with it. + +2.2. Accounting + + +--------------------+ + | mem_cgroup | + | (page_counter) | + +--------------------+ + / ^ \ + / | \ + +---------------+ | +---------------+ + | mm_struct | |.... | mm_struct | + | | | | | + +---------------+ | +---------------+ + | + + --------------+ + | + +---------------+ +------+--------+ + | page +----------> page_cgroup| + | | | | + +---------------+ +---------------+ + + (Figure 1: Hierarchy of Accounting) + + +Figure 1 shows the important aspects of the controller + +1. Accounting happens per cgroup +2. Each mm_struct knows about which cgroup it belongs to +3. Each page has a pointer to the page_cgroup, which in turn knows the + cgroup it belongs to + +The accounting is done as follows: mem_cgroup_charge_common() is invoked to +set up the necessary data structures and check if the cgroup that is being +charged is over its limit. If it is, then reclaim is invoked on the cgroup. +More details can be found in the reclaim section of this document. +If everything goes well, a page meta-data-structure called page_cgroup is +updated. page_cgroup has its own LRU on cgroup. +(*) page_cgroup structure is allocated at boot/memory-hotplug time. + +2.2.1 Accounting details + +All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. +Some pages which are never reclaimable and will not be on the LRU +are not accounted. We just account pages under usual VM management. + +RSS pages are accounted at page_fault unless they've already been accounted +for earlier. A file page will be accounted for as Page Cache when it's +inserted into inode (radix-tree). 
While it's mapped into the page tables of +processes, duplicate accounting is carefully avoided. + +An RSS page is unaccounted when it's fully unmapped. A PageCache page is +unaccounted when it's removed from radix-tree. Even if RSS pages are fully +unmapped (by kswapd), they may exist as SwapCache in the system until they +are really freed. Such SwapCaches are also accounted. +A swapped-in page is not accounted until it's mapped. + +Note: The kernel does swapin-readahead and reads multiple swaps at once. +This means swapped-in pages may contain pages for other tasks than a task +causing page fault. So, we avoid accounting at swap-in I/O. + +At page migration, accounting information is kept. + +Note: we just account pages-on-LRU because our purpose is to control amount +of used pages; not-on-LRU pages tend to be out-of-control from VM view. + +2.3 Shared Page Accounting + +Shared pages are accounted on the basis of the first touch approach. The +cgroup that first touches a page is accounted for the page. The principle +behind this approach is that a cgroup that aggressively uses a shared +page will eventually get charged for it (once it is uncharged from +the cgroup that brought it in -- this will happen on memory pressure). + +But see section 8.2: when moving a task to another cgroup, its pages may +be recharged to the new cgroup, if move_charge_at_immigrate has been chosen. + +Exception: If CONFIG_MEMCG_SWAP is not used. +When you do swapoff and make swapped-out pages of shmem(tmpfs) to +be backed into memory in force, charges for pages are accounted against the +caller of swapoff rather than the users of shmem. + +2.4 Swap Extension (CONFIG_MEMCG_SWAP) + +Swap Extension allows you to record charge for swap. A swapped-in page is +charged back to original page allocator if possible. + +When swap is accounted, following files are added. + - memory.memsw.usage_in_bytes. + - memory.memsw.limit_in_bytes. + +memsw means memory+swap. 
Usage of memory+swap is limited by +memsw.limit_in_bytes. + +Example: Assume a system with 4G of swap. A task which allocates 6G of memory +(by mistake) under 2G memory limitation will use all swap. +In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. +By using the memsw limit, you can avoid system OOM which can be caused by swap +shortage. + +* why 'memory+swap' rather than swap. +The global LRU(kswapd) can swap out arbitrary pages. Swap-out means +to move account from memory to swap...there is no change in usage of +memory+swap. In other words, when we want to limit the usage of swap without +affecting global LRU, memory+swap limit is better than just limiting swap from +an OS point of view. + +* What happens when a cgroup hits memory.memsw.limit_in_bytes +When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out +in this cgroup. Then, swap-out will not be done by cgroup routine and file +caches are dropped. But as mentioned above, global LRU can do swapout memory +from it for sanity of the system's memory management state. You can't forbid +it by cgroup. + +2.5 Reclaim + +Each cgroup maintains a per cgroup LRU which has the same structure as +global VM. When a cgroup goes over its limit, we first try +to reclaim memory from the cgroup so as to make space for the new +pages that the cgroup has touched. If the reclaim is unsuccessful, +an OOM routine is invoked to select and kill the bulkiest task in the +cgroup. (See 10. OOM Control below.) + +The reclaim algorithm has not been modified for cgroups, except that +pages that are selected for reclaiming come from the per-cgroup LRU +list. + +NOTE: Reclaim does not work for the root cgroup, since we cannot set any +limits on the root cgroup. + +Note2: When panic_on_oom is set to "2", the whole system will panic. + +When oom event notifier is registered, event will be delivered. 
+(See oom_control section) + +2.6 Locking + + lock_page_cgroup()/unlock_page_cgroup() should not be called under + mapping->tree_lock. + + Other lock order is as follows: + PG_locked. + mm->page_table_lock + zone->lru_lock + lock_page_cgroup. + In many cases, just lock_page_cgroup() is called. + per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by + zone->lru_lock, it has no lock of its own. + +2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) + +With the Kernel memory extension, the Memory Controller is able to limit +the amount of kernel memory used by the system. Kernel memory is fundamentally +different from user memory, since it can't be swapped out, which makes it +possible to DoS the system by consuming too much of this precious resource. + +Kernel memory won't be accounted at all until a limit on a group is set. This +allows for existing setups to continue working without disruption. The limit +cannot be set if the cgroup has children, or if there are already tasks in the +cgroup. Attempting to set the limit under those conditions will return -EBUSY. +When use_hierarchy == 1 and a group is accounted, its children will +automatically be accounted regardless of their limit value. + +After a group is first limited, it will continue to be accounted until it +is removed. The memory limitation itself can, of course, be removed by writing +-1 to memory.kmem.limit_in_bytes. In this case, kmem will be accounted, but not +limited. + +Kernel memory limits are not imposed for the root cgroup. Usage for the root +cgroup may or may not be accounted. The memory used is accumulated into +memory.kmem.usage_in_bytes, or in a separate counter when it makes sense. +(currently only for tcp). +The main "kmem" counter is fed into the main counter, so kmem charges will +also be visible from the user counter. + +Currently no soft limit is implemented for kernel memory. It is future work +to trigger slab reclaim when those limits are reached.
+ +2.7.1 Current Kernel Memory resources accounted + +* stack pages: every process consumes some stack pages. By accounting into +kernel memory, we prevent new processes from being created when the kernel +memory usage is too high. + +* slab pages: pages allocated by the SLAB or SLUB allocator are tracked. A copy +of each kmem_cache is created the first time the cache is touched +from inside the memcg. The creation is done lazily, so some objects can still be +skipped while the cache is being created. All objects in a slab page should +belong to the same memcg. This only fails to hold when a task is migrated to a +different memcg during the page allocation by the cache. + +* sockets memory pressure: some sockets protocols have memory pressure +thresholds. The Memory Controller allows them to be controlled individually +per cgroup, instead of globally. + +* tcp memory pressure: sockets memory pressure for the tcp protocol. + +2.7.2 Common use cases + +Because the "kmem" counter is fed to the main user counter, kernel memory can +never be limited completely independently of user memory. Say "U" is the user +limit, and "K" the kernel limit. There are three possible ways limits can be +set: + + U != 0, K = unlimited: + This is the standard memcg limitation mechanism already present before kmem + accounting. Kernel memory is completely ignored. + + U != 0, K < U: + Kernel memory is a subset of the user memory. This setup is useful in + deployments where the total amount of memory per-cgroup is overcommitted. + Overcommitting kernel memory limits is definitely not recommended, since the + box can still run out of non-reclaimable memory. + In this case, the admin could set up K so that the sum of all groups is + never greater than the total memory, and freely set U at the cost of his + QoS. + WARNING: In the current implementation, memory reclaim will NOT be + triggered for a cgroup when it hits K while staying below U, which makes + this setup impractical.
+ + U != 0, K >= U: + Since kmem charges will also be fed to the user counter and reclaim will be + triggered for the cgroup for both kinds of memory. This setup gives the + admin a unified view of memory, and it is also useful for people who just + want to track kernel memory usage. + +3. User Interface + +3.0. Configuration + +a. Enable CONFIG_CGROUPS +b. Enable CONFIG_MEMCG +c. Enable CONFIG_MEMCG_SWAP (to use swap extension) +d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) + +3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) +# mount -t tmpfs none /sys/fs/cgroup +# mkdir /sys/fs/cgroup/memory +# mount -t cgroup none /sys/fs/cgroup/memory -o memory + +3.2. Make the new group and move bash into it +# mkdir /sys/fs/cgroup/memory/0 +# echo $$ > /sys/fs/cgroup/memory/0/tasks + +Since now we're in the 0 cgroup, we can alter the memory limit: +# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes + +NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, +mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.) + +NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). +NOTE: We cannot set limits on the root cgroup any more. + +# cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes +4194304 + +We can check the usage: +# cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes +1216512 + +A successful write to this file does not guarantee a successful setting of +this limit to the value written into the file. This can be due to a +number of factors, such as rounding up to page boundaries or the total +availability of memory on the system. The user is required to re-read +this file after a write to guarantee the value committed by the kernel. + +# echo 1 > memory.limit_in_bytes +# cat memory.limit_in_bytes +4096 + +The memory.failcnt field gives the number of times that the cgroup limit was +exceeded. + +The memory.stat file gives accounting information. 
Now, the number of +caches, RSS and Active pages/Inactive pages are shown. + +4. Testing + +For testing features and implementation, see memcg_test.txt. + +Performance test is also important. To see pure memory controller's overhead, +testing on tmpfs will give you good numbers of small overheads. +Example: do kernel make on tmpfs. + +Page-fault scalability is also important. At measuring parallel +page fault test, multi-process test may be better than multi-thread +test because it has noise of shared objects/status. + +But the above two are testing extreme situations. +Trying usual test under memory controller is always helpful. + +4.1 Troubleshooting + +Sometimes a user might find that the application under a cgroup is +terminated by the OOM killer. There are several causes for this: + +1. The cgroup limit is too low (just too low to do anything useful) +2. The user is using anonymous memory and swap is turned off or too low + +A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of +some of the pages cached in the cgroup (page cache pages). + +To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and +seeing what happens will be helpful. + +4.2 Task migration + +When a task migrates from one cgroup to another, its charge is not +carried forward by default. The pages allocated from the original cgroup still +remain charged to it, the charge is dropped when the page is freed or +reclaimed. + +You can move charges of a task along with task migration. +See 8. "Move charges at task migration" + +4.3 Removing a cgroup + +A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a +cgroup might have some charge associated with it, even though all +tasks have migrated away from it. (because we charge against pages, not +against tasks.) + +We move the stats to root (if use_hierarchy==0) or parent (if +use_hierarchy==1), and no change on the charge except uncharging +from the child. 
+ +Charges recorded in swap information is not updated at removal of cgroup. +Recorded information is discarded and a cgroup which uses swap (swapcache) +will be charged as a new owner of it. + +About use_hierarchy, see Section 6. + +5. Misc. interfaces. + +5.1 force_empty + memory.force_empty interface is provided to make cgroup's memory usage empty. + When writing anything to this + + # echo 0 > memory.force_empty + + the cgroup will be reclaimed and as many pages reclaimed as possible. + + The typical use case for this interface is before calling rmdir(). + Because rmdir() moves all pages to parent, some out-of-use page caches can be + moved to the parent. If you want to avoid that, force_empty will be useful. + + Also, note that when memory.kmem.limit_in_bytes is set the charges due to + kernel pages will still be seen. This is not considered a failure and the + write will still return success. In this case, it is expected that + memory.kmem.usage_in_bytes == memory.usage_in_bytes. + + About use_hierarchy, see Section 6. + +5.2 stat file + +memory.stat file includes following statistics + +# per-memory cgroup local status +cache - # of bytes of page cache memory. +rss - # of bytes of anonymous and swap cache memory (includes + transparent hugepages). +rss_huge - # of bytes of anonymous transparent hugepages. +mapped_file - # of bytes of mapped file (includes tmpfs/shmem) +pgpgin - # of charging events to the memory cgroup. The charging + event happens each time a page is accounted as either mapped + anon page(RSS) or cache page(Page Cache) to the cgroup. +pgpgout - # of uncharging events to the memory cgroup. The uncharging + event happens each time a page is unaccounted from the cgroup. +swap - # of bytes of swap usage +dirty - # of bytes that are waiting to get written back to the disk. +writeback - # of bytes of file/anon cache that are queued for syncing to + disk. +inactive_anon - # of bytes of anonymous and swap cache memory on inactive + LRU list. 
+active_anon - # of bytes of anonymous and swap cache memory on active + LRU list. +inactive_file - # of bytes of file-backed memory on inactive LRU list. +active_file - # of bytes of file-backed memory on active LRU list. +unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). + +# status considering hierarchy (see memory.use_hierarchy settings) + +hierarchical_memory_limit - # of bytes of memory limit with regard to hierarchy + under which the memory cgroup is +hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to + hierarchy under which memory cgroup is. + +total_<counter> - # hierarchical version of <counter>, which in + addition to the cgroup's own value includes the + sum of all hierarchical children's values of + <counter>, i.e. total_cache + +# The following additional stats are dependent on CONFIG_DEBUG_VM. + +recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) +recent_rotated_file - VM internal parameter. (see mm/vmscan.c) +recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) +recent_scanned_file - VM internal parameter. (see mm/vmscan.c) + +Memo: + recent_rotated means recent frequency of LRU rotation. + recent_scanned means recent # of scans to LRU. + These are shown to aid debugging; please see the code for their meanings. + +Note: + Only anonymous and swap cache memory is listed as part of 'rss' stat. + This should not be confused with the true 'resident set size' or the + amount of physical memory used by the cgroup. + 'rss + mapped_file' will give you resident set size of cgroup. + (Note: file and shmem may be shared among other cgroups. In that case, + mapped_file is accounted only when the memory cgroup is owner of page + cache.) + +5.3 swappiness + +Overrides /proc/sys/vm/swappiness for the particular group. The tunable +in the root cgroup corresponds to the global swappiness setting.
+ +Please note that unlike during the global reclaim, limit reclaim +enforces that 0 swappiness really prevents from any swapping even if +there is a swap storage available. This might lead to memcg OOM killer +if there are no file pages to reclaim. + +5.4 failcnt + +A memory cgroup provides memory.failcnt and memory.memsw.failcnt files. +This failcnt(== failure count) shows the number of times that a usage counter +hit its limit. When a memory cgroup hits a limit, failcnt increases and +memory under it will be reclaimed. + +You can reset failcnt by writing 0 to failcnt file. +# echo 0 > .../memory.failcnt + +5.5 usage_in_bytes + +For efficiency, as other kernel components, memory cgroup uses some optimization +to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the +method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzzy +value for efficient access. (Of course, when necessary, it's synchronized.) +If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP) +value in memory.stat(see 5.2). + +5.6 numa_stat + +This is similar to numa_maps but operates on a per-memcg basis. This is +useful for providing visibility into the numa locality information within +a memcg since the pages are allowed to be allocated from any physical +node. One of the use cases is evaluating application performance by +combining this information with the application's CPU allocation. + +Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable" +per-node page counts including "hierarchical_<counter>" which sums up all +hierarchical children's values in addition to the memcg's own value. + +The output format of memory.numa_stat is: + +total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ... +file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ... +anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ... +unevictable=<total unevictable pages> N0=<node 0 pages> N1=<node 1 pages> ... +hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ... + +The "total" count is sum of file + anon + unevictable. + +6. Hierarchy support + +The memory controller supports a deep hierarchy and hierarchical accounting.
+The hierarchy is created by creating the appropriate cgroups in the +cgroup filesystem. Consider for example, the following cgroup filesystem +hierarchy + + root + / | \ + / | \ + a b c + | \ + | \ + d e + +In the diagram above, with hierarchical accounting enabled, all memory +usage of e, is accounted to its ancestors up until the root (i.e, c and root), +that has memory.use_hierarchy enabled. If one of the ancestors goes over its +limit, the reclaim algorithm reclaims from the tasks in the ancestor and the +children of the ancestor. + +6.1 Enabling hierarchical accounting and reclaim + +A memory cgroup by default disables the hierarchy feature. Support +can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup + +# echo 1 > memory.use_hierarchy + +The feature can be disabled by + +# echo 0 > memory.use_hierarchy + +NOTE1: Enabling/disabling will fail if either the cgroup already has other + cgroups created below it, or if the parent cgroup has use_hierarchy + enabled. + +NOTE2: When panic_on_oom is set to "2", the whole system will panic in + case of an OOM event in any cgroup. + +7. Soft limits + +Soft limits allow for greater sharing of memory. The idea behind soft limits +is to allow control groups to use as much of the memory as needed, provided + +a. There is no memory contention +b. They do not exceed their hard limit + +When the system detects memory contention or low memory, control groups +are pushed back to their soft limits. If the soft limit of each control +group is very high, they are pushed back as much as possible to make +sure that one control group does not starve the others of memory. + +Please note that soft limits is a best-effort feature; it comes with +no guarantees, but it does its best to make sure that when memory is +heavily contended for, memory is allocated based on the soft limit +hints/setup. Currently soft limit based reclaim is set up such that +it gets invoked from balance_pgdat (kswapd). 
+ +7.1 Interface + +Soft limits can be set up by using the following commands (in this example we +assume a soft limit of 256 MiB) + +# echo 256M > memory.soft_limit_in_bytes + +If we want to change this to 1G, we can at any time use + +# echo 1G > memory.soft_limit_in_bytes + +NOTE1: Soft limits take effect over a long period of time, since they involve + reclaiming memory for balancing between memory cgroups +NOTE2: It is recommended to set the soft limit always below the hard limit, + otherwise the hard limit will take precedence. + +8. Move charges at task migration + +Users can move charges associated with a task along with task migration, that +is, uncharge task's pages from the old cgroup and charge them to the new cgroup. +This feature is not supported in !CONFIG_MMU environments because of lack of +page tables. + +8.1 Interface + +This feature is disabled by default. It can be enabled (and disabled again) by +writing to memory.move_charge_at_immigrate of the destination cgroup. + +If you want to enable it: + +# echo (some positive value) > memory.move_charge_at_immigrate + +Note: Each bit of move_charge_at_immigrate has its own meaning about what type + of charges should be moved. See 8.2 for details. +Note: Charges are moved only when you move mm->owner, in other words, + a leader of a thread group. +Note: If we cannot find enough space for the task in the destination cgroup, we + try to make space by reclaiming memory. Task migration may fail if we + cannot make enough space. +Note: It can take several seconds if you move a large amount of charges. + +And if you want to disable it again: + +# echo 0 > memory.move_charge_at_immigrate + +8.2 Type of charges which can be moved + +Each bit in move_charge_at_immigrate has its own meaning about what type of +charges should be moved. But in any case, it must be noted that an account of +a page or a swap can be moved only when it is charged to the task's current +(old) memory cgroup.
+ + bit | what type of charges would be moved ? + -----+------------------------------------------------------------------------ + 0 | A charge of an anonymous page (or swap of it) used by the target task. + | You must enable Swap Extension (see 2.4) to enable move of swap charges. + -----+------------------------------------------------------------------------ + 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) + | and swaps of tmpfs file) mmapped by the target task. Unlike the case of + | anonymous pages, file pages (and swaps) in the range mmapped by the task + | will be moved even if the task hasn't done page fault, i.e. they might + | not be the task's "RSS", but other task's "RSS" that maps the same file. + | And mapcount of the page is ignored (the page can be moved even if + | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to + | enable move of swap charges. + +8.3 TODO + +- All of moving charge operations are done under cgroup_mutex. It's not good + behavior to hold the mutex too long, so we may need some trick. + +9. Memory thresholds + +Memory cgroup implements memory thresholds using the cgroups notification +API (see cgroups.txt). It allows an application to register multiple memory +and memsw thresholds and get a notification when usage crosses one of them. + +To register a threshold, an application must: +- create an eventfd using eventfd(2); +- open memory.usage_in_bytes or memory.memsw.usage_in_bytes; +- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to + cgroup.event_control. + +Application will be notified through eventfd when memory usage crosses +threshold in any direction. + +It's applicable for root and non-root cgroup. + +10. OOM Control + +memory.oom_control file is for OOM notification and other controls. + +Memory cgroup implements OOM notifier using the cgroup notification +API (See cgroups.txt). It allows an application to register multiple OOM +notification deliveries and get a notification when OOM happens.
+ +To register a notifier, an application must: + - create an eventfd using eventfd(2) + - open memory.oom_control file + - write string like "<event_fd> <fd of memory.oom_control>" to + cgroup.event_control + +The application will be notified through eventfd when OOM happens. +OOM notification doesn't work for the root cgroup. + +You can disable the OOM-killer by writing "1" to memory.oom_control file, as: + + #echo 1 > memory.oom_control + +If OOM-killer is disabled, tasks under cgroup will hang/sleep +in memory cgroup's OOM-waitqueue when they request accountable memory. + +For running them, you have to relax the memory cgroup's OOM status by + * enlarge limit or reduce usage. +To reduce usage, + * kill some tasks. + * move some tasks to other group with account migration. + * remove some files (on tmpfs?) + +Then, stopped tasks will work again. + +At reading, current status of OOM is shown. + oom_kill_disable 0 or 1 (if 1, oom-killer is disabled) + under_oom 0 or 1 (if 1, the memory cgroup is under OOM, tasks may + be stopped.) + +11. Memory Pressure + +The pressure level notifications can be used to monitor the memory +allocation cost; based on the pressure, applications can implement +different strategies of managing their memory resources. The pressure +levels are defined as follows: + +The "low" level means that the system is reclaiming memory for new +allocations. Monitoring this reclaiming activity might be useful for +maintaining cache level. Upon notification, the program (typically +"Activity Manager") might analyze vmstat and act in advance (i.e. +prematurely shutdown unimportant services). + +The "medium" level means that the system is experiencing medium memory +pressure, the system might be making swap, paging out active file caches, +etc. Upon this event applications may decide to further analyze +vmstat/zoneinfo/memcg or internal memory usage statistics and free any +resources that can be easily reconstructed or re-read from a disk.
+ +The "critical" level means that the system is actively thrashing, it is +about to run out of memory (OOM) or even the in-kernel OOM killer is on its +way to trigger. Applications should do whatever they can to help the +system. It might be too late to consult with vmstat or any other +statistics, so it's advisable to take an immediate action. + +The events are propagated upward until the event is handled, i.e. the +events are not pass-through. Here is what this means: for example you have +three cgroups: A->B->C. Now you set up an event listener on cgroups A, B +and C, and suppose group C experiences some pressure. In this situation, +only group C will receive the notification, i.e. groups A and B will not +receive it. This is done to avoid excessive "broadcasting" of messages, +which disturbs the system and which is especially bad if we are low on +memory or thrashing. So, organize the cgroups wisely, or propagate the +events manually (or, ask us to implement the pass-through events, +explaining why you would need them.) + +The file memory.pressure_level is only used to set up an eventfd. To +register a notification, an application must: + +- create an eventfd using eventfd(2); +- open memory.pressure_level; +- write string like "<event_fd> <fd of memory.pressure_level> <level>" + to cgroup.event_control. + +Application will be notified through eventfd when memory pressure is at +the specific level (or higher). Read/write operations to +memory.pressure_level are not implemented. + +Test: + + Here is a small script example that makes a new cgroup, sets up a + memory limit, sets up a notification in the cgroup and then makes child + cgroup experience a critical pressure: + + # cd /sys/fs/cgroup/memory/ + # mkdir foo + # cd foo + # cgroup_event_listener memory.pressure_level low & + # echo 8000000 > memory.limit_in_bytes + # echo 8000000 > memory.memsw.limit_in_bytes + # echo $$ > tasks + # dd if=/dev/zero | read x + + (Expect a bunch of notifications, and eventually, the oom-killer will + trigger.) + +12.
TODO + +1. Make per-cgroup scanner reclaim not-shared pages first +2. Teach controller to account for shared-pages +3. Start reclamation in the background when the limit is + not yet hit but the usage is getting closer + +Summary + +Overall, the memory controller has been a stable controller and has been +commented and discussed quite extensively in the community. + +References + +1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ +2. Singh, Balbir. Memory Controller (RSS Control), + http://lwn.net/Articles/222762/ +3. Emelianov, Pavel. Resource controllers based on process cgroups + http://lkml.org/lkml/2007/3/6/198 +4. Emelianov, Pavel. RSS controller based on process cgroups (v2) + http://lkml.org/lkml/2007/4/9/78 +5. Emelianov, Pavel. RSS controller based on process cgroups (v3) + http://lkml.org/lkml/2007/5/30/244 +6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ +7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control + subsystem (v3), http://lwn.net/Articles/235534/ +8. Singh, Balbir. RSS controller v2 test results (lmbench), + http://lkml.org/lkml/2007/5/17/232 +9. Singh, Balbir. RSS controller v2 AIM9 results + http://lkml.org/lkml/2007/5/18/1 +10. Singh, Balbir. Memory controller v6 test results, + http://lkml.org/lkml/2007/8/19/36 +11. Singh, Balbir. Memory controller introduction (v6), + http://lkml.org/lkml/2007/8/17/69 +12. Corbet, Jonathan, Controlling memory use in cgroups, + http://lwn.net/Articles/243795/ diff --git a/Documentation/cgroup-v1/net_cls.txt b/Documentation/cgroup-v1/net_cls.txt new file mode 100644 index 000000000..ec182346d --- /dev/null +++ b/Documentation/cgroup-v1/net_cls.txt @@ -0,0 +1,39 @@ +Network classifier cgroup +------------------------- + +The Network classifier cgroup provides an interface to +tag network packets with a class identifier (classid). + +The Traffic Controller (tc) can be used to assign +different priorities to packets from different cgroups. 
+Also, Netfilter (iptables) can use this tag to perform +actions on such packets. + +Creating a net_cls cgroups instance creates a net_cls.classid file. +This net_cls.classid value is initialized to 0. + +You can write hexadecimal values to net_cls.classid; the format for these +values is 0xAAAABBBB; AAAA is the major handle number and BBBB +is the minor handle number. +Reading net_cls.classid yields a decimal result. + +Example: +mkdir /sys/fs/cgroup/net_cls +mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls +mkdir /sys/fs/cgroup/net_cls/0 +echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid + - setting a 10:1 handle. + +cat /sys/fs/cgroup/net_cls/0/net_cls.classid +1048577 + +configuring tc: +tc qdisc add dev eth0 root handle 10: htb + +tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit + - creating traffic class 10:1 + +tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup + +configuring iptables, basic example: +iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP diff --git a/Documentation/cgroup-v1/net_prio.txt b/Documentation/cgroup-v1/net_prio.txt new file mode 100644 index 000000000..a82cbd28e --- /dev/null +++ b/Documentation/cgroup-v1/net_prio.txt @@ -0,0 +1,55 @@ +Network priority cgroup +------------------------- + +The Network priority cgroup provides an interface to allow an administrator to +dynamically set the priority of network traffic generated by various +applications + +Nominally, an application would set the priority of its traffic via the +SO_PRIORITY socket option. This however, is not always possible because: + +1) The application may not have been coded to set this value +2) The priority of application traffic is often a site-specific administrative + decision rather than an application defined one. + +This cgroup allows an administrator to assign a process to a group which defines +the priority of egress traffic on a given interface. 
Network priority groups can +be created by first mounting the cgroup filesystem. + +# mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio + +With the above step, the initial group acting as the parent accounting group +becomes visible at '/sys/fs/cgroup/net_prio'. This group includes all tasks in +the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup. + +Each net_prio cgroup contains two files that are subsystem specific + +net_prio.prioidx +This file is read-only, and is simply informative. It contains a unique integer +value that the kernel uses as an internal representation of this cgroup. + +net_prio.ifpriomap +This file contains a map of the priorities assigned to traffic originating from +processes in this group and egressing the system on various interfaces. It +contains a list of tuples in the form <ifname priority>. Contents of this file +can be modified by echoing a string into the file using the same tuple format. +For example: + +echo "eth0 5" > /sys/fs/cgroup/net_prio/iscsi/net_prio.ifpriomap + +This command would force any traffic originating from processes belonging to the +iscsi net_prio cgroup and egressing on interface eth0 to have the priority of +said traffic set to the value 5. The parent accounting group also has a +writeable 'net_prio.ifpriomap' file that can be used to set a system default +priority. + +Priorities are set immediately prior to queueing a frame to the device +queueing discipline (qdisc) so priorities will be assigned prior to the hardware +queue selection being made. + +One usage for the net_prio cgroup is with mqprio qdisc allowing application +traffic to be steered to hardware/driver based traffic classes. These mappings +can then be managed by administrators or other networking protocols such as +DCBX. + +A new net_prio cgroup inherits the parent's configuration.
diff --git a/Documentation/cgroup-v1/pids.txt b/Documentation/cgroup-v1/pids.txt new file mode 100644 index 000000000..1a078b5d2 --- /dev/null +++ b/Documentation/cgroup-v1/pids.txt @@ -0,0 +1,85 @@ + Process Number Controller + ========================= + +Abstract +-------- + +The process number controller is used to allow a cgroup hierarchy to stop any +new tasks from being fork()'d or clone()'d after a certain limit is reached. + +Since it is trivial to hit the task limit without hitting any kmemcg limits in +place, PIDs are a fundamental resource. As such, PID exhaustion must be +preventable in the scope of a cgroup hierarchy by allowing resource limiting of +the number of tasks in a cgroup. + +Usage +----- + +In order to use the `pids` controller, set the maximum number of tasks in +pids.max (this is not available in the root cgroup for obvious reasons). The +number of processes currently in the cgroup is given by pids.current. + +Organisational operations are not blocked by cgroup policies, so it is possible +to have pids.current > pids.max. This can be done by either setting the limit to +be smaller than pids.current, or attaching enough processes to the cgroup such +that pids.current > pids.max. However, it is not possible to violate a cgroup +policy through fork() or clone(). fork() and clone() will return -EAGAIN if the +creation of a new process would cause a cgroup policy to be violated. + +To set a cgroup to have no limit, set pids.max to "max". This is the default for +all new cgroups (N.B. that PID limits are hierarchical, so the most stringent +limit in the hierarchy is followed). + +pids.current tracks all child cgroup hierarchies, so parent/pids.current is a +superset of parent/child/pids.current. 
+ +Example +------- + +First, we mount the pids controller: +# mkdir -p /sys/fs/cgroup/pids +# mount -t cgroup -o pids none /sys/fs/cgroup/pids + +Then we create a hierarchy, set limits and attach processes to it: +# mkdir -p /sys/fs/cgroup/pids/parent/child +# echo 2 > /sys/fs/cgroup/pids/parent/pids.max +# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs +# cat /sys/fs/cgroup/pids/parent/pids.current +2 +# + +It should be noted that attempts to overcome the set limit (2 in this case) will +fail: + +# cat /sys/fs/cgroup/pids/parent/pids.current +2 +# ( /bin/echo "Here's some processes for you." | cat ) +sh: fork: Resource temporarily unavailable +# + +Even if we migrate to a child cgroup (which doesn't have a set limit), we will +not be able to overcome the most stringent limit in the hierarchy (in this case, +parent's): + +# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs +# cat /sys/fs/cgroup/pids/parent/pids.current +2 +# cat /sys/fs/cgroup/pids/parent/child/pids.current +2 +# cat /sys/fs/cgroup/pids/parent/child/pids.max +max +# ( /bin/echo "Here's some processes for you." | cat ) +sh: fork: Resource temporarily unavailable +# + +We can set a limit that is smaller than pids.current, which will stop any new +processes from being forked at all (note that the shell itself counts towards +pids.current): + +# echo 1 > /sys/fs/cgroup/pids/parent/pids.max +# /bin/echo "We can't even spawn a single process now." +sh: fork: Resource temporarily unavailable +# echo 0 > /sys/fs/cgroup/pids/parent/pids.max +# /bin/echo "We can't even spawn a single process now." +sh: fork: Resource temporarily unavailable +# diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt new file mode 100644 index 000000000..ff49cf901 --- /dev/null +++ b/Documentation/cgroup-v2.txt @@ -0,0 +1,1386 @@ + +Control Group v2 + +October, 2015 Tejun Heo + +This is the authoritative documentation on the design, interface and +conventions of cgroup v2.
It describes all userland-visible aspects +of cgroup including core and specific controller behaviors. All +future changes must be reflected in this document. Documentation for +v1 is available under Documentation/cgroup-v1/. + +CONTENTS + +1. Introduction + 1-1. Terminology + 1-2. What is cgroup? +2. Basic Operations + 2-1. Mounting + 2-2. Organizing Processes + 2-3. [Un]populated Notification + 2-4. Controlling Controllers + 2-4-1. Enabling and Disabling + 2-4-2. Top-down Constraint + 2-4-3. No Internal Process Constraint + 2-5. Delegation + 2-5-1. Model of Delegation + 2-5-2. Delegation Containment + 2-6. Guidelines + 2-6-1. Organize Once and Control + 2-6-2. Avoid Name Collisions +3. Resource Distribution Models + 3-1. Weights + 3-2. Limits + 3-3. Protections + 3-4. Allocations +4. Interface Files + 4-1. Format + 4-2. Conventions + 4-3. Core Interface Files +5. Controllers + 5-1. CPU + 5-1-1. CPU Interface Files + 5-2. Memory + 5-2-1. Memory Interface Files + 5-2-2. Usage Guidelines + 5-2-3. Memory Ownership + 5-3. IO + 5-3-1. IO Interface Files + 5-3-2. Writeback +P. Information on Kernel Programming + P-1. Filesystem Support for Writeback +D. Deprecated v1 Core Features +R. Issues with v1 and Rationales for v2 + R-1. Multiple Hierarchies + R-2. Thread Granularity + R-3. Competition Between Inner Nodes and Threads + R-4. Other Interface Issues + R-5. Controller Issues and Remedies + R-5-1. Memory + + +1. Introduction + +1-1. Terminology + +"cgroup" stands for "control group" and is never capitalized. The +singular form is used to designate the whole feature and also as a +qualifier as in "cgroup controllers". When explicitly referring to +multiple individual control groups, the plural form "cgroups" is used. + + +1-2. What is cgroup? + +cgroup is a mechanism to organize processes hierarchically and +distribute system resources along the hierarchy in a controlled and +configurable manner. + +cgroup is largely composed of two parts - the core and controllers. 
+cgroup core is primarily responsible for hierarchically organizing +processes. A cgroup controller is usually responsible for +distributing a specific type of system resource along the hierarchy +although there are utility controllers which serve purposes other than +resource distribution. + +cgroups form a tree structure and every process in the system belongs +to one and only one cgroup. All threads of a process belong to the +same cgroup. On creation, all processes are put in the cgroup that +the parent process belongs to at the time. A process can be migrated +to another cgroup. Migration of a process doesn't affect already +existing descendant processes. + +Following certain structural constraints, controllers may be enabled or +disabled selectively on a cgroup. All controller behaviors are +hierarchical - if a controller is enabled on a cgroup, it affects all +processes which belong to the cgroups consisting the inclusive +sub-hierarchy of the cgroup. When a controller is enabled on a nested +cgroup, it always restricts the resource distribution further. The +restrictions set closer to the root in the hierarchy can not be +overridden from further away. + + +2. Basic Operations + +2-1. Mounting + +Unlike v1, cgroup v2 has only single hierarchy. The cgroup v2 +hierarchy can be mounted with the following mount command. + + # mount -t cgroup2 none $MOUNT_POINT + +cgroup2 filesystem has the magic number 0x63677270 ("cgrp"). All +controllers which support v2 and are not bound to a v1 hierarchy are +automatically bound to the v2 hierarchy and show up at the root. +Controllers which are not in active use in the v2 hierarchy can be +bound to other hierarchies. This allows mixing v2 hierarchy with the +legacy v1 multiple hierarchies in a fully backward compatible way. + +A controller can be moved across hierarchies only after the controller +is no longer referenced in its current hierarchy. 
Because per-cgroup +controller states are destroyed asynchronously and controllers may +have lingering references, a controller may not show up immediately on +the v2 hierarchy after the final umount of the previous hierarchy. +Similarly, a controller should be fully disabled to be moved out of +the unified hierarchy and it may take some time for the disabled +controller to become available for other hierarchies; furthermore, due +to inter-controller dependencies, other controllers may need to be +disabled too. + +While useful for development and manual configurations, moving +controllers dynamically between the v2 and other hierarchies is +strongly discouraged for production use. It is recommended to decide +the hierarchies and controller associations before starting using the +controllers after system boot. + + +2-2. Organizing Processes + +Initially, only the root cgroup exists to which all processes belong. +A child cgroup can be created by creating a sub-directory. + + # mkdir $CGROUP_NAME + +A given cgroup may have multiple child cgroups forming a tree +structure. Each cgroup has a read-writable interface file +"cgroup.procs". When read, it lists the PIDs of all processes which +belong to the cgroup one-per-line. The PIDs are not ordered and the +same PID may show up more than once if the process got moved to +another cgroup and then back or the PID got recycled while reading. + +A process can be migrated into a cgroup by writing its PID to the +target cgroup's "cgroup.procs" file. Only one process can be migrated +on a single write(2) call. If a process is composed of multiple +threads, writing the PID of any thread migrates all threads of the +process. + +When a process forks a child process, the new process is born into the +cgroup that the forking process belongs to at the time of the +operation. 
After exit, a process stays associated with the cgroup +that it belonged to at the time of exit until it's reaped; however, a +zombie process does not appear in "cgroup.procs" and thus can't be +moved to another cgroup. + +A cgroup which doesn't have any children or live processes can be +destroyed by removing the directory. Note that a cgroup which doesn't +have any children and is associated only with zombie processes is +considered empty and can be removed. + + # rmdir $CGROUP_NAME + +"/proc/$PID/cgroup" lists a process's cgroup membership. If legacy +cgroup is in use in the system, this file may contain multiple lines, +one for each hierarchy. The entry for cgroup v2 is always in the +format "0::$PATH". + + # cat /proc/842/cgroup + ... + 0::/test-cgroup/test-cgroup-nested + +If the process becomes a zombie and the cgroup it was associated with +is removed subsequently, " (deleted)" is appended to the path. + + # cat /proc/842/cgroup + ... + 0::/test-cgroup/test-cgroup-nested (deleted) + + +2-3. [Un]populated Notification + +Each non-root cgroup has a "cgroup.events" file which contains +"populated" field indicating whether the cgroup's sub-hierarchy has +live processes in it. Its value is 0 if there is no live process in +the cgroup and its descendants; otherwise, 1. poll and [id]notify +events are triggered when the value changes. This can be used, for +example, to start a clean-up operation after all processes of a given +sub-hierarchy have exited. The populated state updates and +notifications are recursive. Consider the following sub-hierarchy +where the numbers in the parentheses represent the numbers of processes +in each cgroup. + + A(4) - B(0) - C(1) + \ D(0) + +A, B and C's "populated" fields would be 1 while D's 0. After the one +process in C exits, B and C's "populated" fields would flip to "0" and +file modified events will be generated on the "cgroup.events" files of +both cgroups. + + +2-4. Controlling Controllers + +2-4-1. 
Enabling and Disabling + +Each cgroup has a "cgroup.controllers" file which lists all +controllers available for the cgroup to enable. + + # cat cgroup.controllers + cpu io memory + +No controller is enabled by default. Controllers can be enabled and +disabled by writing to the "cgroup.subtree_control" file. + + # echo "+cpu +memory -io" > cgroup.subtree_control + +Only controllers which are listed in "cgroup.controllers" can be +enabled. When multiple operations are specified as above, either they +all succeed or all fail. If multiple operations on the same controller +are specified, the last one is effective. + +Enabling a controller in a cgroup indicates that the distribution of +the target resource across its immediate children will be controlled. +Consider the following sub-hierarchy. The enabled controllers are +listed in parentheses. + + A(cpu,memory) - B(memory) - C() + \ D() + +As A has "cpu" and "memory" enabled, A will control the distribution +of CPU cycles and memory to its children, in this case, B. As B has +"memory" enabled but not "cpu", C and D will compete freely on CPU +cycles but their division of memory available to B will be controlled. + +As a controller regulates the distribution of the target resource to +the cgroup's children, enabling it creates the controller's interface +files in the child cgroups. In the above example, enabling "cpu" on B +would create the "cpu." prefixed controller interface files in C and +D. Likewise, disabling "memory" from B would remove the "memory." +prefixed controller interface files from C and D. This means that the +controller interface files - anything which doesn't start with +"cgroup." - are owned by the parent rather than the cgroup itself. + + +2-4-2. Top-down Constraint + +Resources are distributed top-down and a cgroup can further distribute +a resource only if the resource has been distributed to it from the +parent.
This means that all non-root "cgroup.subtree_control" files +can only contain controllers which are enabled in the parent's +"cgroup.subtree_control" file. A controller can be enabled only if +the parent has the controller enabled and a controller can't be +disabled if one or more children have it enabled. + + +2-4-3. No Internal Process Constraint + +Non-root cgroups can only distribute resources to their children when +they don't have any processes of their own. In other words, only +cgroups which don't contain any processes can have controllers enabled +in their "cgroup.subtree_control" files. + +This guarantees that, when a controller is looking at the part of the +hierarchy which has it enabled, processes are always only on the +leaves. This rules out situations where child cgroups compete against +internal processes of the parent. + +The root cgroup is exempt from this restriction. Root contains +processes and anonymous resource consumption which can't be associated +with any other cgroups and requires special treatment from most +controllers. How resource consumption in the root cgroup is governed +is up to each controller. + +Note that the restriction doesn't get in the way if there is no +enabled controller in the cgroup's "cgroup.subtree_control". This is +important as otherwise it wouldn't be possible to create children of a +populated cgroup. To control resource distribution of a cgroup, the +cgroup must create children and transfer all its processes to the +children before enabling controllers in its "cgroup.subtree_control" +file. + + +2-5. Delegation + +2-5-1. Model of Delegation + +A cgroup can be delegated to a less privileged user by granting write +access of the directory and its "cgroup.procs" file to the user. Note +that resource control interface files in a given directory control the +distribution of the parent's resources and thus must not be delegated +along with the directory. 
+ +Once delegated, the user can build sub-hierarchy under the directory, +organize processes as it sees fit and further distribute the resources +it received from the parent. The limits and other settings of all +resource controllers are hierarchical and regardless of what happens +in the delegated sub-hierarchy, nothing can escape the resource +restrictions imposed by the parent. + +Currently, cgroup doesn't impose any restrictions on the number of +cgroups in or nesting depth of a delegated sub-hierarchy; however, +this may be limited explicitly in the future. + + +2-5-2. Delegation Containment + +A delegated sub-hierarchy is contained in the sense that processes +can't be moved into or out of the sub-hierarchy by the delegatee. For +a process with a non-root euid to migrate a target process into a +cgroup by writing its PID to the "cgroup.procs" file, the following +conditions must be met. + +- The writer's euid must match either uid or suid of the target process. + +- The writer must have write access to the "cgroup.procs" file. + +- The writer must have write access to the "cgroup.procs" file of the + common ancestor of the source and destination cgroups. + +The above three constraints ensure that while a delegatee may migrate +processes around freely in the delegated sub-hierarchy it can't pull +in from or push out to outside the sub-hierarchy. + +For an example, let's assume cgroups C0 and C1 have been delegated to +user U0 who created C00, C01 under C0 and C10 under C1 as follows and +all processes under C0 and C1 belong to U0. + + ~~~~~~~~~~~~~ - C0 - C00 + ~ cgroup ~ \ C01 + ~ hierarchy ~ + ~~~~~~~~~~~~~ - C1 - C10 + +Let's also say U0 wants to write the PID of a process which is +currently in C10 into "C00/cgroup.procs". 
U0 has write access to the +file and uid match on the process; however, the common ancestor of the +source cgroup C10 and the destination cgroup C00 is above the points +of delegation and U0 would not have write access to its "cgroup.procs" +files and thus the write will be denied with -EACCES. + + +2-6. Guidelines + +2-6-1. Organize Once and Control + +Migrating a process across cgroups is a relatively expensive operation +and stateful resources such as memory are not moved together with the +process. This is an explicit design decision as there often exist +inherent trade-offs between migration and various hot paths in terms +of synchronization cost. + +As such, migrating processes across cgroups frequently as a means to +apply different resource restrictions is discouraged. A workload +should be assigned to a cgroup according to the system's logical and +resource structure once on start-up. Dynamic adjustments to resource +distribution can be made by changing controller configuration through +the interface files. + + +2-6-2. Avoid Name Collisions + +Interface files for a cgroup and its children cgroups occupy the same +directory and it is possible to create children cgroups which collide +with interface files. + +All cgroup core interface files are prefixed with "cgroup." and each +controller's interface files are prefixed with the controller name and +a dot. A controller's name is composed of lower case alphabets and +'_'s but never begins with an '_' so it can be used as the prefix +character for collision avoidance. Also, interface file names won't +start or end with terms which are often used in categorizing workloads +such as job, service, slice, unit or workload. + +cgroup doesn't do anything to prevent name collisions and it's the +user's responsibility to avoid them. + + +3. Resource Distribution Models + +cgroup controllers implement several resource distribution schemes +depending on the resource type and expected use cases. 
This section +describes major schemes in use along with their expected behaviors. + + +3-1. Weights + +A parent's resource is distributed by adding up the weights of all +active children and giving each the fraction matching the ratio of its +weight against the sum. As only children which can make use of the +resource at the moment participate in the distribution, this is +work-conserving. Due to the dynamic nature, this model is usually +used for stateless resources. + +All weights are in the range [1, 10000] with the default at 100. This +allows symmetric multiplicative biases in both directions at fine +enough granularity while staying in the intuitive range. + +As long as the weight is in range, all configuration combinations are +valid and there is no reason to reject configuration changes or +process migrations. + +"cpu.weight" proportionally distributes CPU cycles to active children +and is an example of this type. + + +3-2. Limits + +A child can only consume up to the configured amount of the resource. +Limits can be over-committed - the sum of the limits of children can +exceed the amount of resource available to the parent. + +Limits are in the range [0, max] and default to "max", which is a no-op. + +As limits can be over-committed, all configuration combinations are +valid and there is no reason to reject configuration changes or +process migrations. + +"io.max" limits the maximum BPS and/or IOPS that a cgroup can consume +on an IO device and is an example of this type. + + +3-3. Protections + +A cgroup is protected to be allocated up to the configured amount of +the resource if the usages of all its ancestors are under their +protected levels. Protections can be hard guarantees or best effort +soft boundaries. Protections can also be over-committed in which case +only up to the amount available to the parent is protected among +children. + +Protections are in the range [0, max] and default to 0, which is +a no-op.
+ +As protections can be over-committed, all configuration combinations +are valid and there is no reason to reject configuration changes or +process migrations. + +"memory.low" implements best-effort memory protection and is an +example of this type. + + +3-4. Allocations + +A cgroup is exclusively allocated a certain amount of a finite +resource. Allocations can't be over-committed - the sum of the +allocations of children can not exceed the amount of resource +available to the parent. + +Allocations are in the range [0, max] and defaults to 0, which is no +resource. + +As allocations can't be over-committed, some configuration +combinations are invalid and should be rejected. Also, if the +resource is mandatory for execution of processes, process migrations +may be rejected. + +"cpu.rt.max" hard-allocates realtime slices and is an example of this +type. + + +4. Interface Files + +4-1. Format + +All interface files should be in one of the following formats whenever +possible. + + New-line separated values + (when only one value can be written at once) + + VAL0\n + VAL1\n + ... + + Space separated values + (when read-only or multiple values can be written at once) + + VAL0 VAL1 ...\n + + Flat keyed + + KEY0 VAL0\n + KEY1 VAL1\n + ... + + Nested keyed + + KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01... + KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11... + ... + +For a writable file, the format for writing should generally match +reading; however, controllers may allow omitting later fields or +implement restricted shortcuts for most common use cases. + +For both flat and nested keyed files, only the values for a single key +can be written at a time. For nested keyed files, the sub key pairs +may be specified in any order and not all pairs have to be specified. + + +4-2. Conventions + +- Settings for a single feature should be contained in a single file. + +- The root cgroup should be exempt from resource control and thus + shouldn't have resource control interface files. 
Also, + informational files on the root cgroup which end up showing global + information available elsewhere shouldn't exist. + +- If a controller implements weight based resource distribution, its + interface file should be named "weight" and have the range [1, + 10000] with 100 as the default. The values are chosen to allow + enough and symmetric bias in both directions while keeping it + intuitive (the default is 100%). + +- If a controller implements an absolute resource guarantee and/or + limit, the interface files should be named "min" and "max" + respectively. If a controller implements best effort resource + guarantee and/or limit, the interface files should be named "low" + and "high" respectively. + + In the above four control files, the special token "max" should be + used to represent upward infinity for both reading and writing. + +- If a setting has a configurable default value and keyed specific + overrides, the default entry should be keyed with "default" and + appear as the first entry in the file. + + The default value can be updated by writing either "default $VAL" or + "$VAL". + + When writing to update a specific override, "default" can be used as + the value to indicate removal of the override. Override entries + with "default" as the value must not appear when read. + + For example, a setting which is keyed by major:minor device numbers + with integer values may look like the following. 
+ + # cat cgroup-example-interface-file + default 150 + 8:0 300 + + The default value can be updated by + + # echo 125 > cgroup-example-interface-file + + or + + # echo "default 125" > cgroup-example-interface-file + + An override can be set by + + # echo "8:16 170" > cgroup-example-interface-file + + and cleared by + + # echo "8:0 default" > cgroup-example-interface-file + # cat cgroup-example-interface-file + default 125 + 8:16 170 + +- For events which are not very high frequency, an interface file + "events" should be created which lists event key value pairs. + Whenever a notifiable event happens, file modified event should be + generated on the file. + + +4-3. Core Interface Files + +All cgroup core files are prefixed with "cgroup." + + cgroup.procs + + A read-write new-line separated values file which exists on + all cgroups. + + When read, it lists the PIDs of all processes which belong to + the cgroup one-per-line. The PIDs are not ordered and the + same PID may show up more than once if the process got moved + to another cgroup and then back or the PID got recycled while + reading. + + A PID can be written to migrate the process associated with + the PID to the cgroup. The writer should match all of the + following conditions. + + - Its euid is either root or must match either uid or suid of + the target process. + + - It must have write access to the "cgroup.procs" file. + + - It must have write access to the "cgroup.procs" file of the + common ancestor of the source and destination cgroups. + + When delegating a sub-hierarchy, write access to this file + should be granted along with the containing directory. + + cgroup.controllers + + A read-only space separated values file which exists on all + cgroups. + + It shows space separated list of all controllers available to + the cgroup. The controllers are not ordered. + + cgroup.subtree_control + + A read-write space separated values file which exists on all + cgroups. Starts out empty. 
+ + When read, it shows space separated list of the controllers + which are enabled to control resource distribution from the + cgroup to its children. + + Space separated list of controllers prefixed with '+' or '-' + can be written to enable or disable controllers. A controller + name prefixed with '+' enables the controller and '-' + disables. If a controller appears more than once on the list, + the last one is effective. When multiple enable and disable + operations are specified, either all succeed or all fail. + + cgroup.events + + A read-only flat-keyed file which exists on non-root cgroups. + The following entries are defined. Unless specified + otherwise, a value change in this file generates a file + modified event. + + populated + + 1 if the cgroup or its descendants contain any live + processes; otherwise, 0. + + +5. Controllers + +5-1. CPU + +[NOTE: The interface for the cpu controller hasn't been merged yet] + +The "cpu" controller regulates distribution of CPU cycles. This +controller implements weight and absolute bandwidth limit models for +normal scheduling policy and absolute bandwidth allocation model for +realtime scheduling policy. + + +5-1-1. CPU Interface Files + +All time durations are in microseconds. + + cpu.stat + + A read-only flat-keyed file which exists on non-root cgroups. + + It reports the following six stats. + + usage_usec + user_usec + system_usec + nr_periods + nr_throttled + throttled_usec + + cpu.weight + + A read-write single value file which exists on non-root + cgroups. The default is "100". + + The weight in the range [1, 10000]. + + cpu.max + + A read-write two value file which exists on non-root cgroups. + The default is "max 100000". + + The maximum bandwidth limit. It's in the following format. + + $MAX $PERIOD + + which indicates that the group may consume up to $MAX in each + $PERIOD duration. "max" for $MAX indicates no limit. If only + one number is written, $MAX is updated.
+ + cpu.rt.max + + [NOTE: The semantics of this file is still under discussion and the + interface hasn't been merged yet] + + A read-write two value file which exists on all cgroups. + The default is "0 100000". + + The maximum realtime runtime allocation. Over-committing + configurations are disallowed and process migrations are + rejected if not enough bandwidth is available. It's in the + following format. + + $MAX $PERIOD + + which indicates that the group may consume upto $MAX in each + $PERIOD duration. If only one number is written, $MAX is + updated. + + +5-2. Memory + +The "memory" controller regulates distribution of memory. Memory is +stateful and implements both limit and protection models. Due to the +intertwining between memory usage and reclaim pressure and the +stateful nature of memory, the distribution model is relatively +complex. + +While not completely water-tight, all major memory usages by a given +cgroup are tracked so that the total memory consumption can be +accounted and controlled to a reasonable extent. Currently, the +following types of memory usages are tracked. + +- Userland memory - page cache and anonymous memory. + +- Kernel data structures such as dentries and inodes. + +- TCP socket buffers. + +The above list may expand in the future for better coverage. + + +5-2-1. Memory Interface Files + +All memory amounts are in bytes. If a value which is not aligned to +PAGE_SIZE is written, the value may be rounded up to the closest +PAGE_SIZE multiple when read back. + + memory.current + + A read-only single value file which exists on non-root + cgroups. + + The total amount of memory currently being used by the cgroup + and its descendants. + + memory.low + + A read-write single value file which exists on non-root + cgroups. The default is "0". + + Best-effort memory protection. 
If the memory usages of a + cgroup and all its ancestors are below their low boundaries, + the cgroup's memory won't be reclaimed unless memory can be + reclaimed from unprotected cgroups. + + Putting more memory than generally available under this + protection is discouraged. + + memory.high + + A read-write single value file which exists on non-root + cgroups. The default is "max". + + Memory usage throttle limit. This is the main mechanism to + control memory usage of a cgroup. If a cgroup's usage goes + over the high boundary, the processes of the cgroup are + throttled and put under heavy reclaim pressure. + + Going over the high limit never invokes the OOM killer and + under extreme conditions the limit may be breached. + + memory.max + + A read-write single value file which exists on non-root + cgroups. The default is "max". + + Memory usage hard limit. This is the final protection + mechanism. If a cgroup's memory usage reaches this limit and + can't be reduced, the OOM killer is invoked in the cgroup. + Under certain circumstances, the usage may go over the limit + temporarily. + + This is the ultimate protection mechanism. As long as the + high limit is used and monitored properly, this limit's + utility is limited to providing the final safety net. + + memory.events + + A read-only flat-keyed file which exists on non-root cgroups. + The following entries are defined. Unless specified + otherwise, a value change in this file generates a file + modified event. + + low + + The number of times the cgroup is reclaimed due to + high memory pressure even though its usage is under + the low boundary. This usually indicates that the low + boundary is over-committed. + + high + + The number of times processes of the cgroup are + throttled and routed to perform direct memory reclaim + because the high memory boundary was exceeded. 
For a + cgroup whose memory usage is capped by the high limit + rather than global memory pressure, this event's + occurrences are expected. + + max + + The number of times the cgroup's memory usage was + about to go over the max boundary. If direct reclaim + fails to bring it down, the OOM killer is invoked. + + oom + + The number of times the OOM killer has been invoked in + the cgroup. This may not exactly match the number of + processes killed but should generally be close. + + memory.stat + + A read-only flat-keyed file which exists on non-root cgroups. + + This breaks down the cgroup's memory footprint into different + types of memory, type-specific details, and other information + on the state and past events of the memory management system. + + All memory amounts are in bytes. + + The entries are ordered to be human readable, and new entries + can show up in the middle. Don't rely on items remaining in a + fixed position; use the keys to look up specific values! + + anon + + Amount of memory used in anonymous mappings such as + brk(), sbrk(), and mmap(MAP_ANONYMOUS) + + file + + Amount of memory used to cache filesystem data, + including tmpfs and shared memory. + + sock + + Amount of memory used in network transmission buffers + + file_mapped + + Amount of cached filesystem data mapped with mmap() + + file_dirty + + Amount of cached filesystem data that was modified but + not yet written back to disk + + file_writeback + + Amount of cached filesystem data that was modified and + is currently being written back to disk + + inactive_anon + active_anon + inactive_file + active_file + unevictable + + Amount of memory, swap-backed and filesystem-backed, + on the internal memory management lists used by the + page reclaim algorithm + + pgfault + + Total number of page faults incurred + + pgmajfault + + Number of major page faults incurred + + memory.swap.current + + A read-only single value file which exists on non-root + cgroups. 
+ + The total amount of swap currently being used by the cgroup + and its descendants. + + memory.swap.max + + A read-write single value file which exists on non-root + cgroups. The default is "max". + + Swap usage hard limit. If a cgroup's swap usage reaches this + limit, anonymous memory of the cgroup will not be swapped out. + + +5-2-2. General Usage + +"memory.high" is the main mechanism to control memory usage. +Over-committing on high limit (sum of high limits > available memory) +and letting global memory pressure distribute memory according to +usage is a viable strategy. + +Because breach of the high limit doesn't trigger the OOM killer but +throttles the offending cgroup, a management agent has ample +opportunities to monitor and take appropriate actions such as granting +more memory or terminating the workload. + +Determining whether a cgroup has enough memory is not trivial as +memory usage doesn't indicate whether the workload can benefit from +more memory. For example, a workload which writes data received from +network to a file can use all available memory but can also operate as +performant with a small amount of memory. A measure of memory +pressure - how much the workload is being impacted due to lack of +memory - is necessary to determine whether a workload needs more +memory; unfortunately, memory pressure monitoring mechanism isn't +implemented yet. + + +5-2-3. Memory Ownership + +A memory area is charged to the cgroup which instantiated it and stays +charged to the cgroup until the area is released. Migrating a process +to a different cgroup doesn't move the memory usages that it +instantiated while in the previous cgroup to the new cgroup. + +A memory area may be used by processes belonging to different cgroups. +To which cgroup the area will be charged is non-deterministic; however, +over time, the memory area is likely to end up in a cgroup which has +enough memory allowance to avoid high reclaim pressure.
+ +If a cgroup sweeps a considerable amount of memory which is expected +to be accessed repeatedly by other cgroups, it may make sense to use +POSIX_FADV_DONTNEED to relinquish the ownership of memory areas +belonging to the affected files to ensure correct memory ownership. + + +5-3. IO + +The "io" controller regulates the distribution of IO resources. This +controller implements both weight based and absolute bandwidth or IOPS +limit distribution; however, weight based distribution is available +only if cfq-iosched is in use and neither scheme is available for +blk-mq devices. + + +5-3-1. IO Interface Files + + io.stat + + A read-only nested-keyed file which exists on non-root + cgroups. + + Lines are keyed by $MAJ:$MIN device numbers and not ordered. + The following nested keys are defined. + + rbytes Bytes read + wbytes Bytes written + rios Number of read IOs + wios Number of write IOs + + An example read output follows. + + 8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 + 8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 + + io.weight + + A read-write flat-keyed file which exists on non-root cgroups. + The default is "default 100". + + The first line is the default weight applied to devices + without specific override. The rest are overrides keyed by + $MAJ:$MIN device numbers and not ordered. The weights are in + the range [1, 10000] and specify the relative amount of IO time + the cgroup can use in relation to its siblings. + + The default weight can be updated by writing either "default + $WEIGHT" or simply "$WEIGHT". Overrides can be set by writing + "$MAJ:$MIN $WEIGHT" and unset by writing "$MAJ:$MIN default". + + An example read output follows. + + default 100 + 8:16 200 + 8:0 50 + + io.max + + A read-write nested-keyed file which exists on non-root + cgroups. + + BPS and IOPS based IO limit. Lines are keyed by $MAJ:$MIN + device numbers and not ordered. The following nested keys are + defined.
+ + rbps Max read bytes per second + wbps Max write bytes per second + riops Max read IO operations per second + wiops Max write IO operations per second + + When writing, any number of nested key-value pairs can be + specified in any order. "max" can be specified as the value + to remove a specific limit. If the same key is specified + multiple times, the outcome is undefined. + + BPS and IOPS are measured in each IO direction and IOs are + delayed if the limit is reached. Temporary bursts are allowed. + + Setting read limit at 2M BPS and write at 120 IOPS for 8:16. + + echo "8:16 rbps=2097152 wiops=120" > io.max + + Reading returns the following. + + 8:16 rbps=2097152 wbps=max riops=max wiops=120 + + Write IOPS limit can be removed by writing the following. + + echo "8:16 wiops=max" > io.max + + Reading now returns the following. + + 8:16 rbps=2097152 wbps=max riops=max wiops=max + + +5-3-2. Writeback + +Page cache is dirtied through buffered writes and shared mmaps and +written asynchronously to the backing filesystem by the writeback +mechanism. Writeback sits between the memory and IO domains and +regulates the proportion of dirty memory by balancing dirtying and +write IOs. + +The io controller, in conjunction with the memory controller, +implements control of page cache writeback IOs. The memory controller +defines the memory domain that dirty memory ratio is calculated and +maintained for and the io controller defines the io domain which +writes out dirty pages for the memory domain. Both system-wide and +per-cgroup dirty memory states are examined and the more restrictive +of the two is enforced. + +cgroup writeback requires explicit support from the underlying +filesystem. Currently, cgroup writeback is implemented on ext2, ext4 +and btrfs. On other filesystems, all writeback IOs are attributed to +the root cgroup. + +There are inherent differences in memory and writeback management +which affect how cgroup ownership is tracked.
Memory is tracked per +page while writeback per inode. For the purpose of writeback, an +inode is assigned to a cgroup and all IO requests to write dirty pages +from the inode are attributed to that cgroup. + +As cgroup ownership for memory is tracked per page, there can be pages +which are associated with different cgroups than the one the inode is +associated with. These are called foreign pages. The writeback +constantly keeps track of foreign pages and, if a particular foreign +cgroup becomes the majority over a certain period of time, switches +the ownership of the inode to that cgroup. + +While this model is enough for most use cases where a given inode is +mostly dirtied by a single cgroup even when the main writing cgroup +changes over time, use cases where multiple cgroups write to a single +inode simultaneously are not supported well. In such circumstances, a +significant portion of IOs are likely to be attributed incorrectly. +As memory controller assigns page ownership on the first use and +doesn't update it until the page is released, even if writeback +strictly follows page ownership, multiple cgroups dirtying overlapping +areas wouldn't work as expected. It's recommended to avoid such usage +patterns. + +The sysctl knobs which affect writeback behavior are applied to cgroup +writeback as follows. + + vm.dirty_background_ratio + vm.dirty_ratio + + These ratios apply the same to cgroup writeback with the + amount of available memory capped by limits imposed by the + memory controller and system-wide clean memory. + + vm.dirty_background_bytes + vm.dirty_bytes + + For cgroup writeback, this is calculated into ratio against + total available memory and applied the same way as + vm.dirty[_background]_ratio. + + +P. Information on Kernel Programming + +This section contains kernel programming information in the areas +where interacting with cgroup is necessary. cgroup core and +controllers are not covered. + + +P-1. 
Filesystem Support for Writeback + +A filesystem can support cgroup writeback by updating +address_space_operations->writepage[s]() to annotate bio's using the +following two functions. + + wbc_init_bio(@wbc, @bio) + + Should be called for each bio carrying writeback data and + associates the bio with the inode's owner cgroup. Can be + called anytime between bio allocation and submission. + + wbc_account_io(@wbc, @page, @bytes) + + Should be called for each data segment being written out. + While this function doesn't care exactly when it's called + during the writeback session, it's the easiest and most + natural to call it as data segments are added to a bio. + +With writeback bio's annotated, cgroup support can be enabled per +super_block by setting SB_I_CGROUPWB in ->s_iflags. This allows for +selective disabling of cgroup writeback support which is helpful when +certain filesystem features, e.g. journaled data mode, are +incompatible. + +wbc_init_bio() binds the specified bio to its cgroup. Depending on +the configuration, the bio may be executed at a lower priority and if +the writeback session is holding shared resources, e.g. a journal +entry, may lead to priority inversion. There is no one easy solution +for the problem. Filesystems can try to work around specific problem +cases by skipping wbc_init_bio() or using bio_associate_blkcg() +directly. + + +D. Deprecated v1 Core Features + +- Multiple hierarchies including named ones are not supported. + +- All mount options and remounting are not supported. + +- The "tasks" file is removed and "cgroup.procs" is not sorted. + +- "cgroup.clone_children" is removed. + +- /proc/cgroups is meaningless for v2. Use "cgroup.controllers" file + at the root instead. + + +R. Issues with v1 and Rationales for v2 + +R-1. Multiple Hierarchies + +cgroup v1 allowed an arbitrary number of hierarchies and each +hierarchy could host any number of controllers. 
While this seemed to +provide a high level of flexibility, it wasn't useful in practice. + +For example, as there is only one instance of each controller, utility +type controllers such as freezer which can be useful in all +hierarchies could only be used in one. The issue was exacerbated by +the fact that controllers couldn't be moved to another hierarchy once +hierarchies were populated. Another issue was that all controllers +bound to a hierarchy were forced to have exactly the same view of the +hierarchy. It wasn't possible to vary the granularity depending on +the specific controller. + +In practice, these issues heavily limited which controllers could be +put on the same hierarchy and most configurations resorted to putting +each controller on its own hierarchy. Only closely related ones, such +as the cpu and cpuacct controllers, made sense to be put on the same +hierarchy. This often meant that userland ended up managing multiple +similar hierarchies repeating the same steps on each hierarchy +whenever a hierarchy management operation was necessary. + +Furthermore, support for multiple hierarchies came at a steep cost. +It greatly complicated cgroup core implementation but more importantly +the support for multiple hierarchies restricted how cgroup could be +used in general and what controllers were able to do. + +There was no limit on how many hierarchies there might be, which meant +that a thread's cgroup membership couldn't be described in finite +length. The key might contain any number of entries and was unlimited +in length, which made it highly awkward to manipulate and led to +addition of controllers which existed only to identify membership, +which in turn exacerbated the original problem of proliferating number +of hierarchies.
+ +Also, as a controller couldn't have any expectation regarding the +topologies of hierarchies other controllers might be on, each +controller had to assume that all other controllers were attached to +completely orthogonal hierarchies. This made it impossible, or at +least very cumbersome, for controllers to cooperate with each other. + +In most use cases, putting controllers on hierarchies which are +completely orthogonal to each other isn't necessary. What usually is +called for is the ability to have differing levels of granularity +depending on the specific controller. In other words, hierarchy may +be collapsed from leaf towards root when viewed from specific +controllers. For example, a given configuration might not care about +how memory is distributed beyond a certain level while still wanting +to control how CPU cycles are distributed. + + +R-2. Thread Granularity + +cgroup v1 allowed threads of a process to belong to different cgroups. +This didn't make sense for some controllers and those controllers +ended up implementing different ways to ignore such situations but +much more importantly it blurred the line between API exposed to +individual applications and system management interface. + +Generally, in-process knowledge is available only to the process +itself; thus, unlike service-level organization of processes, +categorizing threads of a process requires active participation from +the application which owns the target process. + +cgroup v1 had an ambiguously defined delegation model which got abused +in combination with thread granularity. cgroups were delegated to +individual applications so that they can create and manage their own +sub-hierarchies and control resource distributions along them. This +effectively raised cgroup to the status of a syscall-like API exposed +to lay programs. + +First of all, cgroup has a fundamentally inadequate interface to be +exposed this way. 
For a process to access its own knobs, it has to +extract the path on the target hierarchy from /proc/self/cgroup, +construct the path by appending the name of the knob to the path, open +and then read and/or write to it. This is not only extremely clunky +and unusual but also inherently racy. There is no conventional way to +define transaction across the required steps and nothing can guarantee +that the process would actually be operating on its own sub-hierarchy. + +cgroup controllers implemented a number of knobs which would never be +accepted as public APIs because they were just adding control knobs to +system-management pseudo filesystem. cgroup ended up with interface +knobs which were not properly abstracted or refined and directly +revealed kernel internal details. These knobs got exposed to +individual applications through the ill-defined delegation mechanism +effectively abusing cgroup as a shortcut to implementing public APIs +without going through the required scrutiny. + +This was painful for both userland and kernel. Userland ended up with +misbehaving and poorly abstracted interfaces and the kernel ended up exposing and +being locked into constructs inadvertently. + + +R-3. Competition Between Inner Nodes and Threads + +cgroup v1 allowed threads to be in any cgroups which created an +interesting problem where threads belonging to a parent cgroup and its +children cgroups competed for resources. This was nasty as two +different types of entities competed and there was no obvious way to +settle it. Different controllers did different things. + +The cpu controller considered threads and cgroups as equivalents and +mapped nice levels to cgroup weights. This worked for some cases but +fell flat when children wanted to be allocated specific ratios of CPU +cycles and the number of internal threads fluctuated - the ratios +constantly changed as the number of competing entities fluctuated. +There also were other issues.
The mapping from nice level to weight +wasn't obvious or universal, and there were various other knobs which +simply weren't available for threads. + +The io controller implicitly created a hidden leaf node for each +cgroup to host the threads. The hidden leaf had its own copies of all +the knobs with "leaf_" prefixed. While this allowed equivalent +control over internal threads, it was with serious drawbacks. It +always added an extra layer of nesting which wouldn't be necessary +otherwise, made the interface messy and significantly complicated the +implementation. + +The memory controller didn't have a way to control what happened +between internal tasks and child cgroups and the behavior was not +clearly defined. There were attempts to add ad-hoc behaviors and +knobs to tailor the behavior to specific workloads which would have +led to problems extremely difficult to resolve in the long term. + +Multiple controllers struggled with internal tasks and came up with +different ways to deal with it; unfortunately, all the approaches were +severely flawed and, furthermore, the widely different behaviors +made cgroup as a whole highly inconsistent. + +This clearly is a problem which needs to be addressed from cgroup core +in a uniform way. + + +R-4. Other Interface Issues + +cgroup v1 grew without oversight and developed a large number of +idiosyncrasies and inconsistencies. One issue on the cgroup core side +was how an empty cgroup was notified - a userland helper binary was +forked and executed for each event. The event delivery wasn't +recursive or delegatable. The limitations of the mechanism also led +to in-kernel event delivery filtering mechanism further complicating +the interface. + +Controller interfaces were problematic too. An extreme example is +controllers completely ignoring hierarchical organization and treating +all cgroups as if they were all located directly under the root +cgroup. 
Some controllers exposed a large amount of inconsistent +implementation details to userland. + +There also was no consistency across controllers. When a new cgroup +was created, some controllers defaulted to not imposing extra +restrictions while others disallowed any resource usage until +explicitly configured. Configuration knobs for the same type of +control used widely differing naming schemes and formats. Statistics +and information knobs were named arbitrarily and used different +formats and units even in the same controller. + +cgroup v2 establishes common conventions where appropriate and updates +controllers so that they expose minimal and consistent interfaces. + + +R-5. Controller Issues and Remedies + +R-5-1. Memory + +The original lower boundary, the soft limit, is defined as a limit +that is per default unset. As a result, the set of cgroups that +global reclaim prefers is opt-in, rather than opt-out. The costs for +optimizing these mostly negative lookups are so high that the +implementation, despite its enormous size, does not even provide the +basic desirable behavior. First off, the soft limit has no +hierarchical meaning. All configured groups are organized in a global +rbtree and treated like equal peers, regardless where they are located +in the hierarchy. This makes subtree delegation impossible. Second, +the soft limit reclaim pass is so aggressive that it not just +introduces high allocation latencies into the system, but also impacts +system performance due to overreclaim, to the point where the feature +becomes self-defeating. + +The memory.low boundary on the other hand is a top-down allocated +reserve. A cgroup enjoys reclaim protection when it and all its +ancestors are below their low boundaries, which makes delegation of +subtrees possible. Secondly, new cgroups have no reserve per default +and in the common case most cgroups are eligible for the preferred +reclaim pass. 
This allows the new low boundary to be efficiently +implemented with just a minor addition to the generic reclaim code, +without the need for out-of-band data structures and reclaim passes. +Because the generic reclaim code considers all cgroups except for the +ones running low in the preferred first reclaim pass, overreclaim of +individual groups is eliminated as well, resulting in much better +overall workload performance. + +The original high boundary, the hard limit, is defined as a strict +limit that can not budge, even if the OOM killer has to be called. +But this generally goes against the goal of making the most out of the +available memory. The memory consumption of workloads varies during +runtime, and that requires users to overcommit. But doing that with a +strict upper limit requires either a fairly accurate prediction of the +working set size or adding slack to the limit. Since working set size +estimation is hard and error prone, and getting it wrong results in +OOM kills, most users tend to err on the side of a looser limit and +end up wasting precious resources. + +The memory.high boundary on the other hand can be set much more +conservatively. When hit, it throttles allocations by forcing them +into direct reclaim to work off the excess, but it never invokes the +OOM killer. As a result, a high boundary that is chosen too +aggressively will not terminate the processes, but instead it will +lead to gradual performance degradation. The user can monitor this +and make corrections until the minimal memory footprint that still +gives acceptable performance is found. + +In extreme cases, with many concurrent allocations and a complete +breakdown of reclaim progress within the group, the high boundary can +be exceeded. But even then it's mostly better to satisfy the +allocation from the slack available in other groups or the rest of the +system than killing the group. 
Otherwise, memory.max is there to +limit this type of spillover and ultimately contain buggy or even +malicious applications. + +The combined memory+swap accounting and limiting is replaced by real +control over swap space. + +The main argument for a combined memory+swap facility in the original +cgroup design was that global or parental pressure would always be +able to swap all anonymous memory of a child group, regardless of the +child's own (possibly untrusted) configuration. However, untrusted +groups can sabotage swapping by other means - such as referencing its +anonymous memory in a tight loop - and an admin can not assume full +swappability when overcommitting untrusted jobs. + +For trusted jobs, on the other hand, a combined counter is not an +intuitive userspace interface, and it flies in the face of the idea +that cgroup controllers should account and limit specific physical +resources. Swap space is a resource like all others in the system, +and that's why unified hierarchy allows distributing it separately. diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroups/00-INDEX deleted file mode 100644 index 3f5a40f57..000000000 --- a/Documentation/cgroups/00-INDEX +++ /dev/null @@ -1,30 +0,0 @@ -00-INDEX - - this file -blkio-controller.txt - - Description for Block IO Controller, implementation and usage details. -cgroups.txt - - Control Groups definition, implementation details, examples and API. -cpuacct.txt - - CPU Accounting Controller; account CPU usage for groups of tasks. -cpusets.txt - - documents the cpusets feature; assign CPUs and Mem to a set of tasks. -devices.txt - - Device Whitelist Controller; description, interface and security. -freezer-subsystem.txt - - checkpointing; rationale to not use signals, interface. -hugetlb.txt - - HugeTLB Controller implementation and usage details. -memcg_test.txt - - Memory Resource Controller; implementation details. 
-memory.txt - - Memory Resource Controller; design, accounting, interface, testing. -net_cls.txt - - Network classifier cgroups details and usages. -net_prio.txt - - Network priority cgroups details and usages. -pids.txt - - Process number cgroups details and usages. -resource_counter.txt - - Resource Counter API. -unified-hierarchy.txt - - Description the new/next cgroup interface. diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt deleted file mode 100644 index 52fa9f353..000000000 --- a/Documentation/cgroups/blkio-controller.txt +++ /dev/null @@ -1,455 +0,0 @@ - Block IO Controller - =================== -Overview -======== -cgroup subsys "blkio" implements the block io controller. There seems to be -a need of various kinds of IO control policies (like proportional BW, max BW) -both at leaf nodes as well as at intermediate nodes in a storage hierarchy. -Plan is to use the same cgroup based management interface for blkio controller -and based on user options switch IO policies in the background. - -Currently two IO control policies are implemented. First one is proportional -weight time based division of disk policy. It is implemented in CFQ. Hence -this policy takes effect only on leaf nodes when CFQ is being used. The second -one is throttling policy which can be used to specify upper IO rate limits -on devices. This policy is implemented in generic block layer and can be -used on leaf nodes as well as higher level logical devices like device mapper. - -HOWTO -===== -Proportional Weight division of bandwidth ------------------------------------------ -You can do a very simple testing of running two dd threads in two different -cgroups. Here is what you can do. - -- Enable Block IO controller - CONFIG_BLK_CGROUP=y - -- Enable group scheduling in CFQ - CONFIG_CFQ_GROUP_IOSCHED=y - -- Compile and boot into kernel and mount IO controller (blkio); see - cgroups.txt, Why are cgroups needed?. 
- - mount -t tmpfs cgroup_root /sys/fs/cgroup - mkdir /sys/fs/cgroup/blkio - mount -t cgroup -o blkio none /sys/fs/cgroup/blkio - -- Create two cgroups - mkdir -p /sys/fs/cgroup/blkio/test1/ /sys/fs/cgroup/blkio/test2 - -- Set weights of group test1 and test2 - echo 1000 > /sys/fs/cgroup/blkio/test1/blkio.weight - echo 500 > /sys/fs/cgroup/blkio/test2/blkio.weight - -- Create two same size files (say 512MB each) on same disk (file1, file2) and - launch two dd threads in different cgroup to read those files. - - sync - echo 3 > /proc/sys/vm/drop_caches - - dd if=/mnt/sdb/zerofile1 of=/dev/null & - echo $! > /sys/fs/cgroup/blkio/test1/tasks - cat /sys/fs/cgroup/blkio/test1/tasks - - dd if=/mnt/sdb/zerofile2 of=/dev/null & - echo $! > /sys/fs/cgroup/blkio/test2/tasks - cat /sys/fs/cgroup/blkio/test2/tasks - -- At macro level, first dd should finish first. To get more precise data, keep - on looking at (with the help of script), at blkio.disk_time and - blkio.disk_sectors files of both test1 and test2 groups. This will tell how - much disk time (in milliseconds), each group got and how many sectors each - group dispatched to the disk. We provide fairness in terms of disk time, so - ideally io.disk_time of cgroups should be in proportion to the weight. - -Throttling/Upper Limit policy ------------------------------ -- Enable Block IO controller - CONFIG_BLK_CGROUP=y - -- Enable throttling in block layer - CONFIG_BLK_DEV_THROTTLING=y - -- Mount blkio controller (see cgroups.txt, Why are cgroups needed?) - mount -t cgroup -o blkio none /sys/fs/cgroup/blkio - -- Specify a bandwidth rate on particular device for root group. The format - for policy is ": ". - - echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device - - Above will put a limit of 1MB/second on reads happening for root group - on device having major/minor number 8:16. - -- Run dd to read a file and see if rate is throttled to 1MB/s or not. 
- - # dd if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 - # iflag=direct - 1024+0 records in - 1024+0 records out - 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s - - Limits for writes can be put using blkio.throttle.write_bps_device file. - -Hierarchical Cgroups -==================== - -Both CFQ and throttling implement hierarchy support; however, -throttling's hierarchy support is enabled iff "sane_behavior" is -enabled from cgroup side, which currently is a development option and -not publicly available. - -If somebody created a hierarchy like as follows. - - root - / \ - test1 test2 - | - test3 - -CFQ by default and throttling with "sane_behavior" will handle the -hierarchy correctly. For details on CFQ hierarchy support, refer to -Documentation/block/cfq-iosched.txt. For throttling, all limits apply -to the whole subtree while all statistics are local to the IOs -directly generated by tasks in that cgroup. - -Throttling without "sane_behavior" enabled from cgroup side will -practically treat all groups at same level as if it looks like the -following. - - pivot - / / \ \ - root test1 test2 test3 - -Various user visible config options -=================================== -CONFIG_BLK_CGROUP - - Block IO controller. - -CONFIG_DEBUG_BLK_CGROUP - - Debug help. Right now some additional stats file show up in cgroup - if this option is enabled. - -CONFIG_CFQ_GROUP_IOSCHED - - Enables group scheduling in CFQ. Currently only 1 level of group - creation is allowed. - -CONFIG_BLK_DEV_THROTTLING - - Enable block device throttling support in block layer. - -Details of cgroup files -======================= -Proportional weight policy files --------------------------------- -- blkio.weight - - Specifies per cgroup weight. This is default weight of the group - on all the devices until and unless overridden by per device rule. - (See blkio.weight_device). - Currently allowed range of weights is from 10 to 1000. 
- -- blkio.weight_device - - One can specify per cgroup per device rules using this interface. - These rules override the default value of group weight as specified - by blkio.weight. - - Following is the format. - - # echo dev_maj:dev_minor weight > blkio.weight_device - Configure weight=300 on /dev/sdb (8:16) in this cgroup - # echo 8:16 300 > blkio.weight_device - # cat blkio.weight_device - dev weight - 8:16 300 - - Configure weight=500 on /dev/sda (8:0) in this cgroup - # echo 8:0 500 > blkio.weight_device - # cat blkio.weight_device - dev weight - 8:0 500 - 8:16 300 - - Remove specific weight for /dev/sda in this cgroup - # echo 8:0 0 > blkio.weight_device - # cat blkio.weight_device - dev weight - 8:16 300 - -- blkio.leaf_weight[_device] - - Equivalents of blkio.weight[_device] for the purpose of - deciding how much weight tasks in the given cgroup has while - competing with the cgroup's child cgroups. For details, - please refer to Documentation/block/cfq-iosched.txt. - -- blkio.time - - disk time allocated to cgroup per device in milliseconds. First - two fields specify the major and minor number of the device and - third field specifies the disk time allocated to group in - milliseconds. - -- blkio.sectors - - number of sectors transferred to/from disk by the group. First - two fields specify the major and minor number of the device and - third field specifies the number of sectors transferred by the - group to/from the device. - -- blkio.io_service_bytes - - Number of bytes transferred to/from the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of bytes. - -- blkio.io_serviced - - Number of IOs (bio) issued to the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. 
First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of IOs. - -- blkio.io_service_time - - Total amount of time between request dispatch and request completion - for the IOs done by this cgroup. This is in nanoseconds to make it - meaningful for flash devices too. For devices with queue depth of 1, - this time represents the actual service time. When queue_depth > 1, - that is no longer true as requests may be served out of order. This - may cause the service time for a given IO to include the service time - of multiple IOs when served out of order which may result in total - io_service_time > actual time elapsed. This time is further divided by - the type of operation - read or write, sync or async. First two fields - specify the major and minor number of the device, third field - specifies the operation type and the fourth field specifies the - io_service_time in ns. - -- blkio.io_wait_time - - Total amount of time the IOs for this cgroup spent waiting in the - scheduler queues for service. This can be greater than the total time - elapsed since it is cumulative io_wait_time for all IOs. It is not a - measure of total time the cgroup spent waiting but rather a measure of - the wait_time for its individual IOs. For devices with queue_depth > 1 - this metric does not include the time spent waiting for service once - the IO is dispatched to the device but till it actually gets serviced - (there might be a time lag here due to re-ordering of requests by the - device). This is in nanoseconds to make it meaningful for flash - devices too. This time is further divided by the type of operation - - read or write, sync or async. First two fields specify the major and - minor number of the device, third field specifies the operation type - and the fourth field specifies the io_wait_time in ns. 
- -- blkio.io_merged - - Total number of bios/requests merged into requests belonging to this - cgroup. This is further divided by the type of operation - read or - write, sync or async. - -- blkio.io_queued - - Total number of requests queued up at any given instant for this - cgroup. This is further divided by the type of operation - read or - write, sync or async. - -- blkio.avg_queue_size - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. - The average queue size for this cgroup over the entire time of this - cgroup's existence. Queue size samples are taken each time one of the - queues of this cgroup gets a timeslice. - -- blkio.group_wait_time - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. - This is the amount of time the cgroup had to wait since it became busy - (i.e., went from 0 to 1 request queued) to get a timeslice for one of - its queues. This is different from the io_wait_time which is the - cumulative total of the amount of time spent by each IO in that cgroup - waiting in the scheduler queue. This is in nanoseconds. If this is - read when the cgroup is in a waiting (for timeslice) state, the stat - will only report the group_wait_time accumulated till the last time it - got a timeslice and will not include the current delta. - -- blkio.empty_time - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. - This is the amount of time a cgroup spends without any pending - requests when not being served, i.e., it does not include any time - spent idling for one of the queues of the cgroup. This is in - nanoseconds. If this is read when the cgroup is in an empty state, - the stat will only report the empty_time accumulated till the last - time it had a pending request and will not include the current delta. - -- blkio.idle_time - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. 
- This is the amount of time spent by the IO scheduler idling for a - given cgroup in anticipation of a better request than the existing ones - from other queues/cgroups. This is in nanoseconds. If this is read - when the cgroup is in an idling state, the stat will only report the - idle_time accumulated till the last idle period and will not include - the current delta. - -- blkio.dequeue - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This - gives the statistics about how many a times a group was dequeued - from service tree of the device. First two fields specify the major - and minor number of the device and third field specifies the number - of times a group was dequeued from a particular device. - -- blkio.*_recursive - - Recursive version of various stats. These files show the - same information as their non-recursive counterparts but - include stats from all the descendant cgroups. - -Throttling/Upper limit policy files ------------------------------------ -- blkio.throttle.read_bps_device - - Specifies upper limit on READ rate from the device. IO rate is - specified in bytes per second. Rules are per device. Following is - the format. - - echo ": " > /cgrp/blkio.throttle.read_bps_device - -- blkio.throttle.write_bps_device - - Specifies upper limit on WRITE rate to the device. IO rate is - specified in bytes per second. Rules are per device. Following is - the format. - - echo ": " > /cgrp/blkio.throttle.write_bps_device - -- blkio.throttle.read_iops_device - - Specifies upper limit on READ rate from the device. IO rate is - specified in IO per second. Rules are per device. Following is - the format. - - echo ": " > /cgrp/blkio.throttle.read_iops_device - -- blkio.throttle.write_iops_device - - Specifies upper limit on WRITE rate to the device. IO rate is - specified in io per second. Rules are per device. Following is - the format. 
- - echo ": " > /cgrp/blkio.throttle.write_iops_device - -Note: If both BW and IOPS rules are specified for a device, then IO is - subjected to both the constraints. - -- blkio.throttle.io_serviced - - Number of IOs (bio) issued to the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of IOs. - -- blkio.throttle.io_service_bytes - - Number of bytes transferred to/from the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of bytes. - -Common files among various policies ------------------------------------ -- blkio.reset_stats - - Writing an int to this file will result in resetting all the stats - for that cgroup. - -CFQ sysfs tunable -================= -/sys/block//queue/iosched/slice_idle ------------------------------------------- -On a faster hardware CFQ can be slow, especially with sequential workload. -This happens because CFQ idles on a single queue and single queue might not -drive deeper request queue depths to keep the storage busy. In such scenarios -one can try setting slice_idle=0 and that would switch CFQ to IOPS -(IO operations per second) mode on NCQ supporting hardware. - -That means CFQ will not idle between cfq queues of a cfq group and hence be -able to driver higher queue depth and achieve better throughput. That also -means that cfq provides fairness among groups in terms of IOPS and not in -terms of disk time. - -/sys/block//queue/iosched/group_idle ------------------------------------------- -If one disables idling on individual cfq queues and cfq service trees by -setting slice_idle=0, group_idle kicks in. 
That means CFQ will still idle -on the group in an attempt to provide fairness among groups. - -By default group_idle is same as slice_idle and does not do anything if -slice_idle is enabled. - -One can experience an overall throughput drop if you have created multiple -groups and put applications in that group which are not driving enough -IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle -on individual groups and throughput should improve. - -Writeback -========= - -Page cache is dirtied through buffered writes and shared mmaps and -written asynchronously to the backing filesystem by the writeback -mechanism. Writeback sits between the memory and IO domains and -regulates the proportion of dirty memory by balancing dirtying and -write IOs. - -On traditional cgroup hierarchies, relationships between different -controllers cannot be established making it impossible for writeback -to operate accounting for cgroup resource restrictions and all -writeback IOs are attributed to the root cgroup. - -If both the blkio and memory controllers are used on the v2 hierarchy -and the filesystem supports cgroup writeback, writeback operations -correctly follow the resource restrictions imposed by both memory and -blkio controllers. - -Writeback examines both system-wide and per-cgroup dirty memory status -and enforces the more restrictive of the two. Also, writeback control -parameters which are absolute values - vm.dirty_bytes and -vm.dirty_background_bytes - are distributed across cgroups according -to their current writeback bandwidth. - -There's a peculiarity stemming from the discrepancy in ownership -granularity between memory controller and writeback. While memory -controller tracks ownership per page, writeback operates on inode -basis. 
cgroup writeback bridges the gap by tracking ownership by -inode but migrating ownership if too many foreign pages, pages which -don't match the current inode ownership, have been encountered while -writing back the inode. - -This is a conscious design choice as writeback operations are -inherently tied to inodes making strictly following page ownership -complicated and inefficient. The only use case which suffers from -this compromise is multiple cgroups concurrently dirtying disjoint -regions of the same inode, which is an unlikely use case and decided -to be unsupported. Note that as memory controller assigns page -ownership on the first use and doesn't update it until the page is -released, even if cgroup writeback strictly follows page ownership, -multiple cgroups dirtying overlapping areas wouldn't work as expected. -In general, write-sharing an inode across multiple cgroups is not well -supported. - -Filesystem support for cgroup writeback ---------------------------------------- - -A filesystem can make writeback IOs cgroup-aware by updating -address_space_operations->writepage[s]() to annotate bio's using the -following two functions. - -* wbc_init_bio(@wbc, @bio) - - Should be called for each bio carrying writeback data and associates - the bio with the inode's owner cgroup. Can be called anytime - between bio allocation and submission. - -* wbc_account_io(@wbc, @page, @bytes) - - Should be called for each data segment being written out. While - this function doesn't care exactly when it's called during the - writeback session, it's the easiest and most natural to call it as - data segments are added to a bio. - -With writeback bio's annotated, cgroup support can be enabled per -super_block by setting MS_CGROUPWB in ->s_flags. This allows for -selective disabling of cgroup writeback support which is helpful when -certain filesystem features, e.g. journaled data mode, are -incompatible. - -wbc_init_bio() binds the specified bio to its cgroup. 
Depending on -the configuration, the bio may be executed at a lower priority and if -the writeback session is holding shared resources, e.g. a journal -entry, may lead to priority inversion. There is no one easy solution -for the problem. Filesystems can try to work around specific problem -cases by skipping wbc_init_bio() or using bio_associate_blkcg() -directly. diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt deleted file mode 100644 index c6256ae98..000000000 --- a/Documentation/cgroups/cgroups.txt +++ /dev/null @@ -1,682 +0,0 @@ - CGROUPS - ------- - -Written by Paul Menage based on -Documentation/cgroups/cpusets.txt - -Original copyright statements from cpusets.txt: -Portions Copyright (C) 2004 BULL SA. -Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. -Modified by Paul Jackson -Modified by Christoph Lameter - -CONTENTS: -========= - -1. Control Groups - 1.1 What are cgroups ? - 1.2 Why are cgroups needed ? - 1.3 How are cgroups implemented ? - 1.4 What does notify_on_release do ? - 1.5 What does clone_children do ? - 1.6 How do I use cgroups ? -2. Usage Examples and Syntax - 2.1 Basic Usage - 2.2 Attaching processes - 2.3 Mounting hierarchies by name -3. Kernel API - 3.1 Overview - 3.2 Synchronization - 3.3 Subsystem API -4. Extended attributes usage -5. Questions - -1. Control Groups -================= - -1.1 What are cgroups ? ----------------------- - -Control Groups provide a mechanism for aggregating/partitioning sets of -tasks, and all their future children, into hierarchical groups with -specialized behaviour. - -Definitions: - -A *cgroup* associates a set of tasks with a set of parameters for one -or more subsystems. - -A *subsystem* is a module that makes use of the task grouping -facilities provided by cgroups to treat groups of tasks in -particular ways. 
A subsystem is typically a "resource controller" that -schedules a resource or applies per-cgroup limits, but it may be -anything that wants to act on a group of processes, e.g. a -virtualization subsystem. - -A *hierarchy* is a set of cgroups arranged in a tree, such that -every task in the system is in exactly one of the cgroups in the -hierarchy, and a set of subsystems; each subsystem has system-specific -state attached to each cgroup in the hierarchy. Each hierarchy has -an instance of the cgroup virtual filesystem associated with it. - -At any one time there may be multiple active hierarchies of task -cgroups. Each hierarchy is a partition of all tasks in the system. - -User-level code may create and destroy cgroups by name in an -instance of the cgroup virtual file system, specify and query to -which cgroup a task is assigned, and list the task PIDs assigned to -a cgroup. Those creations and assignments only affect the hierarchy -associated with that instance of the cgroup file system. - -On their own, the only use for cgroups is for simple job -tracking. The intention is that other subsystems hook into the generic -cgroup support to provide new attributes for cgroups, such as -accounting/limiting the resources which processes in a cgroup can -access. For example, cpusets (see Documentation/cgroups/cpusets.txt) allow -you to associate a set of CPUs and a set of memory nodes with the -tasks in each cgroup. - -1.2 Why are cgroups needed ? ----------------------------- - -There are multiple efforts to provide process aggregations in the -Linux kernel, mainly for resource-tracking purposes. Such efforts -include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server -namespaces. These all require the basic notion of a -grouping/partitioning of processes, with newly forked processes ending -up in the same group (cgroup) as their parent process. 
- -The kernel cgroup patch provides the minimum essential kernel -mechanisms required to efficiently implement such groups. It has -minimal impact on the system fast paths, and provides hooks for -specific subsystems such as cpusets to provide additional behaviour as -desired. - -Multiple hierarchy support is provided to allow for situations where -the division of tasks into cgroups is distinctly different for -different subsystems - having parallel hierarchies allows each -hierarchy to be a natural division of tasks, without having to handle -complex combinations of tasks that would be present if several -unrelated subsystems needed to be forced into the same tree of -cgroups. - -At one extreme, each resource controller or subsystem could be in a -separate hierarchy; at the other extreme, all subsystems -would be attached to the same hierarchy. - -As an example of a scenario (originally proposed by vatsa@in.ibm.com) -that can benefit from multiple hierarchies, consider a large -university server with various users - students, professors, system -tasks etc. The resource planning for this server could be along the -following lines: - - CPU : "Top cpuset" - / \ - CPUSet1 CPUSet2 - | | - (Professors) (Students) - - In addition (system tasks) are attached to topcpuset (so - that they can run anywhere) with a limit of 20% - - Memory : Professors (50%), Students (30%), system (20%) - - Disk : Professors (50%), Students (30%), system (20%) - - Network : WWW browsing (20%), Network File System (60%), others (20%) - / \ - Professors (15%) students (5%) - -Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes -into the NFS network class. - -At the same time Firefox/Lynx will share an appropriate CPU/Memory class -depending on who launched it (prof/student). 
- -With the ability to classify tasks differently for different resources -(by putting those resource subsystems in different hierarchies), -the admin can easily set up a script which receives exec notifications -and depending on who is launching the browser he can - - # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks - -With only a single hierarchy, he now would potentially have to create -a separate cgroup for every browser launched and associate it with -appropriate network and other resource class. This may lead to -proliferation of such cgroups. - -Also let's say that the administrator would like to give enhanced network -access temporarily to a student's browser (since it is night and the user -wants to do online gaming :)) OR give one of the student's simulation -apps enhanced CPU power. - -With ability to write PIDs directly to resource classes, it's just a -matter of: - - # echo pid > /sys/fs/cgroup/network/<new_class>/tasks - (after some time) - # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks - -Without this ability, the administrator would have to split the cgroup into -multiple separate ones and then associate the new cgroups with the -new resource classes. - - - -1.3 How are cgroups implemented ? ---------------------------------- - -Control Groups extends the kernel as follows: - - - Each task in the system has a reference-counted pointer to a - css_set. - - - A css_set contains a set of reference-counted pointers to - cgroup_subsys_state objects, one for each cgroup subsystem - registered in the system. There is no direct link from a task to - the cgroup of which it's a member in each hierarchy, but this - can be determined by following pointers through the - cgroup_subsys_state objects. This is because accessing the - subsystem state is something that's expected to happen frequently - and in performance-critical code, whereas operations that require a - task's actual cgroup assignments (in particular, moving between - cgroups) are less common.
A linked list runs through the cg_list - field of each task_struct using the css_set, anchored at - css_set->tasks. - - - A cgroup hierarchy filesystem can be mounted for browsing and - manipulation from user space. - - - You can list all the tasks (by PID) attached to any cgroup. - -The implementation of cgroups requires a few, simple hooks -into the rest of the kernel, none in performance-critical paths: - - - in init/main.c, to initialize the root cgroups and initial - css_set at system boot. - - - in fork and exit, to attach and detach a task from its css_set. - -In addition, a new file system of type "cgroup" may be mounted, to -enable browsing and modifying the cgroups presently known to the -kernel. When mounting a cgroup hierarchy, you may specify a -comma-separated list of subsystems to mount as the filesystem mount -options. By default, mounting the cgroup filesystem attempts to -mount a hierarchy containing all registered subsystems. - -If an active hierarchy with exactly the same set of subsystems already -exists, it will be reused for the new mount. If no existing hierarchy -matches, and any of the requested subsystems are in use in an existing -hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy -is activated, associated with the requested subsystems. - -It's not currently possible to bind a new subsystem to an active -cgroup hierarchy, or to unbind a subsystem from an active cgroup -hierarchy. This may be possible in future, but is fraught with nasty -error-recovery issues. - -When a cgroup filesystem is unmounted, if there are any -child cgroups created below the top-level cgroup, that hierarchy -will remain active even though unmounted; if there are no -child cgroups then the hierarchy will be deactivated. - -No new system calls are added for cgroups - all support for -querying and modifying cgroups is via this cgroup file system. 
- -Each task under /proc has an added file named 'cgroup' displaying, -for each active hierarchy, the subsystem names and the cgroup name -as the path relative to the root of the cgroup file system. - -Each cgroup is represented by a directory in the cgroup file system -containing the following files describing that cgroup: - - - tasks: list of tasks (by PID) attached to that cgroup. This list - is not guaranteed to be sorted. Writing a thread ID into this file - moves the thread into this cgroup. - - cgroup.procs: list of thread group IDs in the cgroup. This list is - not guaranteed to be sorted or free of duplicate TGIDs, and userspace - should sort/uniquify the list if this property is required. - Writing a thread group ID into this file moves all threads in that - group into this cgroup. - - notify_on_release flag: run the release agent on exit? - - release_agent: the path to use for release notifications (this file - exists in the top cgroup only) - -Other subsystems such as cpusets may add additional files in each -cgroup dir. - -New cgroups are created using the mkdir system call or shell -command. The properties of a cgroup, such as its flags, are -modified by writing to the appropriate file in that cgroups -directory, as listed above. - -The named hierarchical structure of nested cgroups allows partitioning -a large system into nested, dynamically changeable, "soft-partitions". - -The attachment of each task, automatically inherited at fork by any -children of that task, to a cgroup allows organizing the work load -on a system into related sets of tasks. A task may be re-attached to -any other cgroup, if allowed by the permissions on the necessary -cgroup file system directories. - -When a task is moved from one cgroup to another, it gets a new -css_set pointer - if there's an already existing css_set with the -desired collection of cgroups then that group is reused, otherwise a new -css_set is allocated. 
The appropriate existing css_set is located by -looking into a hash table. - -To allow access from a cgroup to the css_sets (and hence tasks) -that comprise it, a set of cg_cgroup_link objects form a lattice; -each cg_cgroup_link is linked into a list of cg_cgroup_links for -a single cgroup on its cgrp_link_list field, and a list of -cg_cgroup_links for a single css_set on its cg_link_list. - -Thus the set of tasks in a cgroup can be listed by iterating over -each css_set that references the cgroup, and sub-iterating over -each css_set's task set. - -The use of a Linux virtual file system (vfs) to represent the -cgroup hierarchy provides for a familiar permission and name space -for cgroups, with a minimum of additional kernel code. - -1.4 What does notify_on_release do ? ------------------------------------- - -If the notify_on_release flag is enabled (1) in a cgroup, then -whenever the last task in the cgroup leaves (exits or attaches to -some other cgroup) and the last child cgroup of that cgroup -is removed, then the kernel runs the command specified by the contents -of the "release_agent" file in that hierarchy's root directory, -supplying the pathname (relative to the mount point of the cgroup -file system) of the abandoned cgroup. This enables automatic -removal of abandoned cgroups. The default value of -notify_on_release in the root cgroup at system boot is disabled -(0). The default value of other cgroups at creation is the current -value of their parents' notify_on_release settings. The default value of -a cgroup hierarchy's release_agent path is empty. - -1.5 What does clone_children do ? ---------------------------------- - -This flag only affects the cpuset controller. If the clone_children -flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its -configuration from the parent during initialization. - -1.6 How do I use cgroups ? 
--------------------------- - -To start a new job that is to be contained within a cgroup, using -the "cpuset" cgroup subsystem, the steps are something like: - - 1) mount -t tmpfs cgroup_root /sys/fs/cgroup - 2) mkdir /sys/fs/cgroup/cpuset - 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset - 4) Create the new cgroup by doing mkdir's and write's (or echo's) in - the /sys/fs/cgroup/cpuset virtual file system. - 5) Start a task that will be the "founding father" of the new job. - 6) Attach that task to the new cgroup by writing its PID to the - /sys/fs/cgroup/cpuset tasks file for that cgroup. - 7) fork, exec or clone the job tasks from this founding father task. - -For example, the following sequence of commands will setup a cgroup -named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, -and then start a subshell 'sh' in that cgroup: - - mount -t tmpfs cgroup_root /sys/fs/cgroup - mkdir /sys/fs/cgroup/cpuset - mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset - cd /sys/fs/cgroup/cpuset - mkdir Charlie - cd Charlie - /bin/echo 2-3 > cpuset.cpus - /bin/echo 1 > cpuset.mems - /bin/echo $$ > tasks - sh - # The subshell 'sh' is now running in cgroup Charlie - # The next line should display '/Charlie' - cat /proc/self/cgroup - -2. Usage Examples and Syntax -============================ - -2.1 Basic Usage ---------------- - -Creating, modifying, using cgroups can be done through the cgroup -virtual filesystem. - -To mount a cgroup hierarchy with all available subsystems, type: -# mount -t cgroup xxx /sys/fs/cgroup - -The "xxx" is not interpreted by the cgroup code, but will appear in -/proc/mounts so may be any useful identifying string that you like. - -Note: Some subsystems do not work without some user input first. For instance, -if cpusets are enabled the user will have to populate the cpus and mems files -for each new cgroup created before that group can be used. - -As explained in section `1.2 Why are cgroups needed?' 
 you should create -different hierarchies of cgroups for each single resource or group of -resources you want to control. Therefore, you should mount a tmpfs on -/sys/fs/cgroup and create directories for each cgroup resource or resource -group. - -# mount -t tmpfs cgroup_root /sys/fs/cgroup -# mkdir /sys/fs/cgroup/rg1 - -To mount a cgroup hierarchy with just the cpuset and memory -subsystems, type: -# mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1 - -While remounting cgroups is currently supported, it is not recommended -to use it. Remounting allows changing bound subsystems and -release_agent. Rebinding is hardly useful as it only works when the -hierarchy is empty and release_agent itself should be replaced with -conventional fsnotify. The support for remounting will be removed in -the future. - -To specify a hierarchy's release_agent: -# mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \ - xxx /sys/fs/cgroup/rg1 - -Note that specifying 'release_agent' more than once will return failure. - -Note that changing the set of subsystems is currently only supported -when the hierarchy consists of a single (root) cgroup. Supporting -the ability to arbitrarily bind/unbind subsystems from an existing -cgroup hierarchy is intended to be implemented in the future. - -Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the -tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1 -is the cgroup that holds the whole system. - -If you want to change the value of release_agent: -# echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent - -It can also be changed via remount. - -If you want to create a new cgroup under /sys/fs/cgroup/rg1: -# cd /sys/fs/cgroup/rg1 -# mkdir my_cgroup - -Now you want to do something with this cgroup.
-# cd my_cgroup - -In this directory you can find several files: -# ls -cgroup.procs notify_on_release tasks -(plus whatever files added by the attached subsystems) - -Now attach your shell to this cgroup: -# /bin/echo $$ > tasks - -You can also create cgroups inside your cgroup by using mkdir in this -directory. -# mkdir my_sub_cs - -To remove a cgroup, just use rmdir: -# rmdir my_sub_cs - -This will fail if the cgroup is in use (has cgroups inside, or -has processes attached, or is held alive by other subsystem-specific -reference). - -2.2 Attaching processes ------------------------ - -# /bin/echo PID > tasks - -Note that it is PID, not PIDs. You can only attach ONE task at a time. -If you have several tasks to attach, you have to do it one after another: - -# /bin/echo PID1 > tasks -# /bin/echo PID2 > tasks - ... -# /bin/echo PIDn > tasks - -You can attach the current shell task by echoing 0: - -# echo 0 > tasks - -You can use the cgroup.procs file instead of the tasks file to move all -threads in a threadgroup at once. Echoing the PID of any task in a -threadgroup to cgroup.procs causes all tasks in that threadgroup to be -attached to the cgroup. Writing 0 to cgroup.procs moves all tasks -in the writing task's threadgroup. - -Note: Since every task is always a member of exactly one cgroup in each -mounted hierarchy, to remove a task from its current cgroup you must -move it into a new cgroup (possibly the root cgroup) by writing to the -new cgroup's tasks file. - -Note: Due to some restrictions enforced by some cgroup subsystems, moving -a process to another cgroup can fail. - -2.3 Mounting hierarchies by name --------------------------------- - -Passing the name= option when mounting a cgroups hierarchy -associates the given name with the hierarchy. This can be used when -mounting a pre-existing hierarchy, in order to refer to it by name -rather than by its set of active subsystems. Each hierarchy is either -nameless, or has a unique name. 
- -The name should match [\w.-]+ - -When passing a name= option for a new hierarchy, you need to -specify subsystems manually; the legacy behaviour of mounting all -subsystems when none are explicitly specified is not supported when -you give a subsystem a name. - -The name of the subsystem appears as part of the hierarchy description -in /proc/mounts and /proc/<pid>/cgroups. - - -3. Kernel API -============= - -3.1 Overview ------------- - -Each kernel subsystem that wants to hook into the generic cgroup -system needs to create a cgroup_subsys object. This contains -various methods, which are callbacks from the cgroup system, along -with a subsystem ID which will be assigned by the cgroup system. - -Other fields in the cgroup_subsys object include: - -- subsys_id: a unique array index for the subsystem, indicating which - entry in cgroup->subsys[] this subsystem should be managing. - -- name: should be initialized to a unique subsystem name. Should be - no longer than MAX_CGROUP_TYPE_NAMELEN. - -- early_init: indicate if the subsystem needs early initialization - at system boot. - -Each cgroup object created by the system has an array of pointers, -indexed by subsystem ID; this pointer is entirely managed by the -subsystem; the generic cgroup code will never touch this pointer. - -3.2 Synchronization -------------------- - -There is a global mutex, cgroup_mutex, used by the cgroup -system. This should be taken by anything that wants to modify a -cgroup. It may also be taken to prevent cgroups from being -modified, but more specific locks may be more appropriate in that -situation. - -See kernel/cgroup.c for more details. - -Subsystems can take/release the cgroup_mutex via the functions -cgroup_lock()/cgroup_unlock().
- -Accessing a task's cgroup pointer may be done in the following ways: -- while holding cgroup_mutex -- while holding the task's alloc_lock (via task_lock()) -- inside an rcu_read_lock() section via rcu_dereference() - -3.3 Subsystem API ------------------ - -Each subsystem should: - -- add an entry in linux/cgroup_subsys.h -- define a cgroup_subsys object called <name>_subsys - -If a subsystem can be compiled as a module, it should also have in its -module initcall a call to cgroup_load_subsys(), and in its exitcall a -call to cgroup_unload_subsys(). It should also set its_subsys.module = -THIS_MODULE in its .c file. - -Each subsystem may export the following methods. The only mandatory -methods are css_alloc/free. Any others that are null are presumed to -be successful no-ops. - -struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp) -(cgroup_mutex held by caller) - -Called to allocate a subsystem state object for a cgroup. The -subsystem should allocate its subsystem state object for the passed -cgroup, returning a pointer to the new object on success or a -ERR_PTR() value. On success, the subsystem pointer should point to -a structure of type cgroup_subsys_state (typically embedded in a -larger subsystem-specific object), which will be initialized by the -cgroup system. Note that this will be called at initialization to -create the root subsystem state for this subsystem; this case can be -identified by the passed cgroup object having a NULL parent (since -it's the root of the hierarchy) and may be an appropriate place for -initialization code. - -int css_online(struct cgroup *cgrp) -(cgroup_mutex held by caller) - -Called after @cgrp successfully completed all allocations and made -visible to cgroup_for_each_child/descendant_*() iterators. The -subsystem may choose to fail creation by returning -errno. This -callback can be used to implement reliable state sharing and -propagation along the hierarchy.
See the comment on -cgroup_for_each_descendant_pre() for details. - -void css_offline(struct cgroup *cgrp); -(cgroup_mutex held by caller) - -This is the counterpart of css_online() and called iff css_online() -has succeeded on @cgrp. This signifies the beginning of the end of -@cgrp. @cgrp is being removed and the subsystem should start dropping -all references it's holding on @cgrp. When all references are dropped, -cgroup removal will proceed to the next step - css_free(). After this -callback, @cgrp should be considered dead to the subsystem. - -void css_free(struct cgroup *cgrp) -(cgroup_mutex held by caller) - -The cgroup system is about to free @cgrp; the subsystem should free -its subsystem state object. By the time this method is called, @cgrp -is completely unused; @cgrp->parent is still valid. (Note - can also -be called for a newly-created cgroup if an error occurs after this -subsystem's create() method has been called for the new cgroup). - -int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -(cgroup_mutex held by caller) - -Called prior to moving one or more tasks into a cgroup; if the -subsystem returns an error, this will abort the attach operation. -@tset contains the tasks to be attached and is guaranteed to have at -least one task in it. - -If there are multiple tasks in the taskset, then: - - it's guaranteed that all are from the same thread group - - @tset contains all tasks from the thread group whether or not - they're switching cgroups - - the first task is the leader - -Each @tset entry also contains the task's old cgroup and tasks which -aren't switching cgroup can be skipped easily using the -cgroup_taskset_for_each() iterator. Note that this isn't called on a -fork. If this method returns 0 (success) then this should remain valid -while the caller holds cgroup_mutex and it is ensured that either -attach() or cancel_attach() will be called in future. 
- -void css_reset(struct cgroup_subsys_state *css) -(cgroup_mutex held by caller) - -An optional operation which should restore @css's configuration to the -initial state. This is currently only used on the unified hierarchy -when a subsystem is disabled on a cgroup through -"cgroup.subtree_control" but should remain enabled because other -subsystems depend on it. cgroup core makes such a css invisible by -removing the associated interface files and invokes this callback so -that the hidden subsystem can return to the initial neutral state. -This prevents unexpected resource control from a hidden css and -ensures that the configuration is in the initial state when it is made -visible again later. - -void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -(cgroup_mutex held by caller) - -Called when a task attach operation has failed after can_attach() has succeeded. -A subsystem whose can_attach() has some side-effects should provide this -function, so that the subsystem can implement a rollback. If not, not necessary. -This will be called only about subsystems whose can_attach() operation have -succeeded. The parameters are identical to can_attach(). - -void attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -(cgroup_mutex held by caller) - -Called after the task has been attached to the cgroup, to allow any -post-attachment activity that requires memory allocations or blocking. -The parameters are identical to can_attach(). - -void fork(struct task_struct *task) - -Called when a task is forked into a cgroup. - -void exit(struct task_struct *task) - -Called during task exit. - -void free(struct task_struct *task) - -Called when the task_struct is freed. - -void bind(struct cgroup *root) -(cgroup_mutex held by caller) - -Called when a cgroup subsystem is rebound to a different hierarchy -and root cgroup. 
Currently this will only involve movement between -the default hierarchy (which never has sub-cgroups) and a hierarchy -that is being created/destroyed (and hence has no sub-cgroups). - -4. Extended attribute usage -=========================== - -cgroup filesystem supports certain types of extended attributes in its -directories and files. The current supported types are: - - Trusted (XATTR_TRUSTED) - - Security (XATTR_SECURITY) - -Both require CAP_SYS_ADMIN capability to set. - -Like in tmpfs, the extended attributes in cgroup filesystem are stored -using kernel memory and it's advised to keep the usage at minimum. This -is the reason why user defined extended attributes are not supported, since -any user can do it and there's no limit in the value size. - -The current known users for this feature are SELinux to limit cgroup usage -in containers and systemd for assorted meta data like main PID in a cgroup -(systemd creates a cgroup per service). - -5. Questions -============ - -Q: what's up with this '/bin/echo' ? -A: bash's builtin 'echo' command does not check calls to write() against - errors. If you use it in the cgroup file system, you won't be - able to tell whether a command succeeded or failed. - -Q: When I attach processes, only the first of the line gets really attached ! -A: We can only return one error code per call to write(). So you should also - put only ONE PID. - diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroups/cpuacct.txt deleted file mode 100644 index 9d73cc0ca..000000000 --- a/Documentation/cgroups/cpuacct.txt +++ /dev/null @@ -1,49 +0,0 @@ -CPU Accounting Controller -------------------------- - -The CPU accounting controller is used to group tasks using cgroups and -account the CPU usage of these groups of tasks. - -The CPU accounting controller supports multi-hierarchy groups. An accounting -group accumulates the CPU usage of all of its child groups and the tasks -directly present in its group. 
- -Accounting groups can be created by first mounting the cgroup filesystem. - -# mount -t cgroup -ocpuacct none /sys/fs/cgroup - -With the above step, the initial or the parent accounting group becomes -visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in -the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. -/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained -by this group which is essentially the CPU time obtained by all the tasks -in the system. - -New accounting groups can be created under the parent group /sys/fs/cgroup. - -# cd /sys/fs/cgroup -# mkdir g1 -# echo $$ > g1/tasks - -The above steps create a new group g1 and move the current shell -process (bash) into it. CPU time consumed by this bash and its children -can be obtained from g1/cpuacct.usage and the same is accumulated in -/sys/fs/cgroup/cpuacct.usage also. - -cpuacct.stat file lists a few statistics which further divide the -CPU time obtained by the cgroup into user and system times. Currently -the following statistics are supported: - -user: Time spent by tasks of the cgroup in user mode. -system: Time spent by tasks of the cgroup in kernel mode. - -user and system are in USER_HZ unit. - -cpuacct controller uses percpu_counter interface to collect user and -system times. This has two side effects: - -- It is theoretically possible to see wrong values for user and system times. - This is because percpu_counter_read() on 32bit systems isn't safe - against concurrent writes. -- It is possible to see slightly outdated values for user and system times - due to the batch processing nature of percpu_counter. diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt deleted file mode 100644 index fdf7dff3f..000000000 --- a/Documentation/cgroups/cpusets.txt +++ /dev/null @@ -1,839 +0,0 @@ - CPUSETS - ------- - -Copyright (C) 2004 BULL SA. 
-Written by Simon.Derr@bull.net - -Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. -Modified by Paul Jackson -Modified by Christoph Lameter -Modified by Paul Menage -Modified by Hidetoshi Seto - -CONTENTS: -========= - -1. Cpusets - 1.1 What are cpusets ? - 1.2 Why are cpusets needed ? - 1.3 How are cpusets implemented ? - 1.4 What are exclusive cpusets ? - 1.5 What is memory_pressure ? - 1.6 What is memory spread ? - 1.7 What is sched_load_balance ? - 1.8 What is sched_relax_domain_level ? - 1.9 How do I use cpusets ? -2. Usage Examples and Syntax - 2.1 Basic Usage - 2.2 Adding/removing cpus - 2.3 Setting flags - 2.4 Attaching processes -3. Questions -4. Contact - -1. Cpusets -========== - -1.1 What are cpusets ? ----------------------- - -Cpusets provide a mechanism for assigning a set of CPUs and Memory -Nodes to a set of tasks. In this document "Memory Node" refers to -an on-line node that contains memory. - -Cpusets constrain the CPU and Memory placement of tasks to only -the resources within a task's current cpuset. They form a nested -hierarchy visible in a virtual file system. These are the essential -hooks, beyond what is already present, required to manage dynamic -job placement on large systems. - -Cpusets use the generic cgroup subsystem described in -Documentation/cgroups/cgroups.txt. - -Requests by a task, using the sched_setaffinity(2) system call to -include CPUs in its CPU affinity mask, and using the mbind(2) and -set_mempolicy(2) system calls to include Memory Nodes in its memory -policy, are both filtered through that task's cpuset, filtering out any -CPUs or Memory Nodes not in that cpuset. The scheduler will not -schedule a task on a CPU that is not allowed in its cpus_allowed -vector, and the kernel page allocator will not allocate a page on a -node that is not allowed in the requesting task's mems_allowed vector. 
- -User level code may create and destroy cpusets by name in the cgroup -virtual file system, manage the attributes and permissions of these -cpusets and which CPUs and Memory Nodes are assigned to each cpuset, -specify and query to which cpuset a task is assigned, and list the -task pids assigned to a cpuset. - - -1.2 Why are cpusets needed ? ----------------------------- - -The management of large computer systems, with many processors (CPUs), -complex memory cache hierarchies and multiple Memory Nodes having -non-uniform access times (NUMA) presents additional challenges for -the efficient scheduling and memory placement of processes. - -Frequently more modest sized systems can be operated with adequate -efficiency just by letting the operating system automatically share -the available CPU and Memory resources amongst the requesting tasks. - -But larger systems, which benefit more from careful processor and -memory placement to reduce memory access times and contention, -and which typically represent a larger investment for the customer, -can benefit from explicitly placing jobs on properly sized subsets of -the system. - -This can be especially valuable on: - - * Web Servers running multiple instances of the same web application, - * Servers running different applications (for instance, a web server - and a database), or - * NUMA systems running large HPC applications with demanding - performance characteristics. - -These subsets, or "soft partitions" must be able to be dynamically -adjusted, as the job mix changes, without impacting other concurrently -executing jobs. The location of the running jobs pages may also be moved -when the memory locations are changed. - -The kernel cpuset patch provides the minimum essential kernel -mechanisms required to efficiently implement such subsets. It -leverages existing CPU and Memory Placement facilities in the Linux -kernel to avoid any additional impact on the critical scheduler or -memory allocator code. 
- - -1.3 How are cpusets implemented ? ---------------------------------- - -Cpusets provide a Linux kernel mechanism to constrain which CPUs and -Memory Nodes are used by a process or set of processes. - -The Linux kernel already has a pair of mechanisms to specify on which -CPUs a task may be scheduled (sched_setaffinity) and on which Memory -Nodes it may obtain memory (mbind, set_mempolicy). - -Cpusets extends these two mechanisms as follows: - - - Cpusets are sets of allowed CPUs and Memory Nodes, known to the - kernel. - - Each task in the system is attached to a cpuset, via a pointer - in the task structure to a reference counted cgroup structure. - - Calls to sched_setaffinity are filtered to just those CPUs - allowed in that task's cpuset. - - Calls to mbind and set_mempolicy are filtered to just - those Memory Nodes allowed in that task's cpuset. - - The root cpuset contains all the systems CPUs and Memory - Nodes. - - For any cpuset, one can define child cpusets containing a subset - of the parents CPU and Memory Node resources. - - The hierarchy of cpusets can be mounted at /dev/cpuset, for - browsing and manipulation from user space. - - A cpuset may be marked exclusive, which ensures that no other - cpuset (except direct ancestors and descendants) may contain - any overlapping CPUs or Memory Nodes. - - You can list all the tasks (by pid) attached to any cpuset. - -The implementation of cpusets requires a few, simple hooks -into the rest of the kernel, none in performance critical paths: - - - in init/main.c, to initialize the root cpuset at system boot. - - in fork and exit, to attach and detach a task from its cpuset. - - in sched_setaffinity, to mask the requested CPUs by what's - allowed in that task's cpuset. - - in sched.c migrate_live_tasks(), to keep migrating tasks within - the CPUs allowed by their cpuset, if possible. - - in the mbind and set_mempolicy system calls, to mask the requested - Memory Nodes by what's allowed in that task's cpuset. 
- - in page_alloc.c, to restrict memory to allowed nodes. - - in vmscan.c, to restrict page recovery to the current cpuset. - -You should mount the "cgroup" filesystem type in order to enable -browsing and modifying the cpusets presently known to the kernel. No -new system calls are added for cpusets - all support for querying and -modifying cpusets is via this cpuset file system. - -The /proc/<pid>/status file for each task has four added lines, -displaying the task's cpus_allowed (on which CPUs it may be scheduled) -and mems_allowed (on which Memory Nodes it may obtain memory), -in the two formats seen in the following example: - - Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff - Cpus_allowed_list: 0-127 - Mems_allowed: ffffffff,ffffffff - Mems_allowed_list: 0-63 - -Each cpuset is represented by a directory in the cgroup file system -containing (on top of the standard cgroup files) the following -files describing that cpuset: - - - cpuset.cpus: list of CPUs in that cpuset - - cpuset.mems: list of Memory Nodes in that cpuset - - cpuset.memory_migrate flag: if set, move pages to cpuset's nodes - - cpuset.cpu_exclusive flag: is cpu placement exclusive? - - cpuset.mem_exclusive flag: is memory placement exclusive? - - cpuset.mem_hardwall flag: is memory allocation hardwalled - - cpuset.memory_pressure: measure of how much paging pressure in cpuset - - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes - - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes - - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset - - cpuset.sched_relax_domain_level: the searching range when migrating tasks - -In addition, only the root cpuset has the following file: - - cpuset.memory_pressure_enabled flag: compute memory_pressure? - -New cpusets are created using the mkdir system call or shell -command.
The properties of a cpuset, such as its flags, allowed -CPUs and Memory Nodes, and attached tasks, are modified by writing -to the appropriate file in that cpuset's directory, as listed above. - -The named hierarchical structure of nested cpusets allows partitioning -a large system into nested, dynamically changeable, "soft-partitions". - -The attachment of each task, automatically inherited at fork by any -children of that task, to a cpuset allows organizing the work load -on a system into related sets of tasks such that each set is constrained -to using the CPUs and Memory Nodes of a particular cpuset. A task -may be re-attached to any other cpuset, if allowed by the permissions -on the necessary cpuset file system directories. - -Such management of a system "in the large" integrates smoothly with -the detailed placement done on individual tasks and memory regions -using the sched_setaffinity, mbind and set_mempolicy system calls. - -The following rules apply to each cpuset: - - - Its CPUs and Memory Nodes must be a subset of its parent's. - - It can't be marked exclusive unless its parent is. - - If its cpu or memory is exclusive, they may not overlap any sibling. - -These rules, and the natural hierarchy of cpusets, enable efficient -enforcement of the exclusive guarantee, without having to scan all -cpusets every time any of them change to ensure nothing overlaps an -exclusive cpuset. Also, the use of a Linux virtual file system (vfs) -to represent the cpuset hierarchy provides for a familiar permission -and name space for cpusets, with a minimum of additional kernel code. - -The cpus and mems files in the root (top_cpuset) cpuset are -read-only. The cpus file automatically tracks the value of -cpu_online_mask using a CPU hotplug notifier, and the mems file -automatically tracks the value of node_states[N_MEMORY]--i.e., -nodes with memory--using the cpuset_track_online_nodes() hook. - - -1.4 What are exclusive cpusets ?
--------------------------------- - -If a cpuset is cpu or mem exclusive, no other cpuset, other than -a direct ancestor or descendant, may share any of the same CPUs or -Memory Nodes. - -A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled", -i.e. it restricts kernel allocations for page, buffer and other data -commonly shared by the kernel across multiple users. All cpusets, -whether hardwalled or not, restrict allocations of memory for user -space. This enables configuring a system so that several independent -jobs can share common kernel data, such as file system pages, while -isolating each job's user allocation in its own cpuset. To do this, -construct a large mem_exclusive cpuset to hold all the jobs, and -construct child, non-mem_exclusive cpusets for each individual job. -Only a small amount of typical kernel memory, such as requests from -interrupt handlers, is allowed to be taken outside even a -mem_exclusive cpuset. - - -1.5 What is memory_pressure ? ------------------------------ -The memory_pressure of a cpuset provides a simple per-cpuset metric -of the rate that the tasks in a cpuset are attempting to free up in -use memory on the nodes of the cpuset to satisfy additional memory -requests. - -This enables batch managers monitoring jobs running in dedicated -cpusets to efficiently detect what level of memory pressure that job -is causing. - -This is useful both on tightly managed systems running a wide mix of -submitted jobs, which may choose to terminate or re-prioritize jobs that -are trying to use more memory than allowed on the nodes assigned to them, -and with tightly coupled, long running, massively parallel scientific -computing jobs that will dramatically fail to meet required performance -goals if they start to use more memory than allowed to them. - -This mechanism provides a very economical way for the batch manager -to monitor a cpuset for signs of memory pressure. 
It's up to the -batch manager or other user code to decide what to do about it and -take action. - -==> Unless this feature is enabled by writing "1" to the special file - /dev/cpuset/memory_pressure_enabled, the hook in the rebalance - code of __alloc_pages() for this metric reduces to simply noticing - that the cpuset_memory_pressure_enabled flag is zero. So only - systems that enable this feature will compute the metric. - -Why a per-cpuset, running average: - - Because this meter is per-cpuset, rather than per-task or mm, - the system load imposed by a batch scheduler monitoring this - metric is sharply reduced on large systems, because a scan of - the tasklist can be avoided on each set of queries. - - Because this meter is a running average, instead of an accumulating - counter, a batch scheduler can detect memory pressure with a - single read, instead of having to read and accumulate results - for a period of time. - - Because this meter is per-cpuset rather than per-task or mm, - the batch scheduler can obtain the key information, memory - pressure in a cpuset, with a single read, rather than having to - query and accumulate results over all the (dynamically changing) - set of tasks in the cpuset. - -A per-cpuset simple digital filter (requires a spinlock and 3 words -of data per-cpuset) is kept, and updated by any task attached to that -cpuset, if it enters the synchronous (direct) page reclaim code. - -A per-cpuset file provides an integer number representing the recent -(half-life of 10 seconds) rate of direct page reclaims caused by -the tasks in the cpuset, in units of reclaims attempted per second, -times 1000. - - -1.6 What is memory spread ? ---------------------------- -There are two boolean flag files per cpuset that control where the -kernel allocates pages for the file system buffers and related in -kernel data structures. They are called 'cpuset.memory_spread_page' and -'cpuset.memory_spread_slab'. 
- -If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then -the kernel will spread the file system buffers (page cache) evenly -over all the nodes that the faulting task is allowed to use, instead -of preferring to put those pages on the node where the task is running. - -If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set, -then the kernel will spread some file system related slab caches, -such as for inodes and dentries evenly over all the nodes that the -faulting task is allowed to use, instead of preferring to put those -pages on the node where the task is running. - -The setting of these flags does not affect anonymous data segment or -stack segment pages of a task. - -By default, both kinds of memory spreading are off, and memory -pages are allocated on the node local to where the task is running, -except perhaps as modified by the task's NUMA mempolicy or cpuset -configuration, so long as sufficient free memory pages are available. - -When new cpusets are created, they inherit the memory spread settings -of their parent. - -Setting memory spreading causes allocations for the affected page -or slab caches to ignore the task's NUMA mempolicy and be spread -instead. Tasks using mbind() or set_mempolicy() calls to set NUMA -mempolicies will not notice any change in these calls as a result of -their containing task's memory spread settings. If memory spreading -is turned off, then the currently specified NUMA mempolicy once again -applies to memory page allocations. - -Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag -files. By default they contain "0", meaning that the feature is off -for that cpuset. If a "1" is written to that file, then that turns -the named feature on. - -The implementation is simple. - -Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag -PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently -joins that cpuset. 
The page allocation calls for the page cache -is modified to perform an inline check for this PFA_SPREAD_PAGE task -flag, and if set, a call to a new routine cpuset_mem_spread_node() -returns the node to prefer for the allocation. - -Similarly, setting 'cpuset.memory_spread_slab' turns on the flag -PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate -pages from the node returned by cpuset_mem_spread_node(). - -The cpuset_mem_spread_node() routine is also simple. It uses the -value of a per-task rotor cpuset_mem_spread_rotor to select the next -node in the current task's mems_allowed to prefer for the allocation. - -This memory placement policy is also known (in other contexts) as -round-robin or interleave. - -This policy can provide substantial improvements for jobs that need -to place thread local data on the corresponding node, but that need -to access large file system data sets that need to be spread across -the several nodes in the jobs cpuset in order to fit. Without this -policy, especially for jobs that might have one thread reading in the -data set, the memory allocation across the nodes in the jobs cpuset -can become very uneven. - -1.7 What is sched_load_balance ? --------------------------------- - -The kernel scheduler (kernel/sched/core.c) automatically load balances -tasks. If one CPU is underutilized, kernel code running on that -CPU will look for tasks on other more overloaded CPUs and move those -tasks to itself, within the constraints of such placement mechanisms -as cpusets and sched_setaffinity. - -The algorithmic cost of load balancing and its impact on key shared -kernel data structures such as the task list increases more than -linearly with the number of CPUs being balanced. So the scheduler -has support to partition the systems CPUs into a number of sched -domains such that it only load balances within each sched domain. 
-Each sched domain covers some subset of the CPUs in the system; -no two sched domains overlap; some CPUs might not be in any sched -domain and hence won't be load balanced. - -Put simply, it costs less to balance between two smaller sched domains -than one big one, but doing so means that overloads in one of the -two domains won't be load balanced to the other one. - -By default, there is one sched domain covering all CPUs, including those -marked isolated using the kernel boot time "isolcpus=" argument. However, -the isolated CPUs will not participate in load balancing, and will not -have tasks running on them unless explicitly assigned. - -This default load balancing across all CPUs is not well suited for -the following two situations: - 1) On large systems, load balancing across many CPUs is expensive. - If the system is managed using cpusets to place independent jobs - on separate sets of CPUs, full load balancing is unnecessary. - 2) Systems supporting realtime on some CPUs need to minimize - system overhead on those CPUs, including avoiding task load - balancing if that is not needed. - -When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default -setting), it requests that all the CPUs in that cpuset's allowed 'cpuset.cpus' -be contained in a single sched domain, ensuring that load balancing -can move a task (not otherwise pinned, as by sched_setaffinity) -from any CPU in that cpuset to any other. - -When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the -scheduler will avoid load balancing across the CPUs in that cpuset, ---except-- in so far as is necessary because some overlapping cpuset -has "sched_load_balance" enabled. - -So, for example, if the top cpuset has the flag "cpuset.sched_load_balance" -enabled, then the scheduler will have one sched domain covering all -CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other -cpusets won't matter, as we're already fully load balancing.
- -Therefore in the above two situations, the top cpuset flag -"cpuset.sched_load_balance" should be disabled, and only some of the smaller, -child cpusets have this flag enabled. - -When doing this, you don't usually want to leave any unpinned tasks in -the top cpuset that might use non-trivial amounts of CPU, as such tasks -may be artificially constrained to some subset of CPUs, depending on -the particulars of this flag setting in descendant cpusets. Even if -such a task could use spare CPU cycles in some other CPUs, the kernel -scheduler might not consider the possibility of load balancing that -task to that underused CPU. - -Of course, tasks pinned to a particular CPU can be left in a cpuset -that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere -else anyway. - -There is an impedance mismatch here, between cpusets and sched domains. -Cpusets are hierarchical and nest. Sched domains are flat; they don't -overlap and each CPU is in at most one sched domain. - -It is necessary for sched domains to be flat because load balancing -across partially overlapping sets of CPUs would risk unstable dynamics -that would be beyond our understanding. So if each of two partially -overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we -form a single sched domain that is a superset of both. We won't move -a task to a CPU outside its cpuset, but the scheduler load balancing -code might waste some compute cycles considering that possibility. - -This mismatch is why there is not a simple one-to-one relation -between which cpusets have the flag "cpuset.sched_load_balance" enabled, -and the sched domain configuration. If a cpuset enables the flag, it -will get balancing across all its CPUs, but if it disables the flag, -it will only be assured of no load balancing if no other overlapping -cpuset enables the flag. 
- -If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only -one of them has this flag enabled, then the other may find its -tasks only partially load balanced, just on the overlapping CPUs. -This is just the general case of the top_cpuset example given a few -paragraphs above. In the general case, as in the top cpuset case, -don't leave tasks that might use non-trivial amounts of CPU in -such partially load balanced cpusets, as they may be artificially -constrained to some subset of the CPUs allowed to them, for lack of -load balancing to the other CPUs. - -CPUs in "cpuset.isolcpus" were excluded from load balancing by the -isolcpus= kernel boot option, and will never be load balanced regardless -of the value of "cpuset.sched_load_balance" in any cpuset. - -1.7.1 sched_load_balance implementation details. ------------------------------------------------- - -The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary -to most cpuset flags.) When enabled for a cpuset, the kernel will -ensure that it can load balance across all the CPUs in that cpuset -(makes sure that all the CPUs in the cpus_allowed of that cpuset are -in the same sched domain.) - -If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled, -then they will be (must be) both in the same sched domain. - -If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled, -then by the above that means there is a single sched domain covering -the whole system, regardless of any other cpuset settings. - -The kernel commits to user space that it will avoid load balancing -where it can. It will pick as fine a granularity partition of sched -domains as it can while still providing load balancing for any set -of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled. - -The internal kernel cpuset to scheduler interface passes from the -cpuset code to the scheduler code a partition of the load balanced -CPUs in the system. 
This partition is a set of subsets (represented -as an array of struct cpumask) of CPUs, pairwise disjoint, that cover -all the CPUs that must be load balanced. - -The cpuset code builds a new such partition and passes it to the -scheduler sched domain setup code, to have the sched domains rebuilt -as necessary, whenever: - - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes, - - or CPUs come or go from a cpuset with this flag enabled, - - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs - and with this flag enabled changes, - - or a cpuset with non-empty CPUs and with this flag enabled is removed, - - or a cpu is offlined/onlined. - -This partition exactly defines what sched domains the scheduler should -set up - one sched domain for each element (struct cpumask) in the -partition. - -The scheduler remembers the currently active sched domain partitions. -When the scheduler routine partition_sched_domains() is invoked from -the cpuset code to update these sched domains, it compares the new -partition requested with the current, and updates its sched domains, -removing the old and adding the new, for each change. - - -1.8 What is sched_relax_domain_level ? --------------------------------------- - -In a sched domain, the scheduler migrates tasks in 2 ways: periodic load -balance on tick, and at the time of some scheduling events. - -When a task is woken up, the scheduler tries to move the task to an idle CPU. -For example, if a task A running on CPU X activates another task B -on the same CPU X, and if CPU Y is X's sibling and is idle, -then the scheduler migrates task B to CPU Y so that task B can start on -CPU Y without waiting for task A on CPU X. - -And if a CPU runs out of tasks in its runqueue, the CPU tries to pull -extra tasks from other busy CPUs to help them before it is going to -be idle.
- -Of course it takes some searching cost to find movable tasks and/or -idle CPUs, so the scheduler might not search all CPUs in the domain -every time. In fact, in some architectures, the searching ranges on -events are limited to the same socket or node where the CPU is located, -while the load balance on tick searches all. - -For example, assume CPU Z is relatively far from CPU X. Even if CPU Z -is idle while CPU X and its siblings are busy, the scheduler can't migrate -woken task B from X to Z since it is out of its searching range. -As a result, task B on CPU X needs to wait for task A or wait for load balance -on the next tick. For some applications in special situations, waiting -1 tick may be too long. - -The 'cpuset.sched_relax_domain_level' file allows you to request changing -this searching range as you like. This file takes an int value which -indicates the size of the searching range in levels, ideally as follows, -or else the initial value -1, which indicates the cpuset has no request. - - -1 : no request. use system default or follow request of others. - 0 : no search. - 1 : search siblings (hyperthreads in a core). - 2 : search cores in a package. - 3 : search cpus in a node [= system wide on non-NUMA system] - 4 : search nodes in a chunk of node [on NUMA system] - 5 : search system wide [on NUMA system] - -The system default is architecture dependent. The system default -can be changed using the relax_domain_level= boot parameter. - -This file is per-cpuset and affects the sched domain that the cpuset -belongs to. Therefore if the flag 'cpuset.sched_load_balance' of a cpuset -is disabled, then 'cpuset.sched_relax_domain_level' has no effect since -there is no sched domain belonging to the cpuset. - -If multiple cpusets are overlapping and hence they form a single sched -domain, the largest value among those is used. Be careful, if one -requests 0 and others are -1 then 0 is used.
- -Note that modifying this file will have both good and bad effects, -and whether it is acceptable or not depends on your situation. -Don't modify this file if you are not sure. - -If your situation is: - - The migration costs between each cpu can be assumed considerably - small(for you) due to your special application's behavior or - special hardware support for CPU cache etc. - - The searching cost doesn't have impact(for you) or you can make - the searching cost enough small by managing cpuset to compact etc. - - The latency is required even it sacrifices cache hit rate etc. -then increasing 'sched_relax_domain_level' would benefit you. - - -1.9 How do I use cpusets ? --------------------------- - -In order to minimize the impact of cpusets on critical kernel -code, such as the scheduler, and due to the fact that the kernel -does not support one task updating the memory placement of another -task directly, the impact on a task of changing its cpuset CPU -or Memory Node placement, or of changing to which cpuset a task -is attached, is subtle. - -If a cpuset has its Memory Nodes modified, then for each task attached -to that cpuset, the next time that the kernel attempts to allocate -a page of memory for that task, the kernel will notice the change -in the task's cpuset, and update its per-task memory placement to -remain within the new cpusets memory placement. If the task was using -mempolicy MPOL_BIND, and the nodes to which it was bound overlap with -its new cpuset, then the task will continue to use whatever subset -of MPOL_BIND nodes are still allowed in the new cpuset. If the task -was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed -in the new cpuset, then the task will be essentially treated as if it -was MPOL_BIND bound to the new cpuset (even though its NUMA placement, -as queried by get_mempolicy(), doesn't change). 
If a task is moved -from one cpuset to another, then the kernel will adjust the task's -memory placement, as above, the next time that the kernel attempts -to allocate a page of memory for that task. - -If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset -will have its allowed CPU placement changed immediately. Similarly, -if a task's pid is written to another cpusets 'cpuset.tasks' file, then its -allowed CPU placement is changed immediately. If such a task had been -bound to some subset of its cpuset using the sched_setaffinity() call, -the task will be allowed to run on any CPU allowed in its new cpuset, -negating the effect of the prior sched_setaffinity() call. - -In summary, the memory placement of a task whose cpuset is changed is -updated by the kernel, on the next allocation of a page for that task, -and the processor placement is updated immediately. - -Normally, once a page is allocated (given a physical page -of main memory) then that page stays on whatever node it -was allocated, so long as it remains allocated, even if the -cpusets memory placement policy 'cpuset.mems' subsequently changes. -If the cpuset flag file 'cpuset.memory_migrate' is set true, then when -tasks are attached to that cpuset, any pages that task had -allocated to it on nodes in its previous cpuset are migrated -to the task's new cpuset. The relative placement of the page within -the cpuset is preserved during these migration operations if possible. -For example if the page was on the second valid node of the prior cpuset -then the page will be placed on the second valid node of the new cpuset. - -Also if 'cpuset.memory_migrate' is set true, then if that cpuset's -'cpuset.mems' file is modified, pages allocated to tasks in that -cpuset, that were on nodes in the previous setting of 'cpuset.mems', -will be moved to nodes in the new setting of 'mems.' 
-Pages that were not in the task's prior cpuset, or in the cpuset's -prior 'cpuset.mems' setting, will not be moved. - -There is an exception to the above. If hotplug functionality is used -to remove all the CPUs that are currently assigned to a cpuset, -then all the tasks in that cpuset will be moved to the nearest ancestor -with non-empty cpus. But the moving of some (or all) tasks might fail if -cpuset is bound with another cgroup subsystem which has some restrictions -on task attaching. In this failing case, those tasks will stay -in the original cpuset, and the kernel will automatically update -their cpus_allowed to allow all online CPUs. When memory hotplug -functionality for removing Memory Nodes is available, a similar exception -is expected to apply there as well. In general, the kernel prefers to -violate cpuset placement, over starving a task that has had all -its allowed CPUs or Memory Nodes taken offline. - -There is a second exception to the above. GFP_ATOMIC requests are -kernel internal allocations that must be satisfied, immediately. -The kernel may drop some request, in rare cases even panic, if a -GFP_ATOMIC alloc fails. If the request cannot be satisfied within -the current task's cpuset, then we relax the cpuset, and look for -memory anywhere we can find it. It's better to violate the cpuset -than stress the kernel. - -To start a new job that is to be contained within a cpuset, the steps are: - - 1) mkdir /sys/fs/cgroup/cpuset - 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset - 3) Create the new cpuset by doing mkdir's and write's (or echo's) in - the /sys/fs/cgroup/cpuset virtual file system. - 4) Start a task that will be the "founding father" of the new job. - 5) Attach that task to the new cpuset by writing its pid to the - /sys/fs/cgroup/cpuset tasks file for that cpuset. - 6) fork, exec or clone the job tasks from this founding father task. 
- -For example, the following sequence of commands will setup a cpuset -named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, -and then start a subshell 'sh' in that cpuset: - - mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset - cd /sys/fs/cgroup/cpuset - mkdir Charlie - cd Charlie - /bin/echo 2-3 > cpuset.cpus - /bin/echo 1 > cpuset.mems - /bin/echo $$ > tasks - sh - # The subshell 'sh' is now running in cpuset Charlie - # The next line should display '/Charlie' - cat /proc/self/cpuset - -There are ways to query or modify cpusets: - - via the cpuset file system directly, using the various cd, mkdir, echo, - cat, rmdir commands from the shell, or their equivalent from C. - - via the C library libcpuset. - - via the C library libcgroup. - (http://sourceforge.net/projects/libcg/) - - via the python application cset. - (http://code.google.com/p/cpuset/) - -The sched_setaffinity calls can also be done at the shell prompt using -SGI's runon or Robert Love's taskset. The mbind and set_mempolicy -calls can be done at the shell prompt using the numactl command -(part of Andi Kleen's numa package). - -2. Usage Examples and Syntax -============================ - -2.1 Basic Usage ---------------- - -Creating, modifying, using the cpusets can be done through the cpuset -virtual filesystem. - -To mount it, type: -# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset - -Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the -tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset -is the cpuset that holds the whole system. - -If you want to create a new cpuset under /sys/fs/cgroup/cpuset: -# cd /sys/fs/cgroup/cpuset -# mkdir my_cpuset - -Now you want to do something with this cpuset. 
-# cd my_cpuset - -In this directory you can find several files: -# ls -cgroup.clone_children cpuset.memory_pressure -cgroup.event_control cpuset.memory_spread_page -cgroup.procs cpuset.memory_spread_slab -cpuset.cpu_exclusive cpuset.mems -cpuset.cpus cpuset.sched_load_balance -cpuset.mem_exclusive cpuset.sched_relax_domain_level -cpuset.mem_hardwall notify_on_release -cpuset.memory_migrate tasks - -Reading them will give you information about the state of this cpuset: -the CPUs and Memory Nodes it can use, the processes that are using -it, its properties. By writing to these files you can manipulate -the cpuset. - -Set some flags: -# /bin/echo 1 > cpuset.cpu_exclusive - -Add some cpus: -# /bin/echo 0-7 > cpuset.cpus - -Add some mems: -# /bin/echo 0-7 > cpuset.mems - -Now attach your shell to this cpuset: -# /bin/echo $$ > tasks - -You can also create cpusets inside your cpuset by using mkdir in this -directory. -# mkdir my_sub_cs - -To remove a cpuset, just use rmdir: -# rmdir my_sub_cs -This will fail if the cpuset is in use (has cpusets inside, or has -processes attached). - -Note that for legacy reasons, the "cpuset" filesystem exists as a -wrapper around the cgroup filesystem. - -The command - -mount -t cpuset X /sys/fs/cgroup/cpuset - -is equivalent to - -mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset -echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent - -2.2 Adding/removing cpus ------------------------- - -This is the syntax to use when writing in the cpus or mems files -in cpuset directories: - -# /bin/echo 1-4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 -# /bin/echo 1,2,3,4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 - -To add a CPU to a cpuset, write the new list of CPUs including the -CPU to be added. To add 6 to the above cpuset: - -# /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6 - -Similarly to remove a CPU from a cpuset, write the new list of CPUs -without the CPU to be removed. 
- -To remove all the CPUs: - -# /bin/echo "" > cpuset.cpus -> clear cpus list - -2.3 Setting flags ------------------ - -The syntax is very simple: - -# /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive' -# /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive' - -2.4 Attaching processes ------------------------ - -# /bin/echo PID > tasks - -Note that it is PID, not PIDs. You can only attach ONE task at a time. -If you have several tasks to attach, you have to do it one after another: - -# /bin/echo PID1 > tasks -# /bin/echo PID2 > tasks - ... -# /bin/echo PIDn > tasks - - -3. Questions -============ - -Q: what's up with this '/bin/echo' ? -A: bash's builtin 'echo' command does not check calls to write() against - errors. If you use it in the cpuset file system, you won't be - able to tell whether a command succeeded or failed. - -Q: When I attach processes, only the first of the line gets really attached ! -A: We can only return one error code per call to write(). So you should also - put only ONE pid. - -4. Contact -========== - -Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/cgroups/devices.txt b/Documentation/cgroups/devices.txt deleted file mode 100644 index 3c1095ca0..000000000 --- a/Documentation/cgroups/devices.txt +++ /dev/null @@ -1,116 +0,0 @@ -Device Whitelist Controller - -1. Description: - -Implement a cgroup to track and enforce open and mknod restrictions -on device files. A device cgroup associates a device access -whitelist with each cgroup. A whitelist entry has 4 fields. -'type' is a (all), c (char), or b (block). 'all' means it applies -to all types and all major and minor numbers. Major and minor are -either an integer or * for all. Access is a composition of r -(read), w (write), and m (mknod). - -The root device cgroup starts with rwm to 'all'. A child device -cgroup gets a copy of the parent. Administrators can then remove -devices from the whitelist or add new entries. 
A child cgroup can -never receive a device access which is denied by its parent. - -2. User Interface - -An entry is added using devices.allow, and removed using -devices.deny. For instance - - echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow - -allows cgroup 1 to read and mknod the device usually known as -/dev/null. Doing - - echo a > /sys/fs/cgroup/1/devices.deny - -will remove the default 'a *:* rwm' entry. Doing - - echo a > /sys/fs/cgroup/1/devices.allow - -will add the 'a *:* rwm' entry to the whitelist. - -3. Security - -Any task can move itself between cgroups. This clearly won't -suffice, but we can decide the best way to adequately restrict -movement as people get some experience with this. We may just want -to require CAP_SYS_ADMIN, which at least is a separate bit from -CAP_MKNOD. We may want to just refuse moving to a cgroup which -isn't a descendant of the current one. Or we may want to use -CAP_MAC_ADMIN, since we really are trying to lock down root. - -CAP_SYS_ADMIN is needed to modify the whitelist or move another -task to a new cgroup. (Again we'll probably want to change that). - -A cgroup may not be granted more permissions than the cgroup's -parent has. - -4. Hierarchy - -device cgroups maintain hierarchy by making sure a cgroup never has more -access permissions than its parent. Every time an entry is written to -a cgroup's devices.deny file, all its children will have that entry removed -from their whitelist and all the locally set whitelist entries will be -re-evaluated. In case one of the locally set whitelist entries would provide -more access than the cgroup's parent, it'll be removed from the whitelist. 
- -Example: - A - / \ - B - - group behavior exceptions - A allow "b 8:* rwm", "c 116:1 rw" - B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm" - -If a device is denied in group A: - # echo "c 116:* r" > A/devices.deny -it'll propagate down and after revalidating B's entries, the whitelist entry -"c 116:2 rwm" will be removed: - - group whitelist entries denied devices - A all "b 8:* rwm", "c 116:* rw" - B "c 1:3 rwm", "b 3:* rwm" all the rest - -In case parent's exceptions change and local exceptions are not allowed -anymore, they'll be deleted. - -Notice that new whitelist entries will not be propagated: - A - / \ - B - - group whitelist entries denied devices - A "c 1:3 rwm", "c 1:5 r" all the rest - B "c 1:3 rwm", "c 1:5 r" all the rest - -when adding "c *:3 rwm": - # echo "c *:3 rwm" >A/devices.allow - -the result: - group whitelist entries denied devices - A "c *:3 rwm", "c 1:5 r" all the rest - B "c 1:3 rwm", "c 1:5 r" all the rest - -but now it'll be possible to add new entries to B: - # echo "c 2:3 rwm" >B/devices.allow - # echo "c 50:3 r" >B/devices.allow -or even - # echo "c *:3 rwm" >B/devices.allow - -Allowing or denying all by writing 'a' to devices.allow or devices.deny will -not be possible once the device cgroups has children. - -4.1 Hierarchy (internal implementation) - -device cgroups is implemented internally using a behavior (ALLOW, DENY) and a -list of exceptions. The internal state is controlled using the same user -interface to preserve compatibility with the previous whitelist-only -implementation. Removal or addition of exceptions that will reduce the access -to devices will be propagated down the hierarchy. -For every propagated exception, the effective rules will be re-evaluated based -on current parent's access rules. 
diff --git a/Documentation/cgroups/freezer-subsystem.txt b/Documentation/cgroups/freezer-subsystem.txt deleted file mode 100644 index e831cb2b8..000000000 --- a/Documentation/cgroups/freezer-subsystem.txt +++ /dev/null @@ -1,123 +0,0 @@ -The cgroup freezer is useful to batch job management system which start -and stop sets of tasks in order to schedule the resources of a machine -according to the desires of a system administrator. This sort of program -is often used on HPC clusters to schedule access to the cluster as a -whole. The cgroup freezer uses cgroups to describe the set of tasks to -be started/stopped by the batch job management system. It also provides -a means to start and stop the tasks composing the job. - -The cgroup freezer will also be useful for checkpointing running groups -of tasks. The freezer allows the checkpoint code to obtain a consistent -image of the tasks by attempting to force the tasks in a cgroup into a -quiescent state. Once the tasks are quiescent another task can -walk /proc or invoke a kernel interface to gather information about the -quiesced tasks. Checkpointed tasks can be restarted later should a -recoverable error occur. This also allows the checkpointed tasks to be -migrated between nodes in a cluster by copying the gathered information -to another node and restarting the tasks there. - -Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping -and resuming tasks in userspace. Both of these signals are observable -from within the tasks we wish to freeze. While SIGSTOP cannot be caught, -blocked, or ignored it can be seen by waiting or ptracing parent tasks. -SIGCONT is especially unsuitable since it can be caught by the task. Any -programs designed to watch for SIGSTOP and SIGCONT could be broken by -attempting to use SIGSTOP and SIGCONT to stop and resume tasks. 
We can -demonstrate this problem using nested bash shells: - - $ echo $$ - 16644 - $ bash - $ echo $$ - 16690 - - From a second, unrelated bash shell: - $ kill -SIGSTOP 16690 - $ kill -SIGCONT 16690 - - <at this point 16690 exits and causes 16644 to exit too> - -This happens because bash can observe both signals and choose how it -responds to them. - -Another example of a program which catches and responds to these -signals is gdb. In fact any program designed to use ptrace is likely to -have a problem with this method of stopping and resuming tasks. - -In contrast, the cgroup freezer uses the kernel freezer code to -prevent the freeze/unfreeze cycle from becoming visible to the tasks -being frozen. This allows the bash example above and gdb to run as -expected. - -The cgroup freezer is hierarchical. Freezing a cgroup freezes all -tasks belonging to the cgroup and all its descendant cgroups. Each -cgroup has its own state (self-state) and the state inherited from the -parent (parent-state). Iff both states are THAWED, the cgroup is -THAWED. - -The following cgroupfs files are created by cgroup freezer. - -* freezer.state: Read-write. - - When read, returns the effective state of the cgroup - "THAWED", - "FREEZING" or "FROZEN". This is the combined self and parent-states. - If any is freezing, the cgroup is freezing (FREEZING or FROZEN). - - FREEZING cgroup transitions into FROZEN state when all tasks - belonging to the cgroup and its descendants become frozen. Note that - a cgroup reverts to FREEZING from FROZEN after a new task is added - to the cgroup or one of its descendant cgroups until the new task is - frozen. - - When written, sets the self-state of the cgroup. Two values are - allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup, - if not already freezing, enters FREEZING state along with all its - descendant cgroups. - - If THAWED is written, the self-state of the cgroup is changed to - THAWED. Note that the effective state may not change to THAWED if - the parent-state is still freezing.
If a cgroup's effective state - becomes THAWED, all its descendants which are freezing because of - the cgroup also leave the freezing state. - -* freezer.self_freezing: Read only. - - Shows the self-state. 0 if the self-state is THAWED; otherwise, 1. - This value is 1 iff the last write to freezer.state was "FROZEN". - -* freezer.parent_freezing: Read only. - - Shows the parent-state. 0 if none of the cgroup's ancestors is - frozen; otherwise, 1. - -The root cgroup is non-freezable and the above interface files don't -exist. - -* Examples of usage : - - # mkdir /sys/fs/cgroup/freezer - # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer - # mkdir /sys/fs/cgroup/freezer/0 - # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks - -to get status of the freezer subsystem : - - # cat /sys/fs/cgroup/freezer/0/freezer.state - THAWED - -to freeze all tasks in the container : - - # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state - # cat /sys/fs/cgroup/freezer/0/freezer.state - FREEZING - # cat /sys/fs/cgroup/freezer/0/freezer.state - FROZEN - -to unfreeze all tasks in the container : - - # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state - # cat /sys/fs/cgroup/freezer/0/freezer.state - THAWED - -This is the basic mechanism which should do the right thing for user space task -in a simple scenario. diff --git a/Documentation/cgroups/hugetlb.txt b/Documentation/cgroups/hugetlb.txt deleted file mode 100644 index 106245c3a..000000000 --- a/Documentation/cgroups/hugetlb.txt +++ /dev/null @@ -1,45 +0,0 @@ -HugeTLB Controller -------------------- - -The HugeTLB controller allows to limit the HugeTLB usage per control group and -enforces the controller limit during page fault. Since HugeTLB doesn't -support page reclaim, enforcing the limit at page fault time implies that, -the application will get SIGBUS signal if it tries to access HugeTLB pages -beyond its limit. This requires the application to know beforehand how much -HugeTLB pages it would require for its use. 
- -HugeTLB controller can be created by first mounting the cgroup filesystem. - -# mount -t cgroup -o hugetlb none /sys/fs/cgroup - -With the above step, the initial or the parent HugeTLB group becomes -visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in -the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. - -New groups can be created under the parent group /sys/fs/cgroup. - -# cd /sys/fs/cgroup -# mkdir g1 -# echo $$ > g1/tasks - -The above steps create a new group g1 and move the current shell -process (bash) into it. - -Brief summary of control files - - hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage - hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded - hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb - hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit - -For a system supporting two hugepage size (16M and 16G) the control -files include: - -hugetlb.16GB.limit_in_bytes -hugetlb.16GB.max_usage_in_bytes -hugetlb.16GB.usage_in_bytes -hugetlb.16GB.failcnt -hugetlb.16MB.limit_in_bytes -hugetlb.16MB.max_usage_in_bytes -hugetlb.16MB.usage_in_bytes -hugetlb.16MB.failcnt diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt deleted file mode 100644 index 8870b0212..000000000 --- a/Documentation/cgroups/memcg_test.txt +++ /dev/null @@ -1,280 +0,0 @@ -Memory Resource Controller(Memcg) Implementation Memo. -Last Updated: 2010/2 -Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34). - -Because VM is getting complex (one of reasons is memcg...), memcg's behavior -is complex. This is a document for memcg's internal behavior. -Please note that implementation details can be changed. - -(*) Topics on API should be in Documentation/cgroups/memory.txt) - -0. How to record usage ? - 2 objects are used. - - page_cgroup ....an object per page. - Allocated at boot or memory hotplug.
Freed at memory hot removal. - - swap_cgroup ... an entry per swp_entry. - Allocated at swapon(). Freed at swapoff(). - - The page_cgroup has USED bit and double count against a page_cgroup never - occurs. swap_cgroup is used only when a charged page is swapped-out. - -1. Charge - - a page/swp_entry may be charged (usage += PAGE_SIZE) at - - mem_cgroup_try_charge() - -2. Uncharge - a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by - - mem_cgroup_uncharge() - Called when a page's refcount goes down to 0. - - mem_cgroup_uncharge_swap() - Called when swp_entry's refcnt goes down to 0. A charge against swap - disappears. - -3. charge-commit-cancel - Memcg pages are charged in two steps: - mem_cgroup_try_charge() - mem_cgroup_commit_charge() or mem_cgroup_cancel_charge() - - At try_charge(), there are no flags to say "this page is charged". - at this point, usage += PAGE_SIZE. - - At commit(), the page is associated with the memcg. - - At cancel(), simply usage -= PAGE_SIZE. - -Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. - -4. Anonymous - Anonymous page is newly allocated at - - page fault into MAP_ANONYMOUS mapping. - - Copy-On-Write. - - 4.1 Swap-in. - At swap-in, the page is taken from swap-cache. There are 2 cases. - - (a) If the SwapCache is newly allocated and read, it has no charges. - (b) If the SwapCache has been mapped by processes, it has been - charged already. - - 4.2 Swap-out. - At swap-out, typical state transition is below. - - (a) add to swap cache. (marked as SwapCache) - swp_entry's refcnt += 1. - (b) fully unmapped. - swp_entry's refcnt += # of ptes. - (c) write back to swap. - (d) delete from swap cache. (remove from SwapCache) - swp_entry's refcnt -= 1. - - - Finally, at task exit, - (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. - -5. Page Cache - Page Cache is charged at - - add_to_page_cache_locked(). - - The logic is very clear. 
(About migration, see below) - Note: __remove_from_page_cache() is called by remove_from_page_cache() - and __remove_mapping(). - -6. Shmem(tmpfs) Page Cache - The best way to understand shmem's page state transition is to read - mm/shmem.c. - But brief explanation of the behavior of memcg around shmem will be - helpful to understand the logic. - - Shmem's page (just leaf page, not direct/indirect block) can be on - - radix-tree of shmem's inode. - - SwapCache. - - Both on radix-tree and SwapCache. This happens at swap-in - and swap-out, - - It's charged when... - - A new page is added to shmem's radix-tree. - - A swp page is read. (move a charge from swap_cgroup to page_cgroup) - -7. Page Migration - - mem_cgroup_migrate() - -8. LRU - Each memcg has its own private LRU. Now, its handling is under global - VM's control (means that it's handled under global zone->lru_lock). - Almost all routines around memcg's LRU is called by global LRU's - list management functions under zone->lru_lock(). - - A special function is mem_cgroup_isolate_pages(). This scans - memcg's private LRU and call __isolate_lru_page() to extract a page - from LRU. - (By __isolate_lru_page(), the page is removed from both of global and - private LRU.) - - -9. Typical Tests. - - Tests for racy cases. - - 9.1 Small limit to memcg. - When you do test to do racy case, it's good test to set memcg's limit - to be very small rather than GB. Many races found in the test under - xKB or xxMB limits. - (Memory behavior under GB and Memory behavior under MB shows very - different situation.) - - 9.2 Shmem - Historically, memcg's shmem handling was poor and we saw some amount - of troubles here. This is because shmem is page-cache but can be - SwapCache. Test with shmem/tmpfs is always good test. - - 9.3 Migration - For NUMA, migration is an another special case. To do easy test, cpuset - is useful. Following is a sample script to do migration. 
- - mount -t cgroup -o cpuset none /opt/cpuset - - mkdir /opt/cpuset/01 - echo 1 > /opt/cpuset/01/cpuset.cpus - echo 0 > /opt/cpuset/01/cpuset.mems - echo 1 > /opt/cpuset/01/cpuset.memory_migrate - mkdir /opt/cpuset/02 - echo 1 > /opt/cpuset/02/cpuset.cpus - echo 1 > /opt/cpuset/02/cpuset.mems - echo 1 > /opt/cpuset/02/cpuset.memory_migrate - - In above set, when you moves a task from 01 to 02, page migration to - node 0 to node 1 will occur. Following is a script to migrate all - under cpuset. - -- - move_task() - { - for pid in $1 - do - /bin/echo $pid >$2/tasks 2>/dev/null - echo -n $pid - echo -n " " - done - echo END - } - - G1_TASK=`cat ${G1}/tasks` - G2_TASK=`cat ${G2}/tasks` - move_task "${G1_TASK}" ${G2} & - -- - 9.4 Memory hotplug. - memory hotplug test is one of good test. - to offline memory, do following. - # echo offline > /sys/devices/system/memory/memoryXXX/state - (XXX is the place of memory) - This is an easy way to test page migration, too. - - 9.5 mkdir/rmdir - When using hierarchy, mkdir/rmdir test should be done. - Use tests like the following. - - echo 1 >/opt/cgroup/01/memory/use_hierarchy - mkdir /opt/cgroup/01/child_a - mkdir /opt/cgroup/01/child_b - - set limit to 01. - add limit to 01/child_b - run jobs under child_a and child_b - - create/delete following groups at random while jobs are running. - /opt/cgroup/01/child_a/child_aa - /opt/cgroup/01/child_b/child_bb - /opt/cgroup/01/child_c - - running new jobs in new group is also good. - - 9.6 Mount with other subsystems. - Mounting with other subsystems is a good test because there is a - race and lock dependency with other cgroup subsystems. - - example) - # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices - - and do task move, mkdir, rmdir etc...under this. - - 9.7 swapoff. - Besides management of swap is one of complicated parts of memcg, - call path of swap-in at swapoff is not same as usual swap-in path.. - It's worth to be tested explicitly. 
- - For example, test like following is good. - (Shell-A) - # mount -t cgroup none /cgroup -o memory - # mkdir /cgroup/test - # echo 40M > /cgroup/test/memory.limit_in_bytes - # echo 0 > /cgroup/test/tasks - Run malloc(100M) program under this. You'll see 60M of swaps. - (Shell-B) - # move all tasks in /cgroup/test to /cgroup - # /sbin/swapoff -a - # rmdir /cgroup/test - # kill malloc task. - - Of course, tmpfs v.s. swapoff test should be tested, too. - - 9.8 OOM-Killer - Out-of-memory caused by memcg's limit will kill tasks under - the memcg. When hierarchy is used, a task under hierarchy - will be killed by the kernel. - In this case, panic_on_oom shouldn't be invoked and tasks - in other groups shouldn't be killed. - - It's not difficult to cause OOM under memcg as following. - Case A) when you can swapoff - #swapoff -a - #echo 50M > /memory.limit_in_bytes - run 51M of malloc - - Case B) when you use mem+swap limitation. - #echo 50M > memory.limit_in_bytes - #echo 50M > memory.memsw.limit_in_bytes - run 51M of malloc - - 9.9 Move charges at task migration - Charges associated with a task can be moved along with task migration. - - (Shell-A) - #mkdir /cgroup/A - #echo $$ >/cgroup/A/tasks - run some programs which uses some amount of memory in /cgroup/A. - - (Shell-B) - #mkdir /cgroup/B - #echo 1 >/cgroup/B/memory.move_charge_at_immigrate - #echo "pid of the program running in group A" >/cgroup/B/tasks - - You can see charges have been moved by reading *.usage_in_bytes or - memory.stat of both A and B. - See 8.2 of Documentation/cgroups/memory.txt to see what value should be - written to move_charge_at_immigrate. - - 9.10 Memory thresholds - Memory controller implements memory thresholds using cgroups notification - API. You can use tools/cgroup/cgroup_event_listener.c to test it. 
- - (Shell-A) Create cgroup and run event listener - # mkdir /cgroup/A - # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M - - (Shell-B) Add task to cgroup and try to allocate and free memory - # echo $$ >/cgroup/A/tasks - # a="$(dd if=/dev/zero bs=1M count=10)" - # a= - - You will see message from cgroup_event_listener every time you cross - the thresholds. - - Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds. - - It's good idea to test root cgroup as well. diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt deleted file mode 100644 index ff71e16cc..000000000 --- a/Documentation/cgroups/memory.txt +++ /dev/null @@ -1,876 +0,0 @@ -Memory Resource Controller - -NOTE: This document is hopelessly outdated and it asks for a complete - rewrite. It still contains a useful information so we are keeping it - here but make sure to check the current code if you need a deeper - understanding. - -NOTE: The Memory Resource Controller has generically been referred to as the - memory controller in this document. Do not confuse memory controller - used here with the memory controller that is used in hardware. - -(For editors) -In this document: - When we mention a cgroup (cgroupfs's directory) with memory controller, - we call it "memory cgroup". When you see git-log and source code, you'll - see patch's title and function names tend to use "memcg". - In this document, we avoid using it. - -Benefits and Purpose of the memory controller - -The memory controller isolates the memory behaviour of a group of tasks -from the rest of the system. The article on LWN [12] mentions some probable -uses of the memory controller. The memory controller can be used to - -a. Isolate an application or a group of applications - Memory-hungry applications can be isolated and limited to a smaller - amount of memory. -b. Create a cgroup with a limited amount of memory; this can be used - as a good alternative to booting with mem=XXXX. -c. 
Virtualization solutions can control the amount of memory they want - to assign to a virtual machine instance. -d. A CD/DVD burner could control the amount of memory used by the - rest of the system to ensure that burning does not fail due to lack - of available memory. -e. There are several other use cases; find one or use the controller just - for fun (to learn and hack on the VM subsystem). - -Current Status: linux-2.6.34-mmotm(development version of 2010/April) - -Features: - - accounting anonymous pages, file caches, swap caches usage and limiting them. - - pages are linked to per-memcg LRU exclusively, and there is no global LRU. - - optionally, memory+swap usage can be accounted and limited. - - hierarchical accounting - - soft limit - - moving (recharging) account at moving a task is selectable. - - usage threshold notifier - - memory pressure notifier - - oom-killer disable knob and oom-notifier - - Root cgroup has no limit controls. - - Kernel memory support is a work in progress, and the current version provides - basically functionality. (See Section 2.7) - -Brief summary of control files. 
- - tasks # attach a task(thread) and show list of threads - cgroup.procs # show list of processes - cgroup.event_control # an interface for event_fd() - memory.usage_in_bytes # show current usage for memory - (See 5.5 for details) - memory.memsw.usage_in_bytes # show current usage for memory+Swap - (See 5.5 for details) - memory.limit_in_bytes # set/show limit of memory usage - memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage - memory.failcnt # show the number of memory usage hits limits - memory.memsw.failcnt # show the number of memory+Swap hits limits - memory.max_usage_in_bytes # show max memory usage recorded - memory.memsw.max_usage_in_bytes # show max memory+Swap usage recorded - memory.soft_limit_in_bytes # set/show soft limit of memory usage - memory.stat # show various statistics - memory.use_hierarchy # set/show hierarchical account enabled - memory.force_empty # trigger forced move charge to parent - memory.pressure_level # set memory pressure notifications - memory.swappiness # set/show swappiness parameter of vmscan - (See sysctl's vm.swappiness) - memory.move_charge_at_immigrate # set/show controls of moving charges - memory.oom_control # set/show oom controls. - memory.numa_stat # show the number of memory usage per numa node - - memory.kmem.limit_in_bytes # set/show hard limit for kernel memory - memory.kmem.usage_in_bytes # show current kernel memory allocation - memory.kmem.failcnt # show the number of kernel memory usage hits limits - memory.kmem.max_usage_in_bytes # show max kernel memory usage recorded - - memory.kmem.tcp.limit_in_bytes # set/show hard limit for tcp buf memory - memory.kmem.tcp.usage_in_bytes # show current tcp buf memory allocation - memory.kmem.tcp.failcnt # show the number of tcp buf memory usage hits limits - memory.kmem.tcp.max_usage_in_bytes # show max tcp buf memory usage recorded - -1. History - -The memory controller has a long history. 
A request for comments for the memory -controller was posted by Balbir Singh [1]. At the time the RFC was posted -there were several implementations for memory control. The goal of the -RFC was to build consensus and agreement for the minimal features required -for memory control. The first RSS controller was posted by Balbir Singh[2] -in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the -RSS controller. At OLS, at the resource management BoF, everyone suggested -that we handle both page cache and RSS together. Another request was raised -to allow user space handling of OOM. The current memory controller is -at version 6; it combines both mapped (RSS) and unmapped Page -Cache Control [11]. - -2. Memory Control - -Memory is a unique resource in the sense that it is present in a limited -amount. If a task requires a lot of CPU processing, the task can spread -its processing over a period of hours, days, months or years, but with -memory, the same physical memory needs to be reused to accomplish the task. - -The memory controller implementation has been divided into phases. These -are: - -1. Memory controller -2. mlock(2) controller -3. Kernel user memory accounting and slab control -4. user mappings length controller - -The memory controller is the first controller developed. - -2.1. Design - -The core of the design is a counter called the page_counter. The -page_counter tracks the current memory usage and limit of the group of -processes associated with the controller. Each cgroup has a memory controller -specific data structure (mem_cgroup) associated with it. - -2.2. Accounting - - +--------------------+ - | mem_cgroup | - | (page_counter) | - +--------------------+ - / ^ \ - / | \ - +---------------+ | +---------------+ - | mm_struct | |.... 
| mm_struct | - | | | | | - +---------------+ | +---------------+ - | - + --------------+ - | - +---------------+ +------+--------+ - | page +----------> page_cgroup| - | | | | - +---------------+ +---------------+ - - (Figure 1: Hierarchy of Accounting) - - -Figure 1 shows the important aspects of the controller - -1. Accounting happens per cgroup -2. Each mm_struct knows about which cgroup it belongs to -3. Each page has a pointer to the page_cgroup, which in turn knows the - cgroup it belongs to - -The accounting is done as follows: mem_cgroup_charge_common() is invoked to -set up the necessary data structures and check if the cgroup that is being -charged is over its limit. If it is, then reclaim is invoked on the cgroup. -More details can be found in the reclaim section of this document. -If everything goes well, a page meta-data-structure called page_cgroup is -updated. page_cgroup has its own LRU on cgroup. -(*) page_cgroup structure is allocated at boot/memory-hotplug time. - -2.2.1 Accounting details - -All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. -Some pages which are never reclaimable and will not be on the LRU -are not accounted. We just account pages under usual VM management. - -RSS pages are accounted at page_fault unless they've already been accounted -for earlier. A file page will be accounted for as Page Cache when it's -inserted into inode (radix-tree). While it's mapped into the page tables of -processes, duplicate accounting is carefully avoided. - -An RSS page is unaccounted when it's fully unmapped. A PageCache page is -unaccounted when it's removed from radix-tree. Even if RSS pages are fully -unmapped (by kswapd), they may exist as SwapCache in the system until they -are really freed. Such SwapCaches are also accounted. -A swapped-in page is not accounted until it's mapped. - -Note: The kernel does swapin-readahead and reads multiple swaps at once. 
-This means swapped-in pages may contain pages for other tasks than a task -causing page fault. So, we avoid accounting at swap-in I/O. - -At page migration, accounting information is kept. - -Note: we just account pages-on-LRU because our purpose is to control amount -of used pages; not-on-LRU pages tend to be out-of-control from VM view. - -2.3 Shared Page Accounting - -Shared pages are accounted on the basis of the first touch approach. The -cgroup that first touches a page is accounted for the page. The principle -behind this approach is that a cgroup that aggressively uses a shared -page will eventually get charged for it (once it is uncharged from -the cgroup that brought it in -- this will happen on memory pressure). - -But see section 8.2: when moving a task to another cgroup, its pages may -be recharged to the new cgroup, if move_charge_at_immigrate has been chosen. - -Exception: If CONFIG_MEMCG_SWAP is not used. -When you do swapoff and make swapped-out pages of shmem(tmpfs) to -be backed into memory in force, charges for pages are accounted against the -caller of swapoff rather than the users of shmem. - -2.4 Swap Extension (CONFIG_MEMCG_SWAP) - -Swap Extension allows you to record charge for swap. A swapped-in page is -charged back to original page allocator if possible. - -When swap is accounted, following files are added. - - memory.memsw.usage_in_bytes. - - memory.memsw.limit_in_bytes. - -memsw means memory+swap. Usage of memory+swap is limited by -memsw.limit_in_bytes. - -Example: Assume a system with 4G of swap. A task which allocates 6G of memory -(by mistake) under 2G memory limitation will use all swap. -In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. -By using the memsw limit, you can avoid system OOM which can be caused by swap -shortage. - -* why 'memory+swap' rather than swap. -The global LRU(kswapd) can swap out arbitrary pages. 
Swap-out means -to move account from memory to swap...there is no change in usage of -memory+swap. In other words, when we want to limit the usage of swap without -affecting global LRU, memory+swap limit is better than just limiting swap from -an OS point of view. - -* What happens when a cgroup hits memory.memsw.limit_in_bytes -When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out -in this cgroup. Then, swap-out will not be done by cgroup routine and file -caches are dropped. But as mentioned above, global LRU can do swapout memory -from it for sanity of the system's memory management state. You can't forbid -it by cgroup. - -2.5 Reclaim - -Each cgroup maintains a per cgroup LRU which has the same structure as -global VM. When a cgroup goes over its limit, we first try -to reclaim memory from the cgroup so as to make space for the new -pages that the cgroup has touched. If the reclaim is unsuccessful, -an OOM routine is invoked to select and kill the bulkiest task in the -cgroup. (See 10. OOM Control below.) - -The reclaim algorithm has not been modified for cgroups, except that -pages that are selected for reclaiming come from the per-cgroup LRU -list. - -NOTE: Reclaim does not work for the root cgroup, since we cannot set any -limits on the root cgroup. - -Note2: When panic_on_oom is set to "2", the whole system will panic. - -When oom event notifier is registered, event will be delivered. -(See oom_control section) - -2.6 Locking - - lock_page_cgroup()/unlock_page_cgroup() should not be called under - mapping->tree_lock. - - Other lock order is following: - PG_locked. - mm->page_table_lock - zone->lru_lock - lock_page_cgroup. - In many cases, just lock_page_cgroup() is called. - per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by - zone->lru_lock, it has no lock of its own. 
- -2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) - -With the Kernel memory extension, the Memory Controller is able to limit -the amount of kernel memory used by the system. Kernel memory is fundamentally -different than user memory, since it can't be swapped out, which makes it -possible to DoS the system by consuming too much of this precious resource. - -Kernel memory won't be accounted at all until limit on a group is set. This -allows for existing setups to continue working without disruption. The limit -cannot be set if the cgroup have children, or if there are already tasks in the -cgroup. Attempting to set the limit under those conditions will return -EBUSY. -When use_hierarchy == 1 and a group is accounted, its children will -automatically be accounted regardless of their limit value. - -After a group is first limited, it will be kept being accounted until it -is removed. The memory limitation itself, can of course be removed by writing --1 to memory.kmem.limit_in_bytes. In this case, kmem will be accounted, but not -limited. - -Kernel memory limits are not imposed for the root cgroup. Usage for the root -cgroup may or may not be accounted. The memory used is accumulated into -memory.kmem.usage_in_bytes, or in a separate counter when it makes sense. -(currently only for tcp). -The main "kmem" counter is fed into the main counter, so kmem charges will -also be visible from the user counter. - -Currently no soft limit is implemented for kernel memory. It is future work -to trigger slab reclaim when those limits are reached. - -2.7.1 Current Kernel Memory resources accounted - -* stack pages: every process consumes some stack pages. By accounting into -kernel memory, we prevent new processes from being created when the kernel -memory usage is too high. - -* slab pages: pages allocated by the SLAB or SLUB allocator are tracked. A copy -of each kmem_cache is created every time the cache is touched by the first time -from inside the memcg. 
The creation is done lazily, so some objects can still be -skipped while the cache is being created. All objects in a slab page should -belong to the same memcg. This only fails to hold when a task is migrated to a -different memcg during the page allocation by the cache. - -* sockets memory pressure: some sockets protocols have memory pressure -thresholds. The Memory Controller allows them to be controlled individually -per cgroup, instead of globally. - -* tcp memory pressure: sockets memory pressure for the tcp protocol. - -2.7.2 Common use cases - -Because the "kmem" counter is fed to the main user counter, kernel memory can -never be limited completely independently of user memory. Say "U" is the user -limit, and "K" the kernel limit. There are three possible ways limits can be -set: - - U != 0, K = unlimited: - This is the standard memcg limitation mechanism already present before kmem - accounting. Kernel memory is completely ignored. - - U != 0, K < U: - Kernel memory is a subset of the user memory. This setup is useful in - deployments where the total amount of memory per-cgroup is overcommited. - Overcommiting kernel memory limits is definitely not recommended, since the - box can still run out of non-reclaimable memory. - In this case, the admin could set up K so that the sum of all groups is - never greater than the total memory, and freely set U at the cost of his - QoS. - WARNING: In the current implementation, memory reclaim will NOT be - triggered for a cgroup when it hits K while staying below U, which makes - this setup impractical. - - U != 0, K >= U: - Since kmem charges will also be fed to the user counter and reclaim will be - triggered for the cgroup for both kinds of memory. This setup gives the - admin a unified view of memory, and it is also useful for people who just - want to track kernel memory usage. - -3. User Interface - -3.0. Configuration - -a. Enable CONFIG_CGROUPS -b. Enable CONFIG_MEMCG -c. 
Enable CONFIG_MEMCG_SWAP (to use swap extension) -d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) - -3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) -# mount -t tmpfs none /sys/fs/cgroup -# mkdir /sys/fs/cgroup/memory -# mount -t cgroup none /sys/fs/cgroup/memory -o memory - -3.2. Make the new group and move bash into it -# mkdir /sys/fs/cgroup/memory/0 -# echo $$ > /sys/fs/cgroup/memory/0/tasks - -Since now we're in the 0 cgroup, we can alter the memory limit: -# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes - -NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, -mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.) - -NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). -NOTE: We cannot set limits on the root cgroup any more. - -# cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes -4194304 - -We can check the usage: -# cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes -1216512 - -A successful write to this file does not guarantee a successful setting of -this limit to the value written into the file. This can be due to a -number of factors, such as rounding up to page boundaries or the total -availability of memory on the system. The user is required to re-read -this file after a write to guarantee the value committed by the kernel. - -# echo 1 > memory.limit_in_bytes -# cat memory.limit_in_bytes -4096 - -The memory.failcnt field gives the number of times that the cgroup limit was -exceeded. - -The memory.stat file gives accounting information. Now, the number of -caches, RSS and Active pages/Inactive pages are shown. - -4. Testing - -For testing features and implementation, see memcg_test.txt. - -Performance test is also important. To see pure memory controller's overhead, -testing on tmpfs will give you good numbers of small overheads. -Example: do kernel make on tmpfs. - -Page-fault scalability is also important. 
At measuring parallel -page fault test, multi-process test may be better than multi-thread -test because it has noise of shared objects/status. - -But the above two are testing extreme situations. -Trying usual test under memory controller is always helpful. - -4.1 Troubleshooting - -Sometimes a user might find that the application under a cgroup is -terminated by the OOM killer. There are several causes for this: - -1. The cgroup limit is too low (just too low to do anything useful) -2. The user is using anonymous memory and swap is turned off or too low - -A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of -some of the pages cached in the cgroup (page cache pages). - -To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and -seeing what happens will be helpful. - -4.2 Task migration - -When a task migrates from one cgroup to another, its charge is not -carried forward by default. The pages allocated from the original cgroup still -remain charged to it, the charge is dropped when the page is freed or -reclaimed. - -You can move charges of a task along with task migration. -See 8. "Move charges at task migration" - -4.3 Removing a cgroup - -A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a -cgroup might have some charge associated with it, even though all -tasks have migrated away from it. (because we charge against pages, not -against tasks.) - -We move the stats to root (if use_hierarchy==0) or parent (if -use_hierarchy==1), and no change on the charge except uncharging -from the child. - -Charges recorded in swap information is not updated at removal of cgroup. -Recorded information is discarded and a cgroup which uses swap (swapcache) -will be charged as a new owner of it. - -About use_hierarchy, see Section 6. - -5. Misc. interfaces. - -5.1 force_empty - memory.force_empty interface is provided to make cgroup's memory usage empty. 
- When writing anything to this - - # echo 0 > memory.force_empty - - the cgroup will be reclaimed and as many pages reclaimed as possible. - - The typical use case for this interface is before calling rmdir(). - Because rmdir() moves all pages to parent, some out-of-use page caches can be - moved to the parent. If you want to avoid that, force_empty will be useful. - - Also, note that when memory.kmem.limit_in_bytes is set the charges due to - kernel pages will still be seen. This is not considered a failure and the - write will still return success. In this case, it is expected that - memory.kmem.usage_in_bytes == memory.usage_in_bytes. - - About use_hierarchy, see Section 6. - -5.2 stat file - -memory.stat file includes following statistics - -# per-memory cgroup local status -cache - # of bytes of page cache memory. -rss - # of bytes of anonymous and swap cache memory (includes - transparent hugepages). -rss_huge - # of bytes of anonymous transparent hugepages. -mapped_file - # of bytes of mapped file (includes tmpfs/shmem) -pgpgin - # of charging events to the memory cgroup. The charging - event happens each time a page is accounted as either mapped - anon page(RSS) or cache page(Page Cache) to the cgroup. -pgpgout - # of uncharging events to the memory cgroup. The uncharging - event happens each time a page is unaccounted from the cgroup. -swap - # of bytes of swap usage -dirty - # of bytes that are waiting to get written back to the disk. -writeback - # of bytes of file/anon cache that are queued for syncing to - disk. -inactive_anon - # of bytes of anonymous and swap cache memory on inactive - LRU list. -active_anon - # of bytes of anonymous and swap cache memory on active - LRU list. -inactive_file - # of bytes of file-backed memory on inactive LRU list. -active_file - # of bytes of file-backed memory on active LRU list. -unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). 
- -# status considering hierarchy (see memory.use_hierarchy settings) - -hierarchical_memory_limit - # of bytes of memory limit with regard to hierarchy - under which the memory cgroup is -hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to - hierarchy under which memory cgroup is. - -total_<counter> - # hierarchical version of <counter>, which in - addition to the cgroup's own value includes the - sum of all hierarchical children's values of - <counter>, i.e. total_cache - -# The following additional stats are dependent on CONFIG_DEBUG_VM. - -recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) -recent_rotated_file - VM internal parameter. (see mm/vmscan.c) -recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) -recent_scanned_file - VM internal parameter. (see mm/vmscan.c) - -Memo: - recent_rotated means recent frequency of LRU rotation. - recent_scanned means recent # of scans to LRU. - showing for better debug please see the code for meanings. - -Note: - Only anonymous and swap cache memory is listed as part of 'rss' stat. - This should not be confused with the true 'resident set size' or the - amount of physical memory used by the cgroup. - 'rss + mapped_file' will give you resident set size of cgroup. - (Note: file and shmem may be shared among other cgroups. In that case, - mapped_file is accounted only when the memory cgroup is owner of page - cache.) - -5.3 swappiness - -Overrides /proc/sys/vm/swappiness for the particular group. The tunable -in the root cgroup corresponds to the global swappiness setting. - -Please note that unlike during the global reclaim, limit reclaim -enforces that 0 swappiness really prevents from any swapping even if -there is a swap storage available. This might lead to memcg OOM killer -if there are no file pages to reclaim. - -5.4 failcnt - -A memory cgroup provides memory.failcnt and memory.memsw.failcnt files. -This failcnt(== failure count) shows the number of times that a usage counter -hit its limit.
When a memory cgroup hits a limit, failcnt increases and -memory under it will be reclaimed. - -You can reset failcnt by writing 0 to failcnt file. -# echo 0 > .../memory.failcnt - -5.5 usage_in_bytes - -For efficiency, as other kernel components, memory cgroup uses some optimization -to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the -method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz -value for efficient access. (Of course, when necessary, it's synchronized.) -If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP) -value in memory.stat(see 5.2). - -5.6 numa_stat - -This is similar to numa_maps but operates on a per-memcg basis. This is -useful for providing visibility into the numa locality information within -a memcg since the pages are allowed to be allocated from any physical -node. One of the use cases is evaluating application performance by -combining this information with the application's CPU allocation. - -Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable" -per-node page counts including "hierarchical_<counter>" which sums up all -hierarchical children's values in addition to the memcg's own value. - -The output format of memory.numa_stat is: - -total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ... -file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ... -anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ... -unevictable=<total unevictable pages> N0=<node 0 pages> N1=<node 1 pages> ... -hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ... - -The "total" count is sum of file + anon + unevictable. - -6. Hierarchy support - -The memory controller supports a deep hierarchy and hierarchical accounting. -The hierarchy is created by creating the appropriate cgroups in the -cgroup filesystem. Consider for example, the following cgroup filesystem -hierarchy - - root - / | \ - / | \ - a b c - | \ - | \ - d e - -In the diagram above, with hierarchical accounting enabled, all memory -usage of e, is accounted to its ancestors up until the root (i.e, c and root), -that has memory.use_hierarchy enabled.
If one of the ancestors goes over its -limit, the reclaim algorithm reclaims from the tasks in the ancestor and the -children of the ancestor. - -6.1 Enabling hierarchical accounting and reclaim - -A memory cgroup by default disables the hierarchy feature. Support -can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup - -# echo 1 > memory.use_hierarchy - -The feature can be disabled by - -# echo 0 > memory.use_hierarchy - -NOTE1: Enabling/disabling will fail if either the cgroup already has other - cgroups created below it, or if the parent cgroup has use_hierarchy - enabled. - -NOTE2: When panic_on_oom is set to "2", the whole system will panic in - case of an OOM event in any cgroup. - -7. Soft limits - -Soft limits allow for greater sharing of memory. The idea behind soft limits -is to allow control groups to use as much of the memory as needed, provided - -a. There is no memory contention -b. They do not exceed their hard limit - -When the system detects memory contention or low memory, control groups -are pushed back to their soft limits. If the soft limit of each control -group is very high, they are pushed back as much as possible to make -sure that one control group does not starve the others of memory. - -Please note that soft limits is a best-effort feature; it comes with -no guarantees, but it does its best to make sure that when memory is -heavily contended for, memory is allocated based on the soft limit -hints/setup. Currently soft limit based reclaim is set up such that -it gets invoked from balance_pgdat (kswapd). 
- -7.1 Interface - -Soft limits can be setup by using the following commands (in this example we -assume a soft limit of 256 MiB) - -# echo 256M > memory.soft_limit_in_bytes - -If we want to change this to 1G, we can at any time use - -# echo 1G > memory.soft_limit_in_bytes - -NOTE1: Soft limits take effect over a long period of time, since they involve - reclaiming memory for balancing between memory cgroups -NOTE2: It is recommended to set the soft limit always below the hard limit, - otherwise the hard limit will take precedence. - -8. Move charges at task migration - -Users can move charges associated with a task along with task migration, that -is, uncharge task's pages from the old cgroup and charge them to the new cgroup. -This feature is not supported in !CONFIG_MMU environments because of lack of -page tables. - -8.1 Interface - -This feature is disabled by default. It can be enabled (and disabled again) by -writing to memory.move_charge_at_immigrate of the destination cgroup. - -If you want to enable it: - -# echo (some positive value) > memory.move_charge_at_immigrate - -Note: Each bits of move_charge_at_immigrate has its own meaning about what type - of charges should be moved. See 8.2 for details. -Note: Charges are moved only when you move mm->owner, in other words, - a leader of a thread group. -Note: If we cannot find enough space for the task in the destination cgroup, we - try to make space by reclaiming memory. Task migration may fail if we - cannot make enough space. -Note: It can take several seconds if you move charges much. - -And if you want disable it again: - -# echo 0 > memory.move_charge_at_immigrate - -8.2 Type of charges which can be moved - -Each bit in move_charge_at_immigrate has its own meaning about what type of -charges should be moved. But in any case, it must be noted that an account of -a page or a swap can be moved only when it is charged to the task's current -(old) memory cgroup. 
- - bit | what type of charges would be moved ? - -----+------------------------------------------------------------------------ - 0 | A charge of an anonymous page (or swap of it) used by the target task. - | You must enable Swap Extension (see 2.4) to enable move of swap charges. - -----+------------------------------------------------------------------------ - 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) - | and swaps of tmpfs file) mmapped by the target task. Unlike the case of - | anonymous pages, file pages (and swaps) in the range mmapped by the task - | will be moved even if the task hasn't done page fault, i.e. they might - | not be the task's "RSS", but other task's "RSS" that maps the same file. - | And mapcount of the page is ignored (the page can be moved even if - | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to - | enable move of swap charges. - -8.3 TODO - -- All of moving charge operations are done under cgroup_mutex. It's not good - behavior to hold the mutex too long, so we may need some trick. - -9. Memory thresholds - -Memory cgroup implements memory thresholds using the cgroups notification -API (see cgroups.txt). It allows to register multiple memory and memsw -thresholds and gets notifications when it crosses. - -To register a threshold, an application must: -- create an eventfd using eventfd(2); -- open memory.usage_in_bytes or memory.memsw.usage_in_bytes; -- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to - cgroup.event_control. - -Application will be notified through eventfd when memory usage crosses -threshold in any direction. - -It's applicable for root and non-root cgroup. - -10. OOM Control - -memory.oom_control file is for OOM notification and other controls. - -Memory cgroup implements OOM notifier using the cgroup notification -API (See cgroups.txt). It allows to register multiple OOM notification -delivery and gets notification when OOM happens.
- -To register a notifier, an application must: - - create an eventfd using eventfd(2) - - open memory.oom_control file - - write string like "<event_fd> <fd of memory.oom_control>" to - cgroup.event_control - -The application will be notified through eventfd when OOM happens. -OOM notification doesn't work for the root cgroup. - -You can disable the OOM-killer by writing "1" to memory.oom_control file, as: - - #echo 1 > memory.oom_control - -If OOM-killer is disabled, tasks under cgroup will hang/sleep -in memory cgroup's OOM-waitqueue when they request accountable memory. - -For running them, you have to relax the memory cgroup's OOM status by - * enlarge limit or reduce usage. -To reduce usage, - * kill some tasks. - * move some tasks to other group with account migration. - * remove some files (on tmpfs?) - -Then, stopped tasks will work again. - -At reading, current status of OOM is shown. - oom_kill_disable 0 or 1 (if 1, oom-killer is disabled) - under_oom 0 or 1 (if 1, the memory cgroup is under OOM, tasks may - be stopped.) - -11. Memory Pressure - -The pressure level notifications can be used to monitor the memory -allocation cost; based on the pressure, applications can implement -different strategies of managing their memory resources. The pressure -levels are defined as following: - -The "low" level means that the system is reclaiming memory for new -allocations. Monitoring this reclaiming activity might be useful for -maintaining cache level. Upon notification, the program (typically -"Activity Manager") might analyze vmstat and act in advance (i.e. -prematurely shutdown unimportant services). - -The "medium" level means that the system is experiencing medium memory -pressure, the system might be making swap, paging out active file caches, -etc. Upon this event applications may decide to further analyze -vmstat/zoneinfo/memcg or internal memory usage statistics and free any -resources that can be easily reconstructed or re-read from a disk.
- -The "critical" level means that the system is actively thrashing, it is -about to run out of memory (OOM) or even the in-kernel OOM killer is on its -way to trigger. Applications should do whatever they can to help the -system. It might be too late to consult with vmstat or any other -statistics, so it's advisable to take an immediate action. - -The events are propagated upward until the event is handled, i.e. the -events are not pass-through. Here is what this means: for example you have -three cgroups: A->B->C. Now you set up an event listener on cgroups A, B -and C, and suppose group C experiences some pressure. In this situation, -only group C will receive the notification, i.e. groups A and B will not -receive it. This is done to avoid excessive "broadcasting" of messages, -which disturbs the system and which is especially bad if we are low on -memory or thrashing. So, organize the cgroups wisely, or propagate the -events manually (or, ask us to implement the pass-through events, -explaining why you would need them.) - -The file memory.pressure_level is only used to setup an eventfd. To -register a notification, an application must: - -- create an eventfd using eventfd(2); -- open memory.pressure_level; -- write string like "<event_fd> <fd of memory.pressure_level> <level>" - to cgroup.event_control. - -Application will be notified through eventfd when memory pressure is at -the specific level (or higher). Read/write operations to -memory.pressure_level are not implemented. - -Test: - - Here is a small script example that makes a new cgroup, sets up a - memory limit, sets up a notification in the cgroup and then makes child - cgroup experience a critical pressure: - - # cd /sys/fs/cgroup/memory/ - # mkdir foo - # cd foo - # cgroup_event_listener memory.pressure_level low & - # echo 8000000 > memory.limit_in_bytes - # echo 8000000 > memory.memsw.limit_in_bytes - # echo $$ > tasks - # dd if=/dev/zero | read x - - (Expect a bunch of notifications, and eventually, the oom-killer will - trigger.) - -12.
TODO - -1. Make per-cgroup scanner reclaim not-shared pages first -2. Teach controller to account for shared-pages -3. Start reclamation in the background when the limit is - not yet hit but the usage is getting closer - -Summary - -Overall, the memory controller has been a stable controller and has been -commented and discussed quite extensively in the community. - -References - -1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ -2. Singh, Balbir. Memory Controller (RSS Control), - http://lwn.net/Articles/222762/ -3. Emelianov, Pavel. Resource controllers based on process cgroups - http://lkml.org/lkml/2007/3/6/198 -4. Emelianov, Pavel. RSS controller based on process cgroups (v2) - http://lkml.org/lkml/2007/4/9/78 -5. Emelianov, Pavel. RSS controller based on process cgroups (v3) - http://lkml.org/lkml/2007/5/30/244 -6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ -7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control - subsystem (v3), http://lwn.net/Articles/235534/ -8. Singh, Balbir. RSS controller v2 test results (lmbench), - http://lkml.org/lkml/2007/5/17/232 -9. Singh, Balbir. RSS controller v2 AIM9 results - http://lkml.org/lkml/2007/5/18/1 -10. Singh, Balbir. Memory controller v6 test results, - http://lkml.org/lkml/2007/8/19/36 -11. Singh, Balbir. Memory controller introduction (v6), - http://lkml.org/lkml/2007/8/17/69 -12. Corbet, Jonathan, Controlling memory use in cgroups, - http://lwn.net/Articles/243795/ diff --git a/Documentation/cgroups/net_cls.txt b/Documentation/cgroups/net_cls.txt deleted file mode 100644 index ec182346d..000000000 --- a/Documentation/cgroups/net_cls.txt +++ /dev/null @@ -1,39 +0,0 @@ -Network classifier cgroup -------------------------- - -The Network classifier cgroup provides an interface to -tag network packets with a class identifier (classid). - -The Traffic Controller (tc) can be used to assign -different priorities to packets from different cgroups. 
-Also, Netfilter (iptables) can use this tag to perform -actions on such packets. - -Creating a net_cls cgroups instance creates a net_cls.classid file. -This net_cls.classid value is initialized to 0. - -You can write hexadecimal values to net_cls.classid; the format for these -values is 0xAAAABBBB; AAAA is the major handle number and BBBB -is the minor handle number. -Reading net_cls.classid yields a decimal result. - -Example: -mkdir /sys/fs/cgroup/net_cls -mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls -mkdir /sys/fs/cgroup/net_cls/0 -echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid - - setting a 10:1 handle. - -cat /sys/fs/cgroup/net_cls/0/net_cls.classid -1048577 - -configuring tc: -tc qdisc add dev eth0 root handle 10: htb - -tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit - - creating traffic class 10:1 - -tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup - -configuring iptables, basic example: -iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP diff --git a/Documentation/cgroups/net_prio.txt b/Documentation/cgroups/net_prio.txt deleted file mode 100644 index a82cbd28e..000000000 --- a/Documentation/cgroups/net_prio.txt +++ /dev/null @@ -1,55 +0,0 @@ -Network priority cgroup -------------------------- - -The Network priority cgroup provides an interface to allow an administrator to -dynamically set the priority of network traffic generated by various -applications - -Nominally, an application would set the priority of its traffic via the -SO_PRIORITY socket option. This however, is not always possible because: - -1) The application may not have been coded to set this value -2) The priority of application traffic is often a site-specific administrative - decision rather than an application defined one. - -This cgroup allows an administrator to assign a process to a group which defines -the priority of egress traffic on a given interface. 
Network priority groups can -be created by first mounting the cgroup filesystem. - -# mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio - -With the above step, the initial group acting as the parent accounting group -becomes visible at '/sys/fs/cgroup/net_prio'. This group includes all tasks in -the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup. - -Each net_prio cgroup contains two files that are subsystem specific - -net_prio.prioidx -This file is read-only, and is simply informative. It contains a unique integer -value that the kernel uses as an internal representation of this cgroup. - -net_prio.ifpriomap -This file contains a map of the priorities assigned to traffic originating from -processes in this group and egressing the system on various interfaces. It -contains a list of tuples in the form <ifname priority>. Contents of this file -can be modified by echoing a string into the file using the same tuple format. -for example: - -echo "eth0 5" > /sys/fs/cgroup/net_prio/iscsi/net_prio.ifpriomap - -This command would force any traffic originating from processes belonging to the -iscsi net_prio cgroup and egressing on interface eth0 to have the priority of -said traffic set to the value 5. The parent accounting group also has a -writeable 'net_prio.ifpriomap' file that can be used to set a system default -priority. - -Priorities are set immediately prior to queueing a frame to the device -queueing discipline (qdisc) so priorities will be assigned prior to the hardware -queue selection being made. - -One usage for the net_prio cgroup is with mqprio qdisc allowing application -traffic to be steered to hardware/driver based traffic classes. These mappings -can then be managed by administrators or other networking protocols such as -DCBX. - -A new net_prio cgroup inherits the parent's configuration.
diff --git a/Documentation/cgroups/pids.txt b/Documentation/cgroups/pids.txt deleted file mode 100644 index 1a078b5d2..000000000 --- a/Documentation/cgroups/pids.txt +++ /dev/null @@ -1,85 +0,0 @@ - Process Number Controller - ========================= - -Abstract --------- - -The process number controller is used to allow a cgroup hierarchy to stop any -new tasks from being fork()'d or clone()'d after a certain limit is reached. - -Since it is trivial to hit the task limit without hitting any kmemcg limits in -place, PIDs are a fundamental resource. As such, PID exhaustion must be -preventable in the scope of a cgroup hierarchy by allowing resource limiting of -the number of tasks in a cgroup. - -Usage ------ - -In order to use the `pids` controller, set the maximum number of tasks in -pids.max (this is not available in the root cgroup for obvious reasons). The -number of processes currently in the cgroup is given by pids.current. - -Organisational operations are not blocked by cgroup policies, so it is possible -to have pids.current > pids.max. This can be done by either setting the limit to -be smaller than pids.current, or attaching enough processes to the cgroup such -that pids.current > pids.max. However, it is not possible to violate a cgroup -policy through fork() or clone(). fork() and clone() will return -EAGAIN if the -creation of a new process would cause a cgroup policy to be violated. - -To set a cgroup to have no limit, set pids.max to "max". This is the default for -all new cgroups (N.B. that PID limits are hierarchical, so the most stringent -limit in the hierarchy is followed). - -pids.current tracks all child cgroup hierarchies, so parent/pids.current is a -superset of parent/child/pids.current. 
- -Example -------- - -First, we mount the pids controller: -# mkdir -p /sys/fs/cgroup/pids -# mount -t cgroup -o pids none /sys/fs/cgroup/pids - -Then we create a hierarchy, set limits and attach processes to it: -# mkdir -p /sys/fs/cgroup/pids/parent/child -# echo 2 > /sys/fs/cgroup/pids/parent/pids.max -# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs -# cat /sys/fs/cgroup/pids/parent/pids.current -2 -# - -It should be noted that attempts to overcome the set limit (2 in this case) will -fail: - -# cat /sys/fs/cgroup/pids/parent/pids.current -2 -# ( /bin/echo "Here's some processes for you." | cat ) -sh: fork: Resource temporary unavailable -# - -Even if we migrate to a child cgroup (which doesn't have a set limit), we will -not be able to overcome the most stringent limit in the hierarchy (in this case, -parent's): - -# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs -# cat /sys/fs/cgroup/pids/parent/pids.current -2 -# cat /sys/fs/cgroup/pids/parent/child/pids.current -2 -# cat /sys/fs/cgroup/pids/parent/child/pids.max -max -# ( /bin/echo "Here's some processes for you." | cat ) -sh: fork: Resource temporary unavailable -# - -We can set a limit that is smaller than pids.current, which will stop any new -processes from being forked at all (note that the shell itself counts towards -pids.current): - -# echo 1 > /sys/fs/cgroup/pids/parent/pids.max -# /bin/echo "We can't even spawn a single process now." -sh: fork: Resource temporary unavailable -# echo 0 > /sys/fs/cgroup/pids/parent/pids.max -# /bin/echo "We can't even spawn a single process now." 
-sh: fork: Resource temporary unavailable -# diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt deleted file mode 100644 index 781b1d475..000000000 --- a/Documentation/cgroups/unified-hierarchy.txt +++ /dev/null @@ -1,647 +0,0 @@ - -Cgroup unified hierarchy - -April, 2014 Tejun Heo - -This document describes the changes made by unified hierarchy and -their rationales. It will eventually be merged into the main cgroup -documentation. - -CONTENTS - -1. Background -2. Basic Operation - 2-1. Mounting - 2-2. cgroup.subtree_control - 2-3. cgroup.controllers -3. Structural Constraints - 3-1. Top-down - 3-2. No internal tasks -4. Delegation - 4-1. Model of delegation - 4-2. Common ancestor rule -5. Other Changes - 5-1. [Un]populated Notification - 5-2. Other Core Changes - 5-3. Controller File Conventions - 5-3-1. Format - 5-3-2. Control Knobs - 5-4. Per-Controller Changes - 5-4-1. io - 5-4-2. cpuset - 5-4-3. memory -6. Planned Changes - 6-1. CAP for resource control - - -1. Background - -cgroup allows an arbitrary number of hierarchies and each hierarchy -can host any number of controllers. While this seems to provide a -high level of flexibility, it isn't quite useful in practice. - -For example, as there is only one instance of each controller, utility -type controllers such as freezer which can be useful in all -hierarchies can only be used in one. The issue is exacerbated by the -fact that controllers can't be moved around once hierarchies are -populated. Another issue is that all controllers bound to a hierarchy -are forced to have exactly the same view of the hierarchy. It isn't -possible to vary the granularity depending on the specific controller. - -In practice, these issues heavily limit which controllers can be put -on the same hierarchy and most configurations resort to putting each -controller on its own hierarchy. 
Only closely related ones, such as -the cpu and cpuacct controllers, make sense to put on the same -hierarchy. This often means that userland ends up managing multiple -similar hierarchies repeating the same steps on each hierarchy -whenever a hierarchy management operation is necessary. - -Unfortunately, support for multiple hierarchies comes at a steep cost. -Internal implementation in cgroup core proper is dazzlingly -complicated but more importantly the support for multiple hierarchies -restricts how cgroup is used in general and what controllers can do. - -There's no limit on how many hierarchies there may be, which means -that a task's cgroup membership can't be described in finite length. -The key may contain any varying number of entries and is unlimited in -length, which makes it highly awkward to handle and leads to addition -of controllers which exist only to identify membership, which in turn -exacerbates the original problem. - -Also, as a controller can't have any expectation regarding what shape -of hierarchies other controllers would be on, each controller has to -assume that all other controllers are operating on completely -orthogonal hierarchies. This makes it impossible, or at least very -cumbersome, for controllers to cooperate with each other. - -In most use cases, putting controllers on hierarchies which are -completely orthogonal to each other isn't necessary. What usually is -called for is the ability to have differing levels of granularity -depending on the specific controller. In other words, hierarchy may -be collapsed from leaf towards root when viewed from specific -controllers. For example, a given configuration might not care about -how memory is distributed beyond a certain level while still wanting -to control how CPU cycles are distributed. - -Unified hierarchy is the next version of cgroup interface. It aims to -address the aforementioned issues by having more structure while -retaining enough flexibility for most use cases. 
Various other -general and controller-specific interface issues are also addressed in -the process. - - -2. Basic Operation - -2-1. Mounting - -Currently, unified hierarchy can be mounted with the following mount -command. Note that this is still under development and scheduled to -change soon. - - mount -t cgroup -o __DEVEL__sane_behavior cgroup $MOUNT_POINT - -All controllers which support the unified hierarchy and are not bound -to other hierarchies are automatically bound to unified hierarchy and -show up at the root of it. Controllers which are enabled only in the -root of unified hierarchy can be bound to other hierarchies. This -allows mixing unified hierarchy with the traditional multiple -hierarchies in a fully backward compatible way. - -A controller can be moved across hierarchies only after the controller -is no longer referenced in its current hierarchy. Because per-cgroup -controller states are destroyed asynchronously and controllers may -have lingering references, a controller may not show up immediately on -the unified hierarchy after the final umount of the previous -hierarchy. Similarly, a controller should be fully disabled to be -moved out of the unified hierarchy and it may take some time for the -disabled controller to become available for other hierarchies; -furthermore, due to dependencies among controllers, other controllers -may need to be disabled too. - -While useful for development and manual configurations, dynamically -moving controllers between the unified and other hierarchies is -strongly discouraged for production use. It is recommended to decide -the hierarchies and controller associations before starting using the -controllers. - - -2-2. cgroup.subtree_control - -All cgroups on unified hierarchy have a "cgroup.subtree_control" file -which governs which controllers are enabled on the children of the -cgroup. Let's assume a hierarchy like the following. 
- - root - A - B - C - \ D - -root's "cgroup.subtree_control" file determines which controllers are -enabled on A. A's on B. B's on C and D. This coincides with the -fact that controllers on the immediate sub-level are used to -distribute the resources of the parent. In fact, it's natural to -assume that resource control knobs of a child belong to its parent. -Enabling a controller in a "cgroup.subtree_control" file declares that -distribution of the respective resources of the cgroup will be -controlled. Note that this means that controller enable states are -shared among siblings. - -When read, the file contains a space-separated list of currently -enabled controllers. A write to the file should contain a -space-separated list of controllers with '+' or '-' prefixed (without -the quotes). Controllers prefixed with '+' are enabled and '-' -disabled. If a controller is listed multiple times, the last entry -wins. The specific operations are executed atomically - either all -succeed or fail. - - -2-3. cgroup.controllers - -Read-only "cgroup.controllers" file contains a space-separated list of -controllers which can be enabled in the cgroup's -"cgroup.subtree_control" file. - -In the root cgroup, this lists controllers which are not bound to -other hierarchies and the content changes as controllers are bound to -and unbound from other hierarchies. - -In non-root cgroups, the content of this file equals that of the -parent's "cgroup.subtree_control" file as only controllers enabled -from the parent can be used in its children. - - -3. Structural Constraints - -3-1. Top-down - -As it doesn't make sense to nest control of an uncontrolled resource, -all non-root "cgroup.subtree_control" files can only contain -controllers which are enabled in the parent's "cgroup.subtree_control" -file. A controller can be enabled only if the parent has the -controller enabled and a controller can't be disabled if one or more -children have it enabled. - - -3-2. 
No internal tasks - -One long-standing issue that cgroup faces is the competition between -tasks belonging to the parent cgroup and its children cgroups. This -is inherently nasty as two different types of entities compete and -there is no agreed-upon obvious way to handle it. Different -controllers are doing different things. - -The cpu controller considers tasks and cgroups as equivalents and maps -nice levels to cgroup weights. This works for some cases but falls -flat when children should be allocated specific ratios of CPU cycles -and the number of internal tasks fluctuates - the ratios constantly -change as the number of competing entities fluctuates. There also are -other issues. The mapping from nice level to weight isn't obvious or -universal, and there are various other knobs which simply aren't -available for tasks. - -The io controller implicitly creates a hidden leaf node for each -cgroup to host the tasks. The hidden leaf has its own copies of all -the knobs with "leaf_" prefixed. While this allows equivalent control -over internal tasks, it's with serious drawbacks. It always adds an -extra layer of nesting which may not be necessary, makes the interface -messy and significantly complicates the implementation. - -The memory controller currently doesn't have a way to control what -happens between internal tasks and child cgroups and the behavior is -not clearly defined. There have been attempts to add ad-hoc behaviors -and knobs to tailor the behavior to specific workloads. Continuing -this direction will lead to problems which will be extremely difficult -to resolve in the long term. - -Multiple controllers struggle with internal tasks and came up with -different ways to deal with it; unfortunately, all the approaches in -use now are severely flawed and, furthermore, the widely different -behaviors make cgroup as whole highly inconsistent. 
- -It is clear that this is something which needs to be addressed from -cgroup core proper in a uniform way so that controllers don't need to -worry about it and cgroup as a whole shows a consistent and logical -behavior. To achieve that, unified hierarchy enforces the following -structural constraint: - - Except for the root, only cgroups which don't contain any task may - have controllers enabled in their "cgroup.subtree_control" files. - -Combined with other properties, this guarantees that, when a -controller is looking at the part of the hierarchy which has it -enabled, tasks are always only on the leaves. This rules out -situations where child cgroups compete against internal tasks of the -parent. - -There are two things to note. Firstly, the root cgroup is exempt from -the restriction. Root contains tasks and anonymous resource -consumption which can't be associated with any other cgroup and -requires special treatment from most controllers. How resource -consumption in the root cgroup is governed is up to each controller. - -Secondly, the restriction doesn't take effect if there is no enabled -controller in the cgroup's "cgroup.subtree_control" file. This is -important as otherwise it wouldn't be possible to create children of a -populated cgroup. To control resource distribution of a cgroup, the -cgroup must create children and transfer all its tasks to the children -before enabling controllers in its "cgroup.subtree_control" file. - - -4. Delegation - -4-1. Model of delegation - -A cgroup can be delegated to a less privileged user by granting write -access of the directory and its "cgroup.procs" file to the user. Note -that the resource control knobs in a given directory concern the -resources of the parent and thus must not be delegated along with the -directory. - -Once delegated, the user can build sub-hierarchy under the directory, -organize processes as it sees fit and further distribute the resources -it got from the parent. 
The limits and other settings of all resource -controllers are hierarchical and regardless of what happens in the -delegated sub-hierarchy, nothing can escape the resource restrictions -imposed by the parent. - -Currently, cgroup doesn't impose any restrictions on the number of -cgroups in or nesting depth of a delegated sub-hierarchy; however, -this may in the future be limited explicitly. - - -4-2. Common ancestor rule - -On the unified hierarchy, to write to a "cgroup.procs" file, in -addition to the usual write permission to the file and uid match, the -writer must also have write access to the "cgroup.procs" file of the -common ancestor of the source and destination cgroups. This prevents -delegatees from smuggling processes across disjoint sub-hierarchies. - -Let's say cgroups C0 and C1 have been delegated to user U0 who created -C00, C01 under C0 and C10 under C1 as follows. - - ~~~~~~~~~~~~~ - C0 - C00 - ~ cgroup ~ \ C01 - ~ hierarchy ~ - ~~~~~~~~~~~~~ - C1 - C10 - -C0 and C1 are separate entities in terms of resource distribution -regardless of their relative positions in the hierarchy. The -resources the processes under C0 are entitled to are controlled by -C0's ancestors and may be completely different from C1. It's clear -that the intention of delegating C0 to U0 is allowing U0 to organize -the processes under C0 and further control the distribution of C0's -resources. - -On traditional hierarchies, if a task has write access to "tasks" or -"cgroup.procs" file of a cgroup and its uid agrees with the target, it -can move the target to the cgroup. In the above example, U0 will not -only be able to move processes in each sub-hierarchy but also across -the two sub-hierarchies, effectively allowing it to violate the -organizational and resource restrictions implied by the hierarchical -structure above C0 and C1. 
- -On the unified hierarchy, let's say U0 wants to write the pid of a -process which has a matching uid and is currently in C10 into -"C00/cgroup.procs". U0 obviously has write access to the file and -migration permission on the process; however, the common ancestor of -the source cgroup C10 and the destination cgroup C00 is above the -points of delegation and U0 would not have write access to its -"cgroup.procs" and thus be denied with -EACCES. - - -5. Other Changes - -5-1. [Un]populated Notification - -cgroup users often need a way to determine when a cgroup's -subhierarchy becomes empty so that it can be cleaned up. cgroup -currently provides release_agent for it; unfortunately, this mechanism -is riddled with issues. - -- It delivers events by forking and execing a userland binary - specified as the release_agent. This is a long deprecated method of - notification delivery. It's extremely heavy, slow and cumbersome to - integrate with larger infrastructure. - -- There is single monitoring point at the root. There's no way to - delegate management of a subtree. - -- The event isn't recursive. It triggers when a cgroup doesn't have - any tasks or child cgroups. Events for internal nodes trigger only - after all children are removed. This again makes it impossible to - delegate management of a subtree. - -- Events are filtered from the kernel side. A "notify_on_release" - file is used to subscribe to or suppress release events. This is - unnecessarily complicated and probably done this way because event - delivery itself was expensive. - -Unified hierarchy implements "populated" field in "cgroup.events" -interface file which can be used to monitor whether the cgroup's -subhierarchy has tasks in it or not. Its value is 0 if there is no -task in the cgroup and its descendants; otherwise, 1. poll and -[id]notify events are triggered when the value changes. 
- -This is significantly lighter and simpler and trivially allows -delegating management of subhierarchy - subhierarchy monitoring can -block further propagation simply by putting itself or another process -in the subhierarchy and monitor events that it's interested in from -there without interfering with monitoring higher in the tree. - -In unified hierarchy, the release_agent mechanism is no longer -supported and the interface files "release_agent" and -"notify_on_release" do not exist. - - -5-2. Other Core Changes - -- None of the mount options is allowed. - -- remount is disallowed. - -- rename(2) is disallowed. - -- The "tasks" file is removed. Everything should at process - granularity. Use the "cgroup.procs" file instead. - -- The "cgroup.procs" file is not sorted. pids will be unique unless - they got recycled in-between reads. - -- The "cgroup.clone_children" file is removed. - -- /proc/PID/cgroup keeps reporting the cgroup that a zombie belonged - to before exiting. If the cgroup is removed before the zombie is - reaped, " (deleted)" is appeneded to the path. - - -5-3. Controller File Conventions - -5-3-1. Format - -In general, all controller files should be in one of the following -formats whenever possible. - -- Values only files - - VAL0 VAL1...\n - -- Flat keyed files - - KEY0 VAL0\n - KEY1 VAL1\n - ... - -- Nested keyed files - - KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01... - KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11... - ... - -For a writeable file, the format for writing should generally match -reading; however, controllers may allow omitting later fields or -implement restricted shortcuts for most common use cases. - -For both flat and nested keyed files, only the values for a single key -can be written at a time. For nested keyed files, the sub key pairs -may be specified in any order and not all pairs have to be specified. - - -5-3-2. Control Knobs - -- Settings for a single feature should generally be implemented in a - single file. 
- -- In general, the root cgroup should be exempt from resource control - and thus shouldn't have resource control knobs. - -- If a controller implements ratio based resource distribution, the - control knob should be named "weight" and have the range [1, 10000] - and 100 should be the default value. The values are chosen to allow - enough and symmetric bias in both directions while keeping it - intuitive (the default is 100%). - -- If a controller implements an absolute resource guarantee and/or - limit, the control knobs should be named "min" and "max" - respectively. If a controller implements best effort resource - gurantee and/or limit, the control knobs should be named "low" and - "high" respectively. - - In the above four control files, the special token "max" should be - used to represent upward infinity for both reading and writing. - -- If a setting has configurable default value and specific overrides, - the default settings should be keyed with "default" and appear as - the first entry in the file. Specific entries can use "default" as - its value to indicate inheritance of the default value. - -- For events which are not very high frequency, an interface file - "events" should be created which lists event key value pairs. - Whenever a notifiable event happens, file modified event should be - generated on the file. - - -5-4. Per-Controller Changes - -5-4-1. io - -- blkio is renamed to io. The interface is overhauled anyway. The - new name is more in line with the other two major controllers, cpu - and memory, and better suited given that it may be used for cgroup - writeback without involving block layer. - -- Everything including stat is always hierarchical making separate - recursive stat files pointless and, as no internal node can have - tasks, leaf weights are meaningless. The operation model is - simplified and the interface is overhauled accordingly. - - io.stat - - The stat file. 
The reported stats are from the point where - bio's are issued to request_queue. The stats are counted - independent of which policies are enabled. Each line in the - file follows the following format. More fields may later be - added at the end. - - $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wrios=$WIOS - - io.weight - - The weight setting, currently only available and effective if - cfq-iosched is in use for the target device. The weight is - between 1 and 10000 and defaults to 100. The first line - always contains the default weight in the following format to - use when per-device setting is missing. - - default $WEIGHT - - Subsequent lines list per-device weights of the following - format. - - $MAJ:$MIN $WEIGHT - - Writing "$WEIGHT" or "default $WEIGHT" changes the default - setting. Writing "$MAJ:$MIN $WEIGHT" sets per-device weight - while "$MAJ:$MIN default" clears it. - - This file is available only on non-root cgroups. - - io.max - - The maximum bandwidth and/or iops setting, only available if - blk-throttle is enabled. The file is of the following format. - - $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS - - ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are - read/write IOs per second. "max" indicates no limit. Writing - to the file follows the same format but the individual - settings may be omitted or specified in any order. - - This file is available only on non-root cgroups. - - -5-4-2. cpuset - -- Tasks are kept in empty cpusets after hotplug and take on the masks - of the nearest non-empty ancestor, instead of being moved to it. - -- A task can be moved into an empty cpuset, and again it takes on the - masks of the nearest non-empty ancestor. - - -5-4-3. memory - -- use_hierarchy is on by default and the cgroup file for the flag is - not created. - -- The original lower boundary, the soft limit, is defined as a limit - that is per default unset. 
As a result, the set of cgroups that - global reclaim prefers is opt-in, rather than opt-out. The costs - for optimizing these mostly negative lookups are so high that the - implementation, despite its enormous size, does not even provide the - basic desirable behavior. First off, the soft limit has no - hierarchical meaning. All configured groups are organized in a - global rbtree and treated like equal peers, regardless where they - are located in the hierarchy. This makes subtree delegation - impossible. Second, the soft limit reclaim pass is so aggressive - that it not just introduces high allocation latencies into the - system, but also impacts system performance due to overreclaim, to - the point where the feature becomes self-defeating. - - The memory.low boundary on the other hand is a top-down allocated - reserve. A cgroup enjoys reclaim protection when it and all its - ancestors are below their low boundaries, which makes delegation of - subtrees possible. Secondly, new cgroups have no reserve per - default and in the common case most cgroups are eligible for the - preferred reclaim pass. This allows the new low boundary to be - efficiently implemented with just a minor addition to the generic - reclaim code, without the need for out-of-band data structures and - reclaim passes. Because the generic reclaim code considers all - cgroups except for the ones running low in the preferred first - reclaim pass, overreclaim of individual groups is eliminated as - well, resulting in much better overall workload performance. - -- The original high boundary, the hard limit, is defined as a strict - limit that can not budge, even if the OOM killer has to be called. - But this generally goes against the goal of making the most out of - the available memory. The memory consumption of workloads varies - during runtime, and that requires users to overcommit. 
But doing - that with a strict upper limit requires either a fairly accurate - prediction of the working set size or adding slack to the limit. - Since working set size estimation is hard and error prone, and - getting it wrong results in OOM kills, most users tend to err on the - side of a looser limit and end up wasting precious resources. - - The memory.high boundary on the other hand can be set much more - conservatively. When hit, it throttles allocations by forcing them - into direct reclaim to work off the excess, but it never invokes the - OOM killer. As a result, a high boundary that is chosen too - aggressively will not terminate the processes, but instead it will - lead to gradual performance degradation. The user can monitor this - and make corrections until the minimal memory footprint that still - gives acceptable performance is found. - - In extreme cases, with many concurrent allocations and a complete - breakdown of reclaim progress within the group, the high boundary - can be exceeded. But even then it's mostly better to satisfy the - allocation from the slack available in other groups or the rest of - the system than killing the group. Otherwise, memory.max is there - to limit this type of spillover and ultimately contain buggy or even - malicious applications. - -- The original control file names are unwieldy and inconsistent in - many different ways. For example, the upper boundary hit count is - exported in the memory.failcnt file, but an OOM event count has to - be manually counted by listening to memory.oom_control events, and - lower boundary / soft limit events have to be counted by first - setting a threshold for that value and then counting those events. - Also, usage and limit files encode their units in the filename. - That makes the filenames very long, even though this is not - information that a user needs to be reminded of every time they type - out those names. 
- - To address these naming issues, as well as to signal clearly that - the new interface carries a new configuration model, the naming - conventions in it necessarily differ from the old interface. - -- The original limit files indicate the state of an unset limit with a - Very High Number, and a configured limit can be unset by echoing -1 - into those files. But that very high number is implementation and - architecture dependent and not very descriptive. And while -1 can - be understood as an underflow into the highest possible value, -2 or - -10M etc. do not work, so it's not consistent. - - memory.low, memory.high, and memory.max will use the string "max" to - indicate and set the highest possible value. - -6. Planned Changes - -6-1. CAP for resource control - -Unified hierarchy will require one of the capabilities(7), which is -yet to be decided, for all resource control related knobs. Process -organization operations - creation of sub-cgroups and migration of -processes in sub-hierarchies may be delegated by changing the -ownership and/or permissions on the cgroup directory and -"cgroup.procs" interface file; however, all operations which affect -resource control - writes to a "cgroup.subtree_control" file or any -controller-specific knobs - will require an explicit CAP privilege. - -This, in part, is to prevent the cgroup interface from being -inadvertently promoted to programmable API used by non-privileged -binaries. cgroup exposes various aspects of the system in ways which -aren't properly abstracted for direct consumption by regular programs. -This is an administration interface much closer to sysctl knobs than -system calls. Even the basic access model, being filesystem path -based, isn't suitable for direct consumption. There's no way to -access "my cgroup" in a race-free way or make multiple operations -atomic against migration to another cgroup. 
- -Another aspect is that, for better or for worse, the cgroup interface -goes through far less scrutiny than regular interfaces for -unprivileged userland. The upside is that cgroup is able to expose -useful features which may not be suitable for general consumption in a -reasonable time frame. It provides a relatively short path between -internal details and userland-visible interface. Of course, this -shortcut comes with high risk. We go through what we go through for -general kernel APIs for good reasons. It may end up leaking internal -details in a way which can exert significant pain by locking the -kernel into a contract that can't be maintained in a reasonable -manner. - -Also, due to the specific nature, cgroup and its controllers don't -tend to attract attention from a wide scope of developers. cgroup's -short history is already fraught with severely mis-designed -interfaces, unnecessary commitments to and exposing of internal -details, broken and dangerous implementations of various features. - -Keeping cgroup as an administration interface is both advantageous for -its role and imperative given its nature. Some of the cgroup features -may make sense for unprivileged access. If deemed justified, those -must be further abstracted and implemented as a different interface, -be it a system call or process-private filesystem, and survive through -the scrutiny that any interface for general consumption is required to -go through. - -Requiring CAP is not a complete solution but should serve as a -significant deterrent against spraying cgroup usages in non-privileged -programs. 
diff --git a/Documentation/cpu-freq/intel-pstate.txt b/Documentation/cpu-freq/intel-pstate.txt index be8d4006b..f7b12c071 100644 --- a/Documentation/cpu-freq/intel-pstate.txt +++ b/Documentation/cpu-freq/intel-pstate.txt @@ -1,61 +1,131 @@ -Intel P-state driver +Intel P-State driver -------------------- -This driver provides an interface to control the P state selection for -SandyBridge+ Intel processors. The driver can operate two different -modes based on the processor model, legacy mode and Hardware P state (HWP) -mode. - -In legacy mode, the Intel P-state implements two internal governors, -performance and powersave, that differ from the general cpufreq governors of -the same name (the general cpufreq governors implement target(), whereas the -internal Intel P-state governors implement setpolicy()). The internal -performance governor sets the max_perf_pct and min_perf_pct to 100; that is, -the governor selects the highest available P state to maximize the performance -of the core. The internal powersave governor selects the appropriate P state -based on the current load on the CPU. - -In HWP mode P state selection is implemented in the processor -itself. The driver provides the interfaces between the cpufreq core and -the processor to control P state selection based on user preferences -and reporting frequency to the cpufreq core. In this mode the -internal Intel P-state governor code is disabled. - -In addition to the interfaces provided by the cpufreq core for -controlling frequency the driver provides sysfs files for -controlling P state selection. These files have been added to -/sys/devices/system/cpu/intel_pstate/ - - max_perf_pct: limits the maximum P state that will be requested by - the driver stated as a percentage of the available performance. The - available (P states) performance may be reduced by the no_turbo +This driver provides an interface to control the P-State selection for the +SandyBridge+ Intel processors. 
+ +The following document explains P-States: +http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf +As stated in the document, P-State doesn’t exactly mean a frequency. However, for +the sake of the relationship with cpufreq, P-State and frequency are used +interchangeably. + +Understanding the cpufreq core governors and policies is important before +discussing more details about the Intel P-State driver. Based on what callbacks +a cpufreq driver provides to the cpufreq core, it can support two types of +drivers: +- with target_index() callback: In this mode, the drivers using cpufreq core +simply provide the minimum and maximum frequency limits and an additional +interface target_index() to set the current frequency. The cpufreq subsystem +has a number of scaling governors ("performance", "powersave", "ondemand", +etc.). Depending on which governor is in use, cpufreq core will call for +transitions to a specific frequency using target_index() callback. +- setpolicy() callback: In this mode, drivers do not provide target_index() +callback, so cpufreq core can't request a transition to a specific frequency. +The driver provides minimum and maximum frequency limits and callbacks to set a +policy. The policy in cpufreq sysfs is referred to as the "scaling governor". +The cpufreq core can request the driver to operate in any of the two policies: +"performance" and "powersave". The driver decides which frequency to use based +on the above policy selection considering minimum and maximum frequency limits. + +The Intel P-State driver falls under the latter category, which implements the +setpolicy() callback. This driver decides what P-State to use based on the +requested policy from the cpufreq core. If the processor is capable of +selecting its next P-State internally, then the driver will offload this +responsibility to the processor (aka HWP: Hardware P-States). If not, the +driver implements algorithms to select the next P-State.
+ +Since these policies are implemented in the driver, they are not the same as the +cpufreq scaling governors implementation, even if they have the same name in +the cpufreq sysfs (scaling_governors). For example, the "performance" policy is +similar to cpufreq’s "performance" governor, but "powersave" is completely +different than the cpufreq "powersave" governor. The strategy here is similar +to cpufreq "ondemand", where the requested P-State is related to the system load. + +Sysfs Interface + +In addition to the frequency-controlling interfaces provided by the cpufreq +core, the driver provides its own sysfs files to control the P-State selection. +These files have been added to /sys/devices/system/cpu/intel_pstate/. +Any changes made to these files are applicable to all CPUs (even in a +multi-package system). + + max_perf_pct: Limits the maximum P-State that will be requested by + the driver. It is stated as a percentage of the available performance. The + available (P-State) performance may be reduced by the no_turbo setting described below. - min_perf_pct: limits the minimum P state that will be requested by - the driver stated as a percentage of the max (non-turbo) + min_perf_pct: Limits the minimum P-State that will be requested by + the driver. It is stated as a percentage of the max (non-turbo) performance level. - no_turbo: limits the driver to selecting P states below the turbo + no_turbo: Limits the driver to selecting P-States below the turbo frequency range. - turbo_pct: displays the percentage of the total performance that - is supported by hardware that is in the turbo range. This number + turbo_pct: Displays the percentage of the total performance that + is supported by hardware that is in the turbo range. This number is independent of whether turbo has been disabled or not. - num_pstates: displays the number of pstates that are supported - by hardware.
This number is independent of whether turbo has + num_pstates: Displays the number of P-States that are supported + by hardware. This number is independent of whether turbo has been disabled or not. +For example, if a system has these parameters: + Max 1 core turbo ratio: 0x21 (Max 1 core ratio is the maximum P-State) + Max non turbo ratio: 0x17 + Minimum ratio : 0x08 (Here the ratio is called max efficiency ratio) + +Sysfs will show : + max_perf_pct:100, which corresponds to 1 core ratio + min_perf_pct:24, max_efficiency_ratio / max 1 Core ratio + no_turbo:0, turbo is not disabled + num_pstates:26 = (max 1 Core ratio - Max Efficiency Ratio + 1) + turbo_pct:39 = (max 1 core ratio - max non turbo ratio) / num_pstates + +Refer to "Intel® 64 and IA-32 Architectures Software Developer’s Manual +Volume 3: System Programming Guide" to understand ratios. + +cpufreq sysfs for Intel P-State + +Since this driver registers with cpufreq, cpufreq sysfs is also presented. +There are some important differences, which need to be considered. + +scaling_cur_freq: This displays the real frequency which was used during +the last sample period instead of what is requested. Some other cpufreq driver, +like acpi-cpufreq, displays what is requested (Some changes are on the +way to fix this for acpi-cpufreq driver). The same is true for frequencies +displayed at /proc/cpuinfo. + +scaling_governor: This displays current active policy. Since each CPU has a +cpufreq sysfs, it is possible to set a scaling governor to each CPU. But this +is not possible with Intel P-States, as there is one common policy for all +CPUs. Here, the last requested policy will be applicable to all CPUs. It is +suggested that one use the cpupower utility to change policy to all CPUs at the +same time. + +scaling_setspeed: This attribute can never be used with Intel P-State. + +scaling_max_freq/scaling_min_freq: This interface can be used similarly to +the max_perf_pct/min_perf_pct of Intel P-State sysfs. 
However, since frequencies +are converted to the nearest possible P-State, this is prone to rounding errors. +This method is not preferred to limit performance. + +affected_cpus: Not used +related_cpus: Not used + For contemporary Intel processors, the frequency is controlled by the -processor itself and the P-states exposed to software are related to +processor itself and the P-State exposed to software is related to performance levels. The idea that frequency can be set to a single -frequency is fiction for Intel Core processors. Even if the scaling -driver selects a single P state the actual frequency the processor +frequency is fictional for Intel Core processors. Even if the scaling +driver selects a single P-State, the actual frequency the processor will run at is selected by the processor itself. -For legacy mode debugfs files have also been added to allow tuning of -the internal governor algorythm. These files are located at -/sys/kernel/debug/pstate_snb/ These files are NOT present in HWP mode. +Tuning Intel P-State driver + +When HWP mode is not used, debugfs files have also been added to allow the +tuning of the internal governor algorithm. These files are located at +/sys/kernel/debug/pstate_snb/. The algorithm uses a PID (Proportional +Integral Derivative) controller. The PID tunable parameters are: deadband d_gain_pct @@ -63,3 +133,90 @@ the internal governor algorythm. These files are located at p_gain_pct sample_rate_ms setpoint + +To adjust these parameters, some understanding of driver implementation is +necessary. There are some tweaks described here, but be very careful. Adjusting +them requires expert level understanding of power and performance relationship. +These limits are only useful when the "powersave" policy is active. + +-To make the system more responsive to load changes, sample_rate_ms can +be adjusted (current default is 10ms).
+-To make the system use higher performance, even if the load is lower, setpoint +can be adjusted to a lower number. This will also lead to faster ramp up time +to reach the maximum P-State. +If there are no derivative and integral coefficients, the next P-State will be +equal to: + current P-State - ((setpoint - current cpu load) * p_gain_pct) + +For example, if the current PID parameters are (which are defaults for the core +processors like SandyBridge): + deadband = 0 + d_gain_pct = 0 + i_gain_pct = 0 + p_gain_pct = 20 + sample_rate_ms = 10 + setpoint = 97 + +If the current P-State = 0x08 and current load = 100, this will result in the +next P-State = 0x08 - ((97 - 100) * 0.2) = 8.6 (rounded to 9). Here the P-State +goes up by only 1. If during the next sample interval the current load doesn't +change and is still 100, then P-State goes up by one again. This process will +continue as long as the load is more than the setpoint until the maximum P-State +is reached. + +For the same load at setpoint = 60, this will result in the next P-State += 0x08 - ((60 - 100) * 0.2) = 16 +So by changing the setpoint from 97 to 60, there is an increase of the +next P-State from 9 to 16. So this will make the processor execute at a higher +P-State for the same CPU load. If the load continues to be more than the +setpoint during next sample intervals, then P-State will go up again till the +maximum P-State is reached. But the ramp up time to reach the maximum P-State +will be much faster when the setpoint is 60 compared to 97. + +Debugging Intel P-State driver + +Event tracing +To debug P-State transition, the Linux event tracing interface can be used. +There are two specific events, which can be enabled (provided the kernel +configs related to event tracing are enabled). + +# cd /sys/kernel/debug/tracing/ +# echo 1 > events/power/pstate_sample/enable +# echo 1 > events/power/cpu_frequency/enable +# cat trace +gnome-terminal--4510 [001] ..s.
1177.680733: pstate_sample: core_busy=107 + scaled=94 from=26 to=26 mperf=1143818 aperf=1230607 tsc=29838618 + freq=2474476 +cat-5235 [002] ..s. 1177.681723: cpu_frequency: state=2900000 cpu_id=2 + + +Using ftrace + +If function level tracing is required, the Linux ftrace interface can be used. +For example if we want to check how often a function to set a P-State is +called, we can set ftrace filter to intel_pstate_set_pstate. + +# cd /sys/kernel/debug/tracing/ +# cat available_filter_functions | grep -i pstate +intel_pstate_set_pstate +intel_pstate_cpu_init +... + +# echo intel_pstate_set_pstate > set_ftrace_filter +# echo function > current_tracer +# cat trace | head -15 +# tracer: function +# +# entries-in-buffer/entries-written: 80/80 #P:4 +# +# _-----=> irqs-off +# / _----=> need-resched +# | / _---=> hardirq/softirq +# || / _--=> preempt-depth +# ||| / delay +# TASK-PID CPU# |||| TIMESTAMP FUNCTION +# | | | |||| | | + Xorg-3129 [000] ..s. 2537.644844: intel_pstate_set_pstate <-intel_pstate_timer_func + gnome-terminal--4510 [002] ..s. 2537.649844: intel_pstate_set_pstate <-intel_pstate_timer_func + gnome-shell-3409 [001] ..s. 2537.650850: intel_pstate_set_pstate <-intel_pstate_timer_func + -0 [000] ..s. 2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func diff --git a/Documentation/cpu-freq/pcc-cpufreq.txt b/Documentation/cpu-freq/pcc-cpufreq.txt index 9e3c3b335..0a94224ad 100644 --- a/Documentation/cpu-freq/pcc-cpufreq.txt +++ b/Documentation/cpu-freq/pcc-cpufreq.txt @@ -159,8 +159,8 @@ to be strictly associated with a P-state. 2.2 cpuinfo_transition_latency: ------------------------------- -The cpuinfo_transition_latency field is 0. The PCC specification does -not include a field to expose this value currently. +The cpuinfo_transition_latency field is CPUFREQ_ETERNAL. The PCC specification +does not include a field to expose this value currently. 
2.3 cpuinfo_cur_freq: --------------------- diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt index f9ad5e048..dd68821c2 100644 --- a/Documentation/cpu-hotplug.txt +++ b/Documentation/cpu-hotplug.txt @@ -150,7 +150,7 @@ an entry as shown below in the output. If this is not mounted, do the following. - #mkdir /sysfs + #mkdir /sys #mount -t sysfs sys /sys Now you should see entries for all present cpu, the following is an example diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt index e15bc1a0f..89fd8f9a2 100644 --- a/Documentation/device-mapper/verity.txt +++ b/Documentation/device-mapper/verity.txt @@ -18,11 +18,11 @@ Construction Parameters 0 is the original format used in the Chromium OS. The salt is appended when hashing, digests are stored continuously and - the rest of the block is padded with zeros. + the rest of the block is padded with zeroes. 1 is the current format that should be used for new devices. The salt is prepended when hashing and each digest is - padded with zeros to the power of two. + padded with zeroes to the power of two. This is the device containing data, the integrity of which needs to be @@ -79,6 +79,37 @@ restart_on_corruption not compatible with ignore_corruption and requires user space support to avoid restart loops. +ignore_zero_blocks + Do not verify blocks that are expected to contain zeroes and always return + zeroes instead. This may be useful if the partition contains unused blocks + that are not guaranteed to contain zeroes. + +use_fec_from_device + Use forward error correction (FEC) to recover from corruption if hash + verification fails. Use encoding data from the specified device. This + may be the same device where data and hash blocks reside, in which case + fec_start must be outside data and hash areas. + + If the encoding data covers additional metadata, it must be accessible + on the hash device after the hash blocks. 
+ + Note: block sizes for data and hash devices must match. Also, if the + verity <dev> is encrypted the <fec_dev> should be too. + +fec_roots + Number of generator roots. This equals the number of parity bytes in + the encoding data. For example, in RS(M, N) encoding, the number of roots + is M-N. + +fec_blocks + The number of encoding data blocks on the FEC device. The block size for + the FEC device is <data_block_size>. + +fec_start + This is the offset, in <data_block_size> blocks, from the start of the + FEC device to the beginning of the encoding data. + + Theory of operation =================== @@ -98,6 +129,11 @@ per-block basis. This allows for a lightweight hash computation on first read into the page cache. Block hashes are stored linearly, aligned to the nearest block size. +If forward error correction (FEC) support is enabled any recovery of +corrupted data will be verified using the cryptographic hash of the +corresponding data. This is why combining error correction with +integrity checking is essential. + Hash Tree --------- diff --git a/Documentation/devicetree/bindings/arm/arm,scpi.txt b/Documentation/devicetree/bindings/arm/arm,scpi.txt index 86302de67..313dabdc1 100644 --- a/Documentation/devicetree/bindings/arm/arm,scpi.txt +++ b/Documentation/devicetree/bindings/arm/arm,scpi.txt @@ -63,7 +63,7 @@ Required properties: - compatible : should be "arm,juno-sram-ns" for Non-secure SRAM on Juno The rest of the properties should follow the generic mmio-sram description -found in ../../misc/sysram.txt +found in ../../sram/sram.txt Each sub-node represents the reserved area for SCPI.
diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm2835.txt b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm2835.txt index c78576bb7..11d3056dc 100644 --- a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm2835.txt +++ b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm2835.txt @@ -26,6 +26,10 @@ Raspberry Pi Model B+ Required root node properties: compatible = "raspberrypi,model-b-plus", "brcm,bcm2835"; +Raspberry Pi 2 Model B +Required root node properties: +compatible = "raspberrypi,2-model-b", "brcm,bcm2836"; + Raspberry Pi Compute Module Required root node properties: compatible = "raspberrypi,compute-module", "brcm,bcm2835"; diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm4708.txt b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm4708.txt index 6b0f49f6f..8608a776c 100644 --- a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm4708.txt +++ b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm4708.txt @@ -5,4 +5,11 @@ Boards with the BCM4708 SoC shall have the following properties: Required root node property: +bcm4708 compatible = "brcm,bcm4708"; + +bcm4709 +compatible = "brcm,bcm4709"; + +bcm53012 +compatible = "brcm,bcm53012"; diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,nsp-cpu-method.txt b/Documentation/devicetree/bindings/arm/bcm/brcm,nsp-cpu-method.txt new file mode 100644 index 000000000..677ef9d9f --- /dev/null +++ b/Documentation/devicetree/bindings/arm/bcm/brcm,nsp-cpu-method.txt @@ -0,0 +1,39 @@ +Broadcom Northstar Plus SoC CPU Enable Method +--------------------------------------------- +This binding defines the enable method used for starting secondary +CPU in the following Broadcom SoCs: + BCM58522, BCM58525, BCM58535, BCM58622, BCM58623, BCM58625, BCM88312 + +The enable method is specified by defining the following required +properties in the corresponding secondary "cpu" device tree node: + - enable-method = "brcm,bcm-nsp-smp"; + - secondary-boot-reg = <...>; + +The secondary-boot-reg property 
is a u32 value that specifies the +physical address of the register which should hold the common +entry point for a secondary CPU. This entry is cpu node specific +and should be added per cpu. E.g., in case of NSP (BCM58625) which +is a dual core CPU SoC, this entry should be added to cpu1 node. + + +Example: + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu0: cpu@0 { + device_type = "cpu"; + compatible = "arm,cortex-a9"; + next-level-cache = <&L2>; + reg = <0>; + }; + + cpu1: cpu@1 { + device_type = "cpu"; + compatible = "arm,cortex-a9"; + next-level-cache = <&L2>; + enable-method = "brcm,bcm-nsp-smp"; + secondary-boot-reg = <0xffff042c>; + reg = <1>; + }; + }; diff --git a/Documentation/devicetree/bindings/arm/compulab-boards.txt b/Documentation/devicetree/bindings/arm/compulab-boards.txt new file mode 100644 index 000000000..42a10285a --- /dev/null +++ b/Documentation/devicetree/bindings/arm/compulab-boards.txt @@ -0,0 +1,25 @@ +CompuLab SB-SOM is a multi-module baseboard capable of carrying: + - CM-T43 + - CM-T54 + - CM-QS600 + - CL-SOM-AM57x + - CL-SOM-iMX7 +modules with minor modifications to the SB-SOM assembly. + +Required root node properties: + - compatible = should be "compulab,sb-som" + +Compulab CL-SOM-iMX7 is a miniature System-on-Module (SoM) based on +Freescale i.MX7 ARM Cortex-A7 System-on-Chip. + +Required root node properties: + - compatible = "compulab,cl-som-imx7", "fsl,imx7d"; + +Compulab SBC-iMX7 is a single board computer based on the +Freescale i.MX7 system-on-chip. SBC-iMX7 is implemented with +the CL-SOM-iMX7 System-on-Module providing most of the functions, +and SB-SOM-iMX7 carrier board providing additional peripheral +functions and connectors. 
+ +Required root node properties: + - compatible = "compulab,sbc-imx7", "compulab,cl-som-imx7", "fsl,imx7d"; diff --git a/Documentation/devicetree/bindings/arm/cpus.txt b/Documentation/devicetree/bindings/arm/cpus.txt index 3a07a87fe..ae9be074d 100644 --- a/Documentation/devicetree/bindings/arm/cpus.txt +++ b/Documentation/devicetree/bindings/arm/cpus.txt @@ -157,6 +157,7 @@ nodes to be present and contain the properties described below. "arm,cortex-a17" "arm,cortex-a53" "arm,cortex-a57" + "arm,cortex-a72" "arm,cortex-m0" "arm,cortex-m0+" "arm,cortex-m1" @@ -190,6 +191,8 @@ nodes to be present and contain the properties described below. "allwinner,sun6i-a31" "allwinner,sun8i-a23" "arm,psci" + "arm,realview-smp" + "brcm,bcm-nsp-smp" "brcm,brahma-b15" "marvell,armada-375-smp" "marvell,armada-380-smp" @@ -200,6 +203,7 @@ nodes to be present and contain the properties described below. "qcom,gcc-msm8660" "qcom,kpss-acc-v1" "qcom,kpss-acc-v2" + "rockchip,rk3036-smp" "rockchip,rk3066-smp" "ste,dbx500-smp" @@ -242,6 +246,23 @@ nodes to be present and contain the properties described below. Definition: Specifies the syscon node controlling the cpu core power domains. + - dynamic-power-coefficient + Usage: optional + Value type: + Definition: A u32 value that represents the running time dynamic + power coefficient in units of mW/MHz/uVolt^2. The + coefficient can either be calculated from power + measurements or derived by analysis. + + The dynamic power consumption of the CPU is + proportional to the square of the Voltage (V) and + the clock frequency (f). The coefficient is used to + calculate the dynamic power as below - + + Pdyn = dynamic-power-coefficient * V^2 * f + + where voltage is in uV, frequency is in MHz. 
+ Example 1 (dual-cluster big.LITTLE system 32-bit): cpus { diff --git a/Documentation/devicetree/bindings/arm/exynos/smp-sysram.txt b/Documentation/devicetree/bindings/arm/exynos/smp-sysram.txt deleted file mode 100644 index 4a0a4f70a..000000000 --- a/Documentation/devicetree/bindings/arm/exynos/smp-sysram.txt +++ /dev/null @@ -1,38 +0,0 @@ -Samsung Exynos SYSRAM for SMP bringup: ------------------------------------- - -Samsung SMP-capable Exynos SoCs use part of the SYSRAM for the bringup -of the secondary cores. Once the core gets powered up it executes the -code that is residing at some specific location of the SYSRAM. - -Therefore reserved section sub-nodes have to be added to the mmio-sram -declaration. These nodes are of two types depending upon secure or -non-secure execution environment. - -Required sub-node properties: -- compatible : depending upon boot mode, should be - "samsung,exynos4210-sysram" : for Secure SYSRAM - "samsung,exynos4210-sysram-ns" : for Non-secure SYSRAM - -The rest of the properties should follow the generic mmio-sram discription -found in ../../misc/sysram.txt - -Example: - - sysram@02020000 { - compatible = "mmio-sram"; - reg = <0x02020000 0x54000>; - #address-cells = <1>; - #size-cells = <1>; - ranges = <0 0x02020000 0x54000>; - - smp-sysram@0 { - compatible = "samsung,exynos4210-sysram"; - reg = <0x0 0x1000>; - }; - - smp-sysram@53000 { - compatible = "samsung,exynos4210-sysram-ns"; - reg = <0x53000 0x1000>; - }; - }; diff --git a/Documentation/devicetree/bindings/arm/fsl.txt b/Documentation/devicetree/bindings/arm/fsl.txt index 34c88b0c7..752a685d9 100644 --- a/Documentation/devicetree/bindings/arm/fsl.txt +++ b/Documentation/devicetree/bindings/arm/fsl.txt @@ -131,6 +131,10 @@ Example: Freescale ARMv8 based Layerscape SoC family Device Tree Bindings ---------------------------------------------------------------- +LS1043A ARMv8 based RDB Board +Required root node properties: + - compatible = "fsl,ls1043a-rdb", "fsl,ls1043a"; + 
LS2080A ARMv8 based Simulator model Required root node properties: - compatible = "fsl,ls2080a-simu", "fsl,ls2080a"; diff --git a/Documentation/devicetree/bindings/arm/hisilicon/hisilicon.txt b/Documentation/devicetree/bindings/arm/hisilicon/hisilicon.txt index 6ac7c000a..e3ccab114 100644 --- a/Documentation/devicetree/bindings/arm/hisilicon/hisilicon.txt +++ b/Documentation/devicetree/bindings/arm/hisilicon/hisilicon.txt @@ -187,6 +187,22 @@ Example: reg = <0xb0000000 0x10000>; }; +Hisilicon HiP05 PERISUB system controller + +Required properties: +- compatible : "hisilicon,hip05-perisubc", "syscon"; +- reg : Register address and size + +The HiP05 PERISUB system controller is shared by peripheral controllers in +HiP05 Soc to implement some basic configurations. The peripheral +controllers include mdio, ddr, iic, uart, timer and so on. + +Example: + /* for HiP05 perisub-ctrl-c system */ + peri_c_subctrl: syscon@80000000 { + compatible = "hisilicon,hip05-perisubc", "syscon"; + reg = <0x0 0x80000000 0x0 0x10000>; + }; ----------------------------------------------------------------------- Hisilicon CPU controller diff --git a/Documentation/devicetree/bindings/arm/l2c2x0.txt b/Documentation/devicetree/bindings/arm/l2c2x0.txt new file mode 100644 index 000000000..fe0398c5c --- /dev/null +++ b/Documentation/devicetree/bindings/arm/l2c2x0.txt @@ -0,0 +1,105 @@ +* ARM L2 Cache Controller + +ARM cores often have a separate L2C210/L2C220/L2C310 (also known as PL210/PL220/ +PL310 and variants) based level 2 cache controller. All these various implementations +of the L2 cache controller have compatible programming models (Note 1). 
+Some of the properties that are just prefixed "cache-*" are taken from section +3.7.3 of the ePAPR v1.1 specification which can be found at: +https://www.power.org/wp-content/uploads/2012/06/Power_ePAPR_APPROVED_v1.1.pdf + +The ARM L2 cache representation in the device tree should be done as follows: + +Required properties: + +- compatible : should be one of: + "arm,pl310-cache" + "arm,l220-cache" + "arm,l210-cache" + "bcm,bcm11351-a2-pl310-cache": DEPRECATED by "brcm,bcm11351-a2-pl310-cache" + "brcm,bcm11351-a2-pl310-cache": For Broadcom bcm11351 chipset where an + offset needs to be added to the address before passing down to the L2 + cache controller + "marvell,aurora-system-cache": Marvell Controller designed to be + compatible with the ARM one, with system cache mode (meaning + maintenance operations on L1 are broadcasted to the L2 and L2 + performs the same operation). + "marvell,aurora-outer-cache": Marvell Controller designed to be + compatible with the ARM one with outer cache mode. + "marvell,tauros3-cache": Marvell Tauros3 cache controller, compatible + with arm,pl310-cache controller. +- cache-unified : Specifies the cache is a unified cache. +- cache-level : Should be set to 2 for a level 2 cache. +- reg : Physical base address and size of cache controller's memory mapped + registers. + +Optional properties: + +- arm,data-latency : Cycles of latency for Data RAM accesses. Specifies 3 cells of + read, write and setup latencies. Minimum valid values are 1. Controllers + without setup latency control should use a value of 0. +- arm,tag-latency : Cycles of latency for Tag RAM accesses. Specifies 3 cells of + read, write and setup latencies. Controllers without setup latency control + should use 0. Controllers without separate read and write Tag RAM latency + values should only use the first cell. +- arm,dirty-latency : Cycles of latency for Dirty RAMs. This is a single cell. +- arm,filter-ranges : Starting address and length of window to + filter. 
Addresses in the filter window are directed to the M1 port. Other + addresses will go to the M0 port. +- arm,io-coherent : indicates that the system is operating in an hardware + I/O coherent mode. Valid only when the arm,pl310-cache compatible + string is used. +- interrupts : 1 combined interrupt. +- cache-size : specifies the size in bytes of the cache +- cache-sets : specifies the number of associativity sets of the cache +- cache-block-size : specifies the size in bytes of a cache block +- cache-line-size : specifies the size in bytes of a line in the cache, + if this is not specified, the line size is assumed to be equal to the + cache block size +- cache-id-part: cache id part number to be used if it is not present + on hardware +- wt-override: If present then L2 is forced to Write through mode +- arm,double-linefill : Override double linefill enable setting. Enable if + non-zero, disable if zero. +- arm,double-linefill-incr : Override double linefill on INCR read. Enable + if non-zero, disable if zero. +- arm,double-linefill-wrap : Override double linefill on WRAP read. Enable + if non-zero, disable if zero. +- arm,prefetch-drop : Override prefetch drop enable setting. Enable if non-zero, + disable if zero. +- arm,prefetch-offset : Override prefetch offset value. Valid values are + 0-7, 15, 23, and 31. +- arm,shared-override : The default behavior of the L220 or PL310 cache + controllers with respect to the shareable attribute is to transform "normal + memory non-cacheable transactions" into "cacheable no allocate" (for reads) + or "write through no write allocate" (for writes). + On systems where this may cause DMA buffer corruption, this property must be + specified to indicate that such transforms are precluded. +- arm,parity-enable : enable parity checking on the L2 cache (L220 or PL310). +- arm,parity-disable : disable parity checking on the L2 cache (L220 or PL310). +- arm,outer-sync-disable : disable the outer sync operation on the L2 cache. 
+ Some core tiles, especially ARM PB11MPCore have a faulty L220 cache that + will randomly hang unless outer sync operations are disabled. +- prefetch-data : Data prefetch. Value: <0> (forcibly disable), <1> + (forcibly enable), property absent (retain settings set by firmware) +- prefetch-instr : Instruction prefetch. Value: <0> (forcibly disable), + <1> (forcibly enable), property absent (retain settings set by + firmware) + +Example: + +L2: cache-controller { + compatible = "arm,pl310-cache"; + reg = <0xfff12000 0x1000>; + arm,data-latency = <1 1 1>; + arm,tag-latency = <2 2 2>; + arm,filter-ranges = <0x80000000 0x8000000>; + cache-unified; + cache-level = <2>; + interrupts = <45>; +}; + +Note 1: The description in this document doesn't apply to integrated L2 + cache controllers as found in e.g. Cortex-A15/A7/A57/A53. These + integrated L2 controllers are assumed to be all preconfigured by + early secure boot code. Thus no need to deal with their configuration + in the kernel at all. diff --git a/Documentation/devicetree/bindings/arm/l2cc.txt b/Documentation/devicetree/bindings/arm/l2cc.txt deleted file mode 100644 index 06c88a4d2..000000000 --- a/Documentation/devicetree/bindings/arm/l2cc.txt +++ /dev/null @@ -1,93 +0,0 @@ -* ARM L2 Cache Controller - -ARM cores often have a separate level 2 cache controller. There are various -implementations of the L2 cache controller with compatible programming models. 
-Some of the properties that are just prefixed "cache-*" are taken from section -3.7.3 of the ePAPR v1.1 specification which can be found at: -https://www.power.org/wp-content/uploads/2012/06/Power_ePAPR_APPROVED_v1.1.pdf - -The ARM L2 cache representation in the device tree should be done as follows: - -Required properties: - -- compatible : should be one of: - "arm,pl310-cache" - "arm,l220-cache" - "arm,l210-cache" - "bcm,bcm11351-a2-pl310-cache": DEPRECATED by "brcm,bcm11351-a2-pl310-cache" - "brcm,bcm11351-a2-pl310-cache": For Broadcom bcm11351 chipset where an - offset needs to be added to the address before passing down to the L2 - cache controller - "marvell,aurora-system-cache": Marvell Controller designed to be - compatible with the ARM one, with system cache mode (meaning - maintenance operations on L1 are broadcasted to the L2 and L2 - performs the same operation). - "marvell,aurora-outer-cache": Marvell Controller designed to be - compatible with the ARM one with outer cache mode. - "marvell,tauros3-cache": Marvell Tauros3 cache controller, compatible - with arm,pl310-cache controller. -- cache-unified : Specifies the cache is a unified cache. -- cache-level : Should be set to 2 for a level 2 cache. -- reg : Physical base address and size of cache controller's memory mapped - registers. - -Optional properties: - -- arm,data-latency : Cycles of latency for Data RAM accesses. Specifies 3 cells of - read, write and setup latencies. Minimum valid values are 1. Controllers - without setup latency control should use a value of 0. -- arm,tag-latency : Cycles of latency for Tag RAM accesses. Specifies 3 cells of - read, write and setup latencies. Controllers without setup latency control - should use 0. Controllers without separate read and write Tag RAM latency - values should only use the first cell. -- arm,dirty-latency : Cycles of latency for Dirty RAMs. This is a single cell. -- arm,filter-ranges : Starting address and length of window to - filter. 
Addresses in the filter window are directed to the M1 port. Other - addresses will go to the M0 port. -- arm,io-coherent : indicates that the system is operating in an hardware - I/O coherent mode. Valid only when the arm,pl310-cache compatible - string is used. -- interrupts : 1 combined interrupt. -- cache-size : specifies the size in bytes of the cache -- cache-sets : specifies the number of associativity sets of the cache -- cache-block-size : specifies the size in bytes of a cache block -- cache-line-size : specifies the size in bytes of a line in the cache, - if this is not specified, the line size is assumed to be equal to the - cache block size -- cache-id-part: cache id part number to be used if it is not present - on hardware -- wt-override: If present then L2 is forced to Write through mode -- arm,double-linefill : Override double linefill enable setting. Enable if - non-zero, disable if zero. -- arm,double-linefill-incr : Override double linefill on INCR read. Enable - if non-zero, disable if zero. -- arm,double-linefill-wrap : Override double linefill on WRAP read. Enable - if non-zero, disable if zero. -- arm,prefetch-drop : Override prefetch drop enable setting. Enable if non-zero, - disable if zero. -- arm,prefetch-offset : Override prefetch offset value. Valid values are - 0-7, 15, 23, and 31. -- arm,shared-override : The default behavior of the pl310 cache controller with - respect to the shareable attribute is to transform "normal memory - non-cacheable transactions" into "cacheable no allocate" (for reads) or - "write through no write allocate" (for writes). - On systems where this may cause DMA buffer corruption, this property must be - specified to indicate that such transforms are precluded. -- prefetch-data : Data prefetch. Value: <0> (forcibly disable), <1> - (forcibly enable), property absent (retain settings set by firmware) -- prefetch-instr : Instruction prefetch. 
Value: <0> (forcibly disable), - <1> (forcibly enable), property absent (retain settings set by - firmware) - -Example: - -L2: cache-controller { - compatible = "arm,pl310-cache"; - reg = <0xfff12000 0x1000>; - arm,data-latency = <1 1 1>; - arm,tag-latency = <2 2 2>; - arm,filter-ranges = <0x80000000 0x8000000>; - cache-unified; - cache-level = <2>; - interrupts = <45>; -}; diff --git a/Documentation/devicetree/bindings/arm/marvell,kirkwood.txt b/Documentation/devicetree/bindings/arm/marvell,kirkwood.txt index 5171ad8f4..ab0c9cdf3 100644 --- a/Documentation/devicetree/bindings/arm/marvell,kirkwood.txt +++ b/Documentation/devicetree/bindings/arm/marvell,kirkwood.txt @@ -24,6 +24,8 @@ board. Currently known boards are: "buffalo,lswxl" "buffalo,lsxhl" "buffalo,lsxl" +"cloudengines,pogo02" +"cloudengines,pogoplugv4" "dlink,dns-320" "dlink,dns-320-a1" "dlink,dns-325" diff --git a/Documentation/devicetree/bindings/arm/mediatek.txt b/Documentation/devicetree/bindings/arm/mediatek.txt index 618a91994..54f43bc2d 100644 --- a/Documentation/devicetree/bindings/arm/mediatek.txt +++ b/Documentation/devicetree/bindings/arm/mediatek.txt @@ -6,6 +6,7 @@ following property: Required root node property: compatible: Must contain one of + "mediatek,mt2701" "mediatek,mt6580" "mediatek,mt6589" "mediatek,mt6592" @@ -17,6 +18,9 @@ compatible: Must contain one of Supported boards: +- Evaluation board for MT2701: + Required root node properties: + - compatible = "mediatek,mt2701-evb", "mediatek,mt2701"; - Evaluation board for MT6580: Required root node properties: - compatible = "mediatek,mt6580-evbp1", "mediatek,mt6580"; diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt index f6cd3e419..aaf8d1460 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt @@ -18,7 +18,7 @@ The available clocks 
are defined in dt-bindings/clock/mt*-clk.h. Also it uses the common reset controller binding from Documentation/devicetree/bindings/reset/reset.txt. The available reset outputs are defined in -dt-bindings/reset-controller/mt*-resets.h +dt-bindings/reset/mt*-resets.h Example: diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.txt index f25b85499..2f6ff86df 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.txt @@ -18,7 +18,7 @@ The available clocks are defined in dt-bindings/clock/mt*-clk.h. Also it uses the common reset controller binding from Documentation/devicetree/bindings/reset/reset.txt. The available reset outputs are defined in -dt-bindings/reset-controller/mt*-resets.h +dt-bindings/reset/mt*-resets.h Example: diff --git a/Documentation/devicetree/bindings/arm/omap/omap.txt b/Documentation/devicetree/bindings/arm/omap/omap.txt index 9f4e5136e..66422d663 100644 --- a/Documentation/devicetree/bindings/arm/omap/omap.txt +++ b/Documentation/devicetree/bindings/arm/omap/omap.txt @@ -23,6 +23,7 @@ Optional properties: during suspend. - ti,no-reset-on-init: When present, the module should not be reset at init - ti,no-idle-on-init: When present, the module should not be idled at init +- ti,no-idle: When present, the module is never allowed to idle. 
Example: @@ -138,9 +139,21 @@ Boards: - AM335X phyBOARD-WEGA: Single Board Computer dev kit compatible = "phytec,am335x-wega", "phytec,am335x-phycore-som", "ti,am33xx" +- AM335X CM-T335 : System On Module, built around the Sitara AM3352/4 + compatible = "compulab,cm-t335", "ti,am33xx" + +- AM335X SBC-T335 : single board computer, built around the Sitara AM3352/4 + compatible = "compulab,sbc-t335", "compulab,cm-t335", "ti,am33xx" + - OMAP5 EVM : Evaluation Module compatible = "ti,omap5-evm", "ti,omap5" +- AM437x CM-T43 + compatible = "compulab,am437x-cm-t43", "ti,am4372", "ti,am43" + +- AM437x SBC-T43 + compatible = "compulab,am437x-sbc-t43", "compulab,am437x-cm-t43", "ti,am4372", "ti,am43" + - AM43x EPOS EVM compatible = "ti,am43x-epos-evm", "ti,am4372", "ti,am43" @@ -150,6 +163,12 @@ Boards: - AM437x SK EVM: AM437x StarterKit Evaluation Module compatible = "ti,am437x-sk-evm", "ti,am4372", "ti,am43" +- AM57XX CL-SOM-AM57x + compatible = "compulab,cl-som-am57x", "ti,am5728", "ti,dra742", "ti,dra74", "ti,dra7" + +- AM57XX SBC-AM57x + compatible = "compulab,sbc-am57x", "compulab,cl-som-am57x", "ti,am5728", "ti,dra742", "ti,dra74", "ti,dra7" + - DRA742 EVM: Software Development Board for DRA742 compatible = "ti,dra7-evm", "ti,dra742", "ti,dra74", "ti,dra7" diff --git a/Documentation/devicetree/bindings/arm/pmu.txt b/Documentation/devicetree/bindings/arm/pmu.txt index 97ba45af0..56518839f 100644 --- a/Documentation/devicetree/bindings/arm/pmu.txt +++ b/Documentation/devicetree/bindings/arm/pmu.txt @@ -9,8 +9,9 @@ Required properties: - compatible : should be one of "apm,potenza-pmu" "arm,armv8-pmuv3" - "arm.cortex-a57-pmu" - "arm.cortex-a53-pmu" + "arm,cortex-a72-pmu" + "arm,cortex-a57-pmu" + "arm,cortex-a53-pmu" "arm,cortex-a17-pmu" "arm,cortex-a15-pmu" "arm,cortex-a12-pmu" diff --git a/Documentation/devicetree/bindings/arm/psci.txt b/Documentation/devicetree/bindings/arm/psci.txt index a9adab84e..a2c4f1d52 100644 --- a/Documentation/devicetree/bindings/arm/psci.txt 
+++ b/Documentation/devicetree/bindings/arm/psci.txt @@ -23,17 +23,20 @@ Main node required properties: - compatible : should contain at least one of: - * "arm,psci" : for implementations complying to PSCI versions prior to - 0.2. For these cases function IDs must be provided. - - * "arm,psci-0.2" : for implementations complying to PSCI 0.2. Function - IDs are not required and should be ignored by an OS with PSCI 0.2 - support, but are permitted to be present for compatibility with - existing software when "arm,psci" is later in the compatible list. - - * "arm,psci-1.0" : for implementations complying to PSCI 1.0. PSCI 1.0 is - backward compatible with PSCI 0.2 with minor specification updates, - as defined in the PSCI specification[2]. + * "arm,psci" : For implementations complying to PSCI versions prior + to 0.2. + For these cases function IDs must be provided. + + * "arm,psci-0.2" : For implementations complying to PSCI 0.2. + Function IDs are not required and should be ignored by + an OS with PSCI 0.2 support, but are permitted to be + present for compatibility with existing software when + "arm,psci" is later in the compatible list. + + * "arm,psci-1.0" : For implementations complying to PSCI 1.0. + PSCI 1.0 is backward compatible with PSCI 0.2 with + minor specification updates, as defined in the PSCI + specification[2]. - method : The method of calling the PSCI firmware. 
Permitted values are: diff --git a/Documentation/devicetree/bindings/arm/rockchip.txt b/Documentation/devicetree/bindings/arm/rockchip.txt index 8e985dd2f..078c14fcd 100644 --- a/Documentation/devicetree/bindings/arm/rockchip.txt +++ b/Documentation/devicetree/bindings/arm/rockchip.txt @@ -1,6 +1,10 @@ Rockchip platforms device tree bindings --------------------------------------- +- Kylin RK3036 board: + Required root node properties: + - compatible = "rockchip,kylin-rk3036", "rockchip,rk3036"; + - MarsBoard RK3066 board: Required root node properties: - compatible = "haoyu,marsboard-rk3066", "rockchip,rk3066a"; @@ -35,6 +39,11 @@ Rockchip platforms device tree bindings Required root node properties: - compatible = "netxeon,r89", "rockchip,rk3288"; +- Google Brain (dev-board): + Required root node properties: + - compatible = "google,veyron-brain-rev0", "google,veyron-brain", + "google,veyron", "rockchip,rk3288"; + - Google Jaq (Haier Chromebook 11 and more): Required root node properties: - compatible = "google,veyron-jaq-rev5", "google,veyron-jaq-rev4", @@ -49,6 +58,15 @@ Rockchip platforms device tree bindings "google,veyron-jerry-rev3", "google,veyron-jerry", "google,veyron", "rockchip,rk3288"; +- Google Mickey (Asus Chromebit CS10): + Required root node properties: + - compatible = "google,veyron-mickey-rev8", "google,veyron-mickey-rev7", + "google,veyron-mickey-rev6", "google,veyron-mickey-rev5", + "google,veyron-mickey-rev4", "google,veyron-mickey-rev3", + "google,veyron-mickey-rev2", "google,veyron-mickey-rev1", + "google,veyron-mickey-rev0", "google,veyron-mickey", + "google,veyron", "rockchip,rk3288"; + - Google Minnie (Asus Chromebook Flip C100P): Required root node properties: - compatible = "google,veyron-minnie-rev4", "google,veyron-minnie-rev3", @@ -69,6 +87,14 @@ Rockchip platforms device tree bindings "google,veyron-speedy-rev3", "google,veyron-speedy-rev2", "google,veyron-speedy", "google,veyron", "rockchip,rk3288"; +- Rockchip RK3368 evb: + 
Required root node properties: + - compatible = "rockchip,rk3368-evb-act8846", "rockchip,rk3368"; + - Rockchip R88 board: Required root node properties: - compatible = "rockchip,r88", "rockchip,rk3368"; + +- Rockchip RK3228 Evaluation board: + Required root node properties: + - compatible = "rockchip,rk3228-evb", "rockchip,rk3228"; diff --git a/Documentation/devicetree/bindings/arm/rockchip/pmu-sram.txt b/Documentation/devicetree/bindings/arm/rockchip/pmu-sram.txt deleted file mode 100644 index 6b42fda30..000000000 --- a/Documentation/devicetree/bindings/arm/rockchip/pmu-sram.txt +++ /dev/null @@ -1,16 +0,0 @@ -Rockchip SRAM for pmu: ------------------------------- - -The sram of pmu is used to store the function of resume from maskrom(the 1st -level loader). This is a common use of the "pmu-sram" because it keeps power -even in low power states in the system. - -Required node properties: -- compatible : should be "rockchip,rk3288-pmu-sram" -- reg : physical base address and the size of the registers window - -Example: - sram@ff720000 { - compatible = "rockchip,rk3288-pmu-sram", "mmio-sram"; - reg = <0xff720000 0x1000>; - }; diff --git a/Documentation/devicetree/bindings/arm/rockchip/smp-sram.txt b/Documentation/devicetree/bindings/arm/rockchip/smp-sram.txt deleted file mode 100644 index d9416fb8d..000000000 --- a/Documentation/devicetree/bindings/arm/rockchip/smp-sram.txt +++ /dev/null @@ -1,30 +0,0 @@ -Rockchip SRAM for smp bringup: ------------------------------- - -Rockchip's smp-capable SoCs use the first part of the sram for the bringup -of the cores. Once the core gets powered up it executes the code that is -residing at the very beginning of the sram. - -Therefore a reserved section sub-node has to be added to the mmio-sram -declaration. 
- -Required sub-node properties: -- compatible : should be "rockchip,rk3066-smp-sram" - -The rest of the properties should follow the generic mmio-sram discription -found in ../../misc/sram.txt - -Example: - - sram: sram@10080000 { - compatible = "mmio-sram"; - reg = <0x10080000 0x10000>; - #address-cells = <1>; - #size-cells = <1>; - ranges; - - smp-sram@10080000 { - compatible = "rockchip,rk3066-smp-sram"; - reg = <0x10080000 0x50>; - }; - }; diff --git a/Documentation/devicetree/bindings/arm/samsung/exynos-adc.txt b/Documentation/devicetree/bindings/arm/samsung/exynos-adc.txt index f46ca9a31..ccaaec601 100644 --- a/Documentation/devicetree/bindings/arm/samsung/exynos-adc.txt +++ b/Documentation/devicetree/bindings/arm/samsung/exynos-adc.txt @@ -47,6 +47,9 @@ Required properties: - samsung,syscon-phandle Contains the PMU system controller node (To access the ADC_PHY register on Exynos5250/5420/5800/3250) +Optional properties: +- has-touchscreen: If present, indicates that a touchscreen is + connected and usable. Note: child nodes can be added for auto probing from device tree. diff --git a/Documentation/devicetree/bindings/arm/scu.txt b/Documentation/devicetree/bindings/arm/scu.txt index c44768051..08a587875 100644 --- a/Documentation/devicetree/bindings/arm/scu.txt +++ b/Documentation/devicetree/bindings/arm/scu.txt @@ -10,10 +10,13 @@ References: Revision r2p0 - Cortex-A5: see DDI0434B Cortex-A5 MPCore Technical Reference Manual Revision r0p1 +- ARM11 MPCore: see DDI0360F ARM 11 MPCore Processor Technical Reference + Manual Revision r2p0 - compatible : Should be: "arm,cortex-a9-scu" "arm,cortex-a5-scu" + "arm,arm11mp-scu" - reg : Specify the base address and the size of the SCU register window. 
diff --git a/Documentation/devicetree/bindings/arm/secure.txt b/Documentation/devicetree/bindings/arm/secure.txt new file mode 100644 index 000000000..e31303fb2 --- /dev/null +++ b/Documentation/devicetree/bindings/arm/secure.txt @@ -0,0 +1,53 @@ +* ARM Secure world bindings + +ARM CPUs with TrustZone support have two distinct address spaces, +"Normal" and "Secure". Most devicetree consumers (including the Linux +kernel) are not TrustZone aware and run entirely in either the Normal +world or the Secure world. However some devicetree consumers are +TrustZone aware and need to be able to determine whether devices are +visible only in the Secure address space, only in the Normal address +space, or visible in both. (One example of that situation would be a +virtual machine which boots Secure firmware and wants to tell the +firmware about the layout of the machine via devicetree.) + +The general principle of the naming scheme for Secure world bindings +is that any property that needs a different value in the Secure world +can be supported by prefixing the property name with "secure-". So for +instance "secure-foo" would override "foo". For property names with +a vendor prefix, the Secure variant of "vendor,foo" would be +"vendor,secure-foo". If there is no "secure-" property then the Secure +world value is the same as specified for the Normal world by the +non-prefixed property. However, only the properties listed below may +validly have "secure-" versions; this list will be enlarged on a +case-by-case basis. + +Defining the bindings in this way means that a device tree which has +been annotated to indicate the presence of Secure-only devices can +still be processed unmodified by existing Non-secure software (and in +particular by the kernel). + +Note that it is still valid for bindings intended for purely Secure +world consumers (like kernels that run entirely in Secure) to simply +describe the view of Secure world using the standard bindings. 
These +secure- bindings only need to be used where both the Secure and Normal +world views need to be described in a single device tree. + +Valid Secure world properties: + +- secure-status : specifies whether the device is present and usable + in the secure world. The combination of this with "status" allows + the various possible combinations of device visibility to be + specified. If "secure-status" is not specified it defaults to the + same value as "status"; if "status" is not specified either then + both default to "okay". This means the following combinations are + possible: + + /* Neither specified: default to visible in both S and NS */ + secure-status = "okay"; /* visible in both */ + status = "okay"; /* visible in both */ + status = "okay"; secure-status = "okay"; /* visible in both */ + secure-status = "disabled"; /* NS-only */ + status = "okay"; secure-status = "disabled"; /* NS-only */ + status = "disabled"; secure-status = "okay"; /* S-only */ + status = "disabled"; /* disabled in both */ + status = "disabled"; secure-status = "disabled"; /* disabled in both */ diff --git a/Documentation/devicetree/bindings/arm/shmobile.txt b/Documentation/devicetree/bindings/arm/shmobile.txt index 40bb9007c..9cf67e48f 100644 --- a/Documentation/devicetree/bindings/arm/shmobile.txt +++ b/Documentation/devicetree/bindings/arm/shmobile.txt @@ -27,6 +27,8 @@ SoCs: compatible = "renesas,r8a7793" - R-Car E2 (R8A77940) compatible = "renesas,r8a7794" + - R-Car H3 (R8A77950) + compatible = "renesas,r8a7795" Boards: @@ -57,5 +59,7 @@ Boards: compatible = "renesas,marzen", "renesas,r8a7779" - Porter (M2-LCDP) compatible = "renesas,porter", "renesas,r8a7791" + - Salvator-X (RTP0RC7795SIPB0010S) + compatible = "renesas,salvator-x", "renesas,r8a7795"; - SILK (RTP0RC7794LCB00011S) compatible = "renesas,silk", "renesas,r8a7794" diff --git a/Documentation/devicetree/bindings/arm/technologic.txt b/Documentation/devicetree/bindings/arm/technologic.txt new file mode 100644 index 
000000000..842298894 --- /dev/null +++ b/Documentation/devicetree/bindings/arm/technologic.txt @@ -0,0 +1,6 @@ +Technologic Systems Platforms Device Tree Bindings +-------------------------------------------------- + +TS-4800 board +Required root node properties: + - compatible = "technologic,imx51-ts4800", "fsl,imx51"; diff --git a/Documentation/devicetree/bindings/ata/brcm,sata-brcmstb.txt b/Documentation/devicetree/bindings/ata/brcm,sata-brcmstb.txt index 20ac9bbfa..60872838f 100644 --- a/Documentation/devicetree/bindings/ata/brcm,sata-brcmstb.txt +++ b/Documentation/devicetree/bindings/ata/brcm,sata-brcmstb.txt @@ -4,7 +4,9 @@ SATA nodes are defined to describe on-chip Serial ATA controllers. Each SATA controller should have its own node. Required properties: -- compatible : compatible list, may contain "brcm,bcm7445-ahci" and/or +- compatible : should be one or more of + "brcm,bcm7425-ahci" + "brcm,bcm7445-ahci" "brcm,sata3-ahci" - reg : register mappings for AHCI and SATA_TOP_CTRL - reg-names : "ahci" and "top-ctrl" diff --git a/Documentation/devicetree/bindings/ata/sata_rcar.txt b/Documentation/devicetree/bindings/ata/sata_rcar.txt index 2493a5a31..0764f9ab6 100644 --- a/Documentation/devicetree/bindings/ata/sata_rcar.txt +++ b/Documentation/devicetree/bindings/ata/sata_rcar.txt @@ -8,6 +8,7 @@ Required properties: - "renesas,sata-r8a7790" for R-Car H2 other than ES1 - "renesas,sata-r8a7791" for R-Car M2-W - "renesas,sata-r8a7793" for R-Car M2-N + - "renesas,sata-r8a7795" for R-Car H3 - reg : address and length of the SATA registers; - interrupts : must consist of one interrupt specifier. - clocks : must contain a reference to the functional clock. 
diff --git a/Documentation/devicetree/bindings/bus/uniphier-system-bus.txt b/Documentation/devicetree/bindings/bus/uniphier-system-bus.txt new file mode 100644 index 000000000..68ef80aff --- /dev/null +++ b/Documentation/devicetree/bindings/bus/uniphier-system-bus.txt @@ -0,0 +1,66 @@ +UniPhier System Bus + +The UniPhier System Bus is an external bus that connects on-board devices to +the UniPhier SoC. It is a simple (semi-)parallel bus with address, data, and +some control signals. It supports up to 8 banks (chip selects). + +Before any access to the bus, the bus controller must be configured; the bus +controller registers provide the control for the translation from the offset +within each bank to the CPU-viewed address. The needed setup includes the base +address, the size of each bank. Optionally, some timing parameters can be +optimized for faster bus access. + +Required properties: +- compatible: should be "socionext,uniphier-system-bus". +- reg: offset and length of the register set for the bus controller device. +- #address-cells: should be 2. The first cell is the bank number (chip select). + The second cell is the address offset within the bank. +- #size-cells: should be 1. +- ranges: should provide a proper address translation from the System Bus to + the parent bus. + +Note: +The address region(s) that can be assigned for the System Bus is implementation +defined. Some SoCs can use 0x00000000-0x0fffffff and 0x40000000-0x4fffffff, +while other SoCs can only use 0x40000000-0x4fffffff. There might be additional +limitations depending on SoCs and the boot mode. The address translation is +arbitrary as long as the banks are assigned in the supported address space with +the required alignment and they do not overlap one another. 
+For example, it is possible to map: + bank 0 to 0x42000000-0x43ffffff, bank 5 to 0x46000000-0x46ffffff +It is also possible to map: + bank 0 to 0x48000000-0x49ffffff, bank 5 to 0x44000000-0x44ffffff +There is no reason to stick to a particular translation mapping, but the +"ranges" property should provide a "reasonable" default that is known to work. +The software should initialize the bus controller according to it. + +Example: + + system-bus { + compatible = "socionext,uniphier-system-bus"; + reg = <0x58c00000 0x400>; + #address-cells = <2>; + #size-cells = <1>; + ranges = <1 0x00000000 0x42000000 0x02000000 + 5 0x00000000 0x46000000 0x01000000>; + + ethernet@1,01f00000 { + compatible = "smsc,lan9115"; + reg = <1 0x01f00000 0x1000>; + interrupts = <0 48 4>; + phy-mode = "mii"; + }; + + uart@5,00200000 { + compatible = "ns16550a"; + reg = <5 0x00200000 0x20>; + interrupts = <0 49 4>; + clock-frequency = <12288000>; + }; + }; + +In this example, + - the Ethernet device is connected at the offset 0x01f00000 of CS1 and + mapped to 0x43f00000 of the parent bus. + - the UART device is connected at the offset 0x00200000 of CS5 and + mapped to 0x46200000 of the parent bus. diff --git a/Documentation/devicetree/bindings/clock/arm-syscon-icst.txt b/Documentation/devicetree/bindings/clock/arm-syscon-icst.txt new file mode 100644 index 000000000..8b7177cec --- /dev/null +++ b/Documentation/devicetree/bindings/clock/arm-syscon-icst.txt @@ -0,0 +1,40 @@ +ARM System Controller ICST clocks + +The ICS525 and ICS307 oscillators are produced by Integrated Devices +Technology (IDT). ARM integrated these oscillators deeply into their +reference designs by adding special control registers that manage such +oscillators to their system controllers. + +The ARM system controller contains logic to serialize and initialize +an ICST clock request after a write to the 32 bit register at an offset +into the system controller. 
Furthermore, to even be able to alter one of +these frequencies, the system controller must first be unlocked by +writing a special token to another offset in the system controller. + +The ICST oscillator must be provided inside a system controller node. + +Required properties: +- lock-offset: the offset address into the system controller where the + unlocking register is located +- vco-offset: the offset address into the system controller where the + ICST control register is located (even 32 bit address) +- compatible: must be one of "arm,syscon-icst525" or "arm,syscon-icst307" +- #clock-cells: must be <0> +- clocks: parent clock, since the ICST needs a parent clock to derive its + frequency from, this attribute is compulsory. + +Example: + +syscon: syscon@10000000 { + compatible = "syscon"; + reg = <0x10000000 0x1000>; + + oscclk0: osc0@0c { + compatible = "arm,syscon-icst307"; + #clock-cells = <0>; + lock-offset = <0x20>; + vco-offset = <0x0c>; + clocks = <&xtal24mhz>; + }; + (...) +}; diff --git a/Documentation/devicetree/bindings/clock/brcm,bcm2835-aux-clock.txt b/Documentation/devicetree/bindings/clock/brcm,bcm2835-aux-clock.txt new file mode 100644 index 000000000..7a837d218 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/brcm,bcm2835-aux-clock.txt @@ -0,0 +1,31 @@ +Broadcom BCM2835 auxiliary peripheral support + +This binding uses the common clock binding: + Documentation/devicetree/bindings/clock/clock-bindings.txt + +The auxiliary peripherals (UART, SPI1, and SPI2) have a small register +area controlling clock gating to the peripherals, and providing an IRQ +status register. + +Required properties: +- compatible: Should be "brcm,bcm2835-aux" +- #clock-cells: Should be <1>. 
The permitted clock-specifier values can be + found in include/dt-bindings/clock/bcm2835-aux.h +- reg: Specifies base physical address and size of the registers +- clocks: The parent clock phandle + +Example: + + clocks: cprman@7e101000 { + compatible = "brcm,bcm2835-cprman"; + #clock-cells = <1>; + reg = <0x7e101000 0x2000>; + clocks = <&clk_osc>; + }; + + aux: aux@7e215000 { + compatible = "brcm,bcm2835-aux"; + #clock-cells = <1>; + reg = <0x7e215000 0x8>; + clocks = <&clocks BCM2835_CLOCK_VPU>; + }; diff --git a/Documentation/devicetree/bindings/clock/brcm,iproc-clocks.txt b/Documentation/devicetree/bindings/clock/brcm,iproc-clocks.txt index ede65a55e..0b35e71b3 100644 --- a/Documentation/devicetree/bindings/clock/brcm,iproc-clocks.txt +++ b/Documentation/devicetree/bindings/clock/brcm,iproc-clocks.txt @@ -208,3 +208,8 @@ These clock IDs are defined in: ch3_unused lcpll_ports 4 BCM_NS2_LCPLL_PORTS_CH3_UNUSED ch4_unused lcpll_ports 5 BCM_NS2_LCPLL_PORTS_CH4_UNUSED ch5_unused lcpll_ports 6 BCM_NS2_LCPLL_PORTS_CH5_UNUSED + +BCM63138 +-------- +PLL and leaf clock compatible strings for BCM63138 are: + "brcm,bcm63138-armpll" diff --git a/Documentation/devicetree/bindings/clock/cs2000-cp.txt b/Documentation/devicetree/bindings/clock/cs2000-cp.txt new file mode 100644 index 000000000..54e6df0be --- /dev/null +++ b/Documentation/devicetree/bindings/clock/cs2000-cp.txt @@ -0,0 +1,22 @@ +CIRRUS LOGIC Fractional-N Clock Synthesizer & Clock Multiplier + +Required properties: + +- compatible: "cirrus,cs2000-cp" +- reg: The chip select number on the I2C bus +- clocks: common clock binding for CLK_IN, XTI/REF_CLK +- clock-names: CLK_IN : clk_in, XTI/REF_CLK : ref_clk +- #clock-cells: must be <0> + +Example: + +&i2c2 { + ... 
+ cs2000: clk_multiplier@4f { + #clock-cells = <0>; + compatible = "cirrus,cs2000-cp"; + reg = <0x4f>; + clocks = <&rcar_sound 0>, <&x12_clk>; + clock-names = "clk_in", "ref_clk"; + }; +}; diff --git a/Documentation/devicetree/bindings/clock/dove-divider-clock.txt b/Documentation/devicetree/bindings/clock/dove-divider-clock.txt new file mode 100644 index 000000000..e3eb0f657 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/dove-divider-clock.txt @@ -0,0 +1,28 @@ +PLL divider based Dove clocks + +Marvell Dove has a 2GHz PLL, which feeds into a set of dividers to provide +high speed clocks for a number of peripherals. These dividers are part of +the PMU, and thus this node should be a child of the PMU node. + +The following clocks are provided: + +ID Clock +------------- +0 AXI bus clock +1 GPU clock +2 VMeta clock +3 LCD clock + +Required properties: +- compatible : shall be "marvell,dove-divider-clock" +- reg : shall be the register address of the Core PLL and Clock Divider + Control 0 register. This will cover that register, as well as the + Core PLL and Clock Divider Control 1 register. Thus, it will have + a size of 8. +- #clock-cells : from common clock binding; shall be set to 1 + +divider_clk: core-clock@0064 { + compatible = "marvell,dove-divider-clock"; + reg = <0x0064 0x8>; + #clock-cells = <1>; +}; diff --git a/Documentation/devicetree/bindings/clock/nvidia,tegra210-car.txt b/Documentation/devicetree/bindings/clock/nvidia,tegra210-car.txt new file mode 100644 index 000000000..26f237f64 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/nvidia,tegra210-car.txt @@ -0,0 +1,56 @@ +NVIDIA Tegra210 Clock And Reset Controller + +This binding uses the common clock binding: +Documentation/devicetree/bindings/clock/clock-bindings.txt + +The CAR (Clock And Reset) Controller on Tegra is the HW module responsible +for muxing and gating Tegra's clocks, and setting their rates. 
+ +Required properties : +- compatible : Should be "nvidia,tegra210-car" +- reg : Should contain CAR registers location and length +- clocks : Should contain phandle and clock specifier for one clock: + the 32 KHz "32k_in". +- #clock-cells : Should be 1. + In clock consumers, this cell represents the clock ID exposed by the + CAR. The assignments may be found in header file + <dt-bindings/clock/tegra210-car.h>. +- #reset-cells : Should be 1. + In reset consumers, this cell represents the bit number in the CAR's + array of CLK_RST_CONTROLLER_RST_DEVICES_* registers. + +Example SoC include file: + +/ { + tegra_car: clock { + compatible = "nvidia,tegra210-car"; + reg = <0x60006000 0x1000>; + #clock-cells = <1>; + #reset-cells = <1>; + }; + + usb@c5004000 { + clocks = <&tegra_car TEGRA210_CLK_USB2>; + }; +}; + +Example board file: + +/ { + clocks { + compatible = "simple-bus"; + #address-cells = <1>; + #size-cells = <0>; + + clk_32k: clock@1 { + compatible = "fixed-clock"; + reg = <1>; + #clock-cells = <0>; + clock-frequency = <32768>; + }; + }; + + &tegra_car { + clocks = <&clk_32k>; + }; +}; diff --git a/Documentation/devicetree/bindings/clock/nxp,lpc3220-clk.txt b/Documentation/devicetree/bindings/clock/nxp,lpc3220-clk.txt new file mode 100644 index 000000000..20cbca3f4 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/nxp,lpc3220-clk.txt @@ -0,0 +1,30 @@ +NXP LPC32xx Clock Controller + +Required properties: +- compatible: should be "nxp,lpc3220-clk" +- reg: should contain clock controller registers location and length +- #clock-cells: must be 1, the cell holds id of a clock provided by the + clock controller +- clocks: phandles of external oscillators, the list must contain one + 32768 Hz oscillator and may have one optional high frequency oscillator +- clock-names: list of external oscillator clock names, must contain + "xtal_32k" and may have optional "xtal" + +Examples: + + /* System Control Block */ + scb { + compatible = "simple-bus"; + ranges = <0x0 0x040004000 0x00001000>; + 
#address-cells = <1>; + #size-cells = <1>; + + clk: clock-controller@0 { + compatible = "nxp,lpc3220-clk"; + reg = <0x00 0x114>; + #clock-cells = <1>; + + clocks = <&xtal_32k>, <&xtal>; + clock-names = "xtal_32k", "xtal"; + }; + }; diff --git a/Documentation/devicetree/bindings/clock/nxp,lpc3220-usb-clk.txt b/Documentation/devicetree/bindings/clock/nxp,lpc3220-usb-clk.txt new file mode 100644 index 000000000..0aa249409 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/nxp,lpc3220-usb-clk.txt @@ -0,0 +1,22 @@ +NXP LPC32xx USB Clock Controller + +Required properties: +- compatible: should be "nxp,lpc3220-usb-clk" +- reg: should contain clock controller registers location and length +- #clock-cells: must be 1, the cell holds id of a clock provided by the + USB clock controller + +Examples: + + usb { + #address-cells = <1>; + #size-cells = <1>; + compatible = "simple-bus"; + ranges = <0x0 0x31020000 0x00001000>; + + usbclk: clock-controller@f00 { + compatible = "nxp,lpc3220-usb-clk"; + reg = <0xf00 0x100>; + #clock-cells = <1>; + }; + }; diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc.txt b/Documentation/devicetree/bindings/clock/qcom,gcc.txt index 152dfaab2..72f82f444 100644 --- a/Documentation/devicetree/bindings/clock/qcom,gcc.txt +++ b/Documentation/devicetree/bindings/clock/qcom,gcc.txt @@ -13,6 +13,7 @@ Required properties : "qcom,gcc-msm8974" "qcom,gcc-msm8974pro" "qcom,gcc-msm8974pro-ac" + "qcom,gcc-msm8996" - reg : shall contain base register location and length - #clock-cells : shall contain 1 diff --git a/Documentation/devicetree/bindings/clock/qcom,mmcc.txt b/Documentation/devicetree/bindings/clock/qcom,mmcc.txt index 34e7614d5..8b0f7841a 100644 --- a/Documentation/devicetree/bindings/clock/qcom,mmcc.txt +++ b/Documentation/devicetree/bindings/clock/qcom,mmcc.txt @@ -9,6 +9,7 @@ Required properties : "qcom,mmcc-msm8660" "qcom,mmcc-msm8960" "qcom,mmcc-msm8974" + "qcom,mmcc-msm8996" - reg : shall contain base register location and 
length - #clock-cells : shall contain 1 diff --git a/Documentation/devicetree/bindings/clock/renesas,cpg-div6-clocks.txt b/Documentation/devicetree/bindings/clock/renesas,cpg-div6-clocks.txt index 38dcf0370..ae36ab842 100644 --- a/Documentation/devicetree/bindings/clock/renesas,cpg-div6-clocks.txt +++ b/Documentation/devicetree/bindings/clock/renesas,cpg-div6-clocks.txt @@ -20,6 +20,10 @@ Required Properties: clocks must be specified. For clocks with multiple parents, invalid settings must be specified as "<0>". - #clock-cells: Must be 0 + + +Optional Properties: + - clock-output-names: The name of the clock as a free-form string diff --git a/Documentation/devicetree/bindings/clock/renesas,h8300-div-clock.txt b/Documentation/devicetree/bindings/clock/renesas,h8300-div-clock.txt index 36c2b5282..399e0da22 100644 --- a/Documentation/devicetree/bindings/clock/renesas,h8300-div-clock.txt +++ b/Documentation/devicetree/bindings/clock/renesas,h8300-div-clock.txt @@ -2,7 +2,7 @@ Required Properties: - - compatible: Must be "renesas,sh73a0-h8300-div-clock" + - compatible: Must be "renesas,h8300-div-clock" - clocks: Reference to the parent clocks ("extal1" and "extal2") diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3036-cru.txt b/Documentation/devicetree/bindings/clock/rockchip,rk3036-cru.txt new file mode 100644 index 000000000..20df350b9 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/rockchip,rk3036-cru.txt @@ -0,0 +1,56 @@ +* Rockchip RK3036 Clock and Reset Unit + +The RK3036 clock controller generates and supplies clock to various +controllers within the SoC and also implements a reset controller for SoC +peripherals. + +Required Properties: + +- compatible: should be "rockchip,rk3036-cru" +- reg: physical base address of the controller and length of memory mapped + region. +- #clock-cells: should be 1. +- #reset-cells: should be 1. 
+ +Optional Properties: + +- rockchip,grf: phandle to the syscon managing the "general register files" + If missing pll rates are not changeable, due to the missing pll lock status. + +Each clock is assigned an identifier and client nodes can use this identifier +to specify the clock which they consume. All available clocks are defined as +preprocessor macros in the dt-bindings/clock/rk3036-cru.h headers and can be +used in device tree sources. Similar macros exist for the reset sources in +these files. + +External clocks: + +There are several clocks that are generated outside the SoC. It is expected +that they are defined using standard clock bindings with following +clock-output-names: + - "xin24m" - crystal input - required, + - "ext_i2s" - external I2S clock - optional, + - "rmii_clkin" - external EMAC clock - optional + +Example: Clock controller node: + + cru: cru@20000000 { + compatible = "rockchip,rk3036-cru"; + reg = <0x20000000 0x1000>; + rockchip,grf = <&grf>; + + #clock-cells = <1>; + #reset-cells = <1>; + }; + +Example: UART controller node that consumes the clock generated by the clock + controller: + + uart0: serial@20060000 { + compatible = "snps,dw-apb-uart"; + reg = <0x20060000 0x100>; + interrupts = ; + reg-shift = <2>; + reg-io-width = <4>; + clocks = <&cru SCLK_UART0>; + }; diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3228-cru.txt b/Documentation/devicetree/bindings/clock/rockchip,rk3228-cru.txt new file mode 100644 index 000000000..f32304812 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/rockchip,rk3228-cru.txt @@ -0,0 +1,58 @@ +* Rockchip RK3228 Clock and Reset Unit + +The RK3228 clock controller generates and supplies clock to various +controllers within the SoC and also implements a reset controller for SoC +peripherals. + +Required Properties: + +- compatible: should be "rockchip,rk3228-cru" +- reg: physical base address of the controller and length of memory mapped + region. +- #clock-cells: should be 1. 
+- #reset-cells: should be 1. + +Optional Properties: + +- rockchip,grf: phandle to the syscon managing the "general register files" + If missing pll rates are not changeable, due to the missing pll lock status. + +Each clock is assigned an identifier and client nodes can use this identifier +to specify the clock which they consume. All available clocks are defined as +preprocessor macros in the dt-bindings/clock/rk3228-cru.h headers and can be +used in device tree sources. Similar macros exist for the reset sources in +these files. + +External clocks: + +There are several clocks that are generated outside the SoC. It is expected +that they are defined using standard clock bindings with following +clock-output-names: + - "xin24m" - crystal input - required, + - "ext_i2s" - external I2S clock - optional, + - "ext_gmac" - external GMAC clock - optional + - "ext_hsadc" - external HSADC clock - optional + - "phy_50m_out" - output clock of the pll in the mac phy + +Example: Clock controller node: + + cru: cru@20000000 { + compatible = "rockchip,rk3228-cru"; + reg = <0x20000000 0x1000>; + rockchip,grf = <&grf>; + + #clock-cells = <1>; + #reset-cells = <1>; + }; + +Example: UART controller node that consumes the clock generated by the clock + controller: + + uart0: serial@10110000 { + compatible = "snps,dw-apb-uart"; + reg = <0x10110000 0x100>; + interrupts = ; + reg-shift = <2>; + reg-io-width = <4>; + clocks = <&cru SCLK_UART0>; + }; diff --git a/Documentation/devicetree/bindings/clock/samsung,s2mps11.txt b/Documentation/devicetree/bindings/clock/samsung,s2mps11.txt new file mode 100644 index 000000000..2726c1d58 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/samsung,s2mps11.txt @@ -0,0 +1,49 @@ +Binding for Samsung S2M and S5M family clock generator block +============================================================ + +This is a part of device tree bindings for S2M and S5M family multi-function +devices. 
+More information can be found in bindings/mfd/sec-core.txt file. + +The S2MPS11/13/15 and S5M8767 provide three (AP/CP/BT) buffered 32.768 kHz +outputs. The S2MPS14 provides two (AP/BT) buffered 32.768 kHz outputs. + +To register these as clocks with common clock framework instantiate under +main device node a sub-node named "clocks". + +It uses the common clock binding documented in: + - Documentation/devicetree/bindings/clock/clock-bindings.txt + + +Required properties of the "clocks" sub-node: + - #clock-cells: should be 1. + - compatible: Should be one of: "samsung,s2mps11-clk", "samsung,s2mps13-clk", + "samsung,s2mps14-clk", "samsung,s5m8767-clk" + The S2MPS15 uses the same compatible as S2MPS13, as both provide similar + clocks. + + +Each clock is assigned an identifier and client nodes use this identifier +to specify the clock which they consume. + Clock ID Devices + ---------------------------------------------------------- + 32KhzAP 0 S2MPS11/13/14/15, S5M8767 + 32KhzCP 1 S2MPS11/13/15, S5M8767 + 32KhzBT 2 S2MPS11/13/14/15, S5M8767 + +Include dt-bindings/clock/samsung,s2mps11.h file to use preprocessor defines +in device tree sources.
+ + +Example: + + s2mps11_pmic@66 { + compatible = "samsung,s2mps11-pmic"; + reg = <0x66>; + + s2m_osc: clocks { + compatible = "samsung,s2mps11-clk"; + #clock-cells = <1>; + clock-output-names = "xx", "yy", "zz"; + }; + }; diff --git a/Documentation/devicetree/bindings/clock/sunxi.txt b/Documentation/devicetree/bindings/clock/sunxi.txt index 8a47b77ab..e59f57b24 100644 --- a/Documentation/devicetree/bindings/clock/sunxi.txt +++ b/Documentation/devicetree/bindings/clock/sunxi.txt @@ -27,7 +27,9 @@ Required properties: "allwinner,sun5i-a10s-ahb-gates-clk" - for the AHB gates on A10s "allwinner,sun7i-a20-ahb-gates-clk" - for the AHB gates on A20 "allwinner,sun6i-a31-ar100-clk" - for the AR100 on A31 + "allwinner,sun9i-a80-cpus-clk" - for the CPUS on A80 "allwinner,sun6i-a31-ahb1-clk" - for the AHB1 clock on A31 + "allwinner,sun8i-h3-ahb2-clk" - for the AHB2 clock on H3 "allwinner,sun6i-a31-ahb1-gates-clk" - for the AHB1 gates on A31 "allwinner,sun8i-a23-ahb1-gates-clk" - for the AHB1 gates on A23 "allwinner,sun9i-a80-ahb0-gates-clk" - for the AHB0 gates on A80 @@ -55,6 +57,9 @@ Required properties: "allwinner,sun9i-a80-apb1-gates-clk" - for the APB1 gates on A80 "allwinner,sun6i-a31-apb2-gates-clk" - for the APB2 gates on A31 "allwinner,sun8i-a23-apb2-gates-clk" - for the APB2 gates on A23 + "allwinner,sun8i-h3-bus-gates-clk" - for the bus gates on H3 + "allwinner,sun9i-a80-apbs-gates-clk" - for the APBS gates on A80 + "allwinner,sun4i-a10-dram-gates-clk" - for the DRAM gates on A10 "allwinner,sun5i-a13-mbus-clk" - for the MBUS clock on A13 "allwinner,sun4i-a10-mmc-clk" - for the MMC clock "allwinner,sun9i-a80-mmc-clk" - for mmc module clocks on A80 @@ -68,8 +73,10 @@ Required properties: "allwinner,sun5i-a13-usb-clk" - for usb gates + resets on A13 "allwinner,sun6i-a31-usb-clk" - for usb gates + resets on A31 "allwinner,sun8i-a23-usb-clk" - for usb gates + resets on A23 + "allwinner,sun8i-h3-usb-clk" - for usb gates + resets on H3 "allwinner,sun9i-a80-usb-mod-clk" - 
for usb gates + resets on A80 "allwinner,sun9i-a80-usb-phy-clk" - for usb phy gates + resets on A80 + "allwinner,sun4i-a10-ve-clk" - for the Video Engine clock Required properties for all clocks: - reg : shall be the control register address for the clock. @@ -89,6 +96,9 @@ Required properties for all clocks: And "allwinner,*-usb-clk" clocks also require: - reset-cells : shall be set to 1 +The "allwinner,sun4i-a10-ve-clk" clock also requires: +- reset-cells : shall be set to 0 + The "allwinner,sun9i-a80-mmc-config-clk" clock also requires: - #reset-cells : shall be set to 1 - resets : shall be the reset control phandle for the mmc block. diff --git a/Documentation/devicetree/bindings/clock/tango4-clock.txt b/Documentation/devicetree/bindings/clock/tango4-clock.txt new file mode 100644 index 000000000..19c580a7b --- /dev/null +++ b/Documentation/devicetree/bindings/clock/tango4-clock.txt @@ -0,0 +1,23 @@ +* Sigma Designs Tango4 Clock Generator + +The Tango4 clock generator outputs cpu_clk and sys_clk (the latter is used +for RAM and various peripheral devices). The clock binding described here +is applicable to all Tango4 SoCs. + +Required Properties: + +- compatible: should be "sigma,tango4-clkgen". +- reg: physical base address of the device and length of memory mapped region. +- clocks: phandle of the input clock (crystal oscillator). +- clock-output-names: should be "cpuclk" and "sysclk". +- #clock-cells: should be set to 1. 
+ +Example: + + clkgen: clkgen@10000 { + compatible = "sigma,tango4-clkgen"; + reg = <0x10000 0x40>; + clocks = <&xtal>; + clock-output-names = "cpuclk", "sysclk"; + #clock-cells = <1>; + }; diff --git a/Documentation/devicetree/bindings/cpufreq/arm_big_little_dt.txt b/Documentation/devicetree/bindings/cpufreq/arm_big_little_dt.txt index 0715695e9..2aa06ac0f 100644 --- a/Documentation/devicetree/bindings/cpufreq/arm_big_little_dt.txt +++ b/Documentation/devicetree/bindings/cpufreq/arm_big_little_dt.txt @@ -12,7 +12,7 @@ must be present contiguously. Generic DT driver will check only node 'x' for cpu:x. Required properties: -- operating-points: Refer to Documentation/devicetree/bindings/power/opp.txt +- operating-points: Refer to Documentation/devicetree/bindings/opp/opp.txt for details Optional properties: diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-dt.txt b/Documentation/devicetree/bindings/cpufreq/cpufreq-dt.txt index e41c98ffb..dd3929e85 100644 --- a/Documentation/devicetree/bindings/cpufreq/cpufreq-dt.txt +++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-dt.txt @@ -11,7 +11,7 @@ Required properties: - None Optional properties: -- operating-points: Refer to Documentation/devicetree/bindings/power/opp.txt for +- operating-points: Refer to Documentation/devicetree/bindings/opp/opp.txt for details. OPPs *must* be supplied either via DT, i.e. this property, or populated at runtime. 
- clock-latency: Specify the possible maximum transition latency for clock, diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-st.txt b/Documentation/devicetree/bindings/cpufreq/cpufreq-st.txt new file mode 100644 index 000000000..d91a02a3b --- /dev/null +++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-st.txt @@ -0,0 +1,91 @@ +Binding for ST's CPUFreq driver +=============================== + +ST's CPUFreq driver attempts to read 'process' and 'version' attributes +from the SoC, then supplies the OPP framework with 'prop' and 'supported +hardware' information respectively. The framework is then able to read +the DT and operate in the usual way. + +For more information about the expected DT format [See: ../opp/opp.txt]. + +Frequency Scaling only +---------------------- + +No vendor specific driver required for this. + +Located in CPU's node: + +- operating-points : [See: ../opp/opp.txt] + +Example [safe] +-------------- + +cpus { + cpu@0 { + /* kHz uV */ + operating-points = <1500000 0 + 1200000 0 + 800000 0 + 500000 0>; + }; +}; + +Dynamic Voltage and Frequency Scaling (DVFS) +-------------------------------------------- + +This requires the ST CPUFreq driver to supply 'process' and 'version' info. + +Located in CPU's node: + +- operating-points-v2 : [See: ../opp/opp.txt] + +Example [unsafe] +---------------- + +cpus { + cpu@0 { + operating-points-v2 = <&cpu0_opp_table>; + }; +}; + +cpu0_opp_table: opp_table { + compatible = "operating-points-v2"; + + /* ############################################################### */ + /* # WARNING: Do not attempt to copy/replicate these nodes, # */ + /* # they are only to be supplied by the bootloader !!!
# */ + /* ############################################################### */ + opp0 { + /* Major Minor Substrate */ + /* 2 all all */ + opp-supported-hw = <0x00000004 0xffffffff 0xffffffff>; + opp-hz = /bits/ 64 <1500000000>; + clock-latency-ns = <10000000>; + + opp-microvolt-pcode0 = <1200000>; + opp-microvolt-pcode1 = <1200000>; + opp-microvolt-pcode2 = <1200000>; + opp-microvolt-pcode3 = <1200000>; + opp-microvolt-pcode4 = <1170000>; + opp-microvolt-pcode5 = <1140000>; + opp-microvolt-pcode6 = <1100000>; + opp-microvolt-pcode7 = <1070000>; + }; + + opp1 { + /* Major Minor Substrate */ + /* all all all */ + opp-supported-hw = <0xffffffff 0xffffffff 0xffffffff>; + opp-hz = /bits/ 64 <1200000000>; + clock-latency-ns = <10000000>; + + opp-microvolt-pcode0 = <1110000>; + opp-microvolt-pcode1 = <1150000>; + opp-microvolt-pcode2 = <1100000>; + opp-microvolt-pcode3 = <1080000>; + opp-microvolt-pcode4 = <1040000>; + opp-microvolt-pcode5 = <1020000>; + opp-microvolt-pcode6 = <980000>; + opp-microvolt-pcode7 = <930000>; + }; +}; diff --git a/Documentation/devicetree/bindings/crypto/rockchip-crypto.txt b/Documentation/devicetree/bindings/crypto/rockchip-crypto.txt new file mode 100644 index 000000000..096df34b1 --- /dev/null +++ b/Documentation/devicetree/bindings/crypto/rockchip-crypto.txt @@ -0,0 +1,29 @@ +Rockchip Electronics And Security Accelerator + +Required properties: +- compatible: Should be "rockchip,rk3288-crypto" +- reg: Base physical address of the engine and length of memory mapped + region +- interrupts: Interrupt number +- clocks: Reference to the clocks about crypto +- clock-names: "aclk" used to clock data + "hclk" used to clock data + "sclk" used to clock crypto accelerator + "apb_pclk" used to clock dma +- resets: Must contain an entry for each entry in reset-names. + See ../reset/reset.txt for details. +- reset-names: Must include the name "crypto-rst". 
+ +Examples: + + crypto: cypto-controller@ff8a0000 { + compatible = "rockchip,rk3288-crypto"; + reg = <0xff8a0000 0x4000>; + interrupts = ; + clocks = <&cru ACLK_CRYPTO>, <&cru HCLK_CRYPTO>, + <&cru SCLK_CRYPTO>, <&cru ACLK_DMAC1>; + clock-names = "aclk", "hclk", "sclk", "apb_pclk"; + resets = <&cru SRST_CRYPTO>; + reset-names = "crypto-rst"; + status = "okay"; + }; diff --git a/Documentation/devicetree/bindings/display/bridge/tda998x.txt b/Documentation/devicetree/bindings/display/bridge/tda998x.txt index e9e4bce40..e178e6b9f 100644 --- a/Documentation/devicetree/bindings/display/bridge/tda998x.txt +++ b/Documentation/devicetree/bindings/display/bridge/tda998x.txt @@ -5,6 +5,10 @@ Required properties; - reg: I2C address +Required node: + - port: Input port node with endpoint definition, as described + in Documentation/devicetree/bindings/graph.txt + Optional properties: - interrupts: interrupt number and trigger type default: polling diff --git a/Documentation/devicetree/bindings/display/etnaviv/etnaviv-drm.txt b/Documentation/devicetree/bindings/display/etnaviv/etnaviv-drm.txt new file mode 100644 index 000000000..ed5e0a789 --- /dev/null +++ b/Documentation/devicetree/bindings/display/etnaviv/etnaviv-drm.txt @@ -0,0 +1,54 @@ +Etnaviv DRM master device +========================= + +The Etnaviv DRM master device is a virtual device needed to list all +Vivante GPU cores that comprise the GPU subsystem. 
+ +Required properties: +- compatible: Should be one of + "fsl,imx-gpu-subsystem" + "marvell,dove-gpu-subsystem" +- cores: Should contain a list of phandles pointing to Vivante GPU devices + +example: + +gpu-subsystem { + compatible = "fsl,imx-gpu-subsystem"; + cores = <&gpu_2d>, <&gpu_3d>; +}; + + +Vivante GPU core devices +======================== + +Required properties: +- compatible: Should be "vivante,gc" + A more specific compatible is not needed, as the cores contain chip + identification registers at fixed locations, which provide all the + necessary information to the driver. +- reg: should be register base and length as documented in the + datasheet +- interrupts: Should contain the cores interrupt line +- clocks: should contain one clock for entry in clock-names + see Documentation/devicetree/bindings/clock/clock-bindings.txt +- clock-names: + - "bus": AXI/register clock + - "core": GPU core clock + - "shader": Shader clock (only required if GPU has feature PIPE_3D) + +Optional properties: +- power-domains: a power domain consumer specifier according to + Documentation/devicetree/bindings/power/power_domain.txt + +example: + +gpu_3d: gpu@00130000 { + compatible = "vivante,gc"; + reg = <0x00130000 0x4000>; + interrupts = <0 9 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clks IMX6QDL_CLK_GPU3D_AXI>, + <&clks IMX6QDL_CLK_GPU3D_CORE>, + <&clks IMX6QDL_CLK_GPU3D_SHADER>; + clock-names = "bus", "core", "shader"; + power-domains = <&gpc 1>; +}; diff --git a/Documentation/devicetree/bindings/display/exynos/exynos_dp.txt b/Documentation/devicetree/bindings/display/exynos/exynos_dp.txt index 64693f2eb..fe4a7a2de 100644 --- a/Documentation/devicetree/bindings/display/exynos/exynos_dp.txt +++ b/Documentation/devicetree/bindings/display/exynos/exynos_dp.txt @@ -1,3 +1,20 @@ +Device-Tree bindings for Samsung Exynos Embedded DisplayPort Transmitter(eDP) + +DisplayPort is industry standard to accommodate the growing board adoption +of digital display technology within the PC and 
CE industries. +It consolidates the internal and external connection methods to reduce device +complexity and cost. It also supports necessary features for important cross +industry applications and provides performance scalability to enable the next +generation of displays that feature higher color depths, refresh rates, and +display resolutions. + +eDP (embedded display port) device is compliant with Embedded DisplayPort +standard as follows, +- DisplayPort standard 1.1a for Exynos5250 and Exynos5260. +- DisplayPort standard 1.3 for Exynos5422s and Exynos5800. + +eDP resides between FIMD and panel or FIMD and bridge such as LVDS. + The Exynos display port interface should be configured based on the type of panel connected to it. @@ -66,8 +83,15 @@ Optional properties for dp-controller: Hotplug detect GPIO. Indicates which GPIO should be used for hotplug detection - -video interfaces: Device node can contain video interface port - nodes according to [1]. +Video interfaces: + Device node can contain video interface port nodes according to [1]. + The following are properties specific to those nodes: + + endpoint node connected to bridge or panel node: + - remote-endpoint: specifies the endpoint in panel or bridge node. + This node is required in all kinds of exynos dp + to represent the connection between dp and bridge + or dp and panel. [1]: Documentation/devicetree/bindings/media/video-interfaces.txt @@ -111,9 +135,18 @@ Board Specific portion: }; ports { - port@0 { + port { dp_out: endpoint { - remote-endpoint = <&bridge_in>; + remote-endpoint = <&dp_in>; + }; + }; + }; + + panel { + ... 
+ port { + dp_in: endpoint { + remote-endpoint = <&dp_out>; }; }; }; diff --git a/Documentation/devicetree/bindings/display/exynos/exynos_hdmi.txt b/Documentation/devicetree/bindings/display/exynos/exynos_hdmi.txt index 1fd8cf9cb..d474f59be 100644 --- a/Documentation/devicetree/bindings/display/exynos/exynos_hdmi.txt +++ b/Documentation/devicetree/bindings/display/exynos/exynos_hdmi.txt @@ -2,10 +2,9 @@ Device-Tree bindings for drm hdmi driver Required properties: - compatible: value should be one among the following: - 1) "samsung,exynos5-hdmi" - 2) "samsung,exynos4210-hdmi" - 3) "samsung,exynos4212-hdmi" - 4) "samsung,exynos5420-hdmi" + 1) "samsung,exynos4210-hdmi" + 2) "samsung,exynos4212-hdmi" + 3) "samsung,exynos5420-hdmi" - reg: physical base address of the hdmi and length of memory mapped region. - interrupts: interrupt number to the cpu. diff --git a/Documentation/devicetree/bindings/display/msm/dsi.txt b/Documentation/devicetree/bindings/display/msm/dsi.txt index f344b9e49..e7423bea1 100644 --- a/Documentation/devicetree/bindings/display/msm/dsi.txt +++ b/Documentation/devicetree/bindings/display/msm/dsi.txt @@ -14,17 +14,20 @@ Required properties: - clocks: device clocks See Documentation/devicetree/bindings/clocks/clock-bindings.txt for details. - clock-names: the following clocks are required: + * "mdp_core_clk" + * "iface_clk" * "bus_clk" - * "byte_clk" - * "core_clk" * "core_mmss_clk" - * "iface_clk" - * "mdp_core_clk" + * "byte_clk" * "pixel_clk" + * "core_clk" + For DSIv2, we need an additional clock: + * "src_clk" - vdd-supply: phandle to vdd regulator device node - vddio-supply: phandle to vdd-io regulator device node - vdda-supply: phandle to vdda regulator device node - qcom,dsi-phy: phandle to DSI PHY device node +- syscon-sfpb: A phandle to mmss_sfpb syscon node (only for DSIv2) Optional properties: - panel@0: Node of panel connected to this DSI controller. 
@@ -51,6 +54,7 @@ Required properties: * "qcom,dsi-phy-28nm-hpm" * "qcom,dsi-phy-28nm-lp" * "qcom,dsi-phy-20nm" + * "qcom,dsi-phy-28nm-8960" - reg: Physical base address and length of the registers of PLL, PHY and PHY regulator - reg-names: The names of register regions. The following regions are required: diff --git a/Documentation/devicetree/bindings/display/msm/mdp.txt b/Documentation/devicetree/bindings/display/msm/mdp.txt index 0833edaba..a214f6cd0 100644 --- a/Documentation/devicetree/bindings/display/msm/mdp.txt +++ b/Documentation/devicetree/bindings/display/msm/mdp.txt @@ -2,18 +2,28 @@ Qualcomm adreno/snapdragon display controller Required properties: - compatible: - * "qcom,mdp" - mdp4 + * "qcom,mdp4" - mdp4 + * "qcom,mdp5" - mdp5 - reg: Physical base address and length of the controller's registers. - interrupts: The interrupt signal from the display controller. - connectors: array of phandles for output device(s) - clocks: device clocks See ../clocks/clock-bindings.txt for details. -- clock-names: the following clocks are required: - * "core_clk" - * "iface_clk" - * "src_clk" - * "hdmi_clk" - * "mpd_clk" +- clock-names: the following clocks are required. + For MDP4: + * "core_clk" + * "iface_clk" + * "lut_clk" + * "src_clk" + * "hdmi_clk" + * "mdp_clk" + For MDP5: + * "bus_clk" + * "iface_clk" + * "core_clk_src" + * "core_clk" + * "lut_clk" (some MDP5 versions may not need this) + * "vsync_clk" Optional properties: - gpus: phandle for gpu device @@ -26,7 +36,7 @@ Example: ... 
mdp: qcom,mdp@5100000 { - compatible = "qcom,mdp"; + compatible = "qcom,mdp4"; reg = <0x05100000 0xf0000>; interrupts = ; connectors = <&hdmi>; diff --git a/Documentation/devicetree/bindings/display/panel/boe,tv080wum-nl0.txt b/Documentation/devicetree/bindings/display/panel/boe,tv080wum-nl0.txt new file mode 100644 index 000000000..50be5e243 --- /dev/null +++ b/Documentation/devicetree/bindings/display/panel/boe,tv080wum-nl0.txt @@ -0,0 +1,7 @@ +Boe Corporation 8.0" WUXGA TFT LCD panel + +Required properties: +- compatible: should be "boe,tv080wum-nl0" + +This binding is compatible with the simple-panel binding, which is specified +in simple-panel.txt in this directory. diff --git a/Documentation/devicetree/bindings/display/panel/innolux,g121x1-l03.txt b/Documentation/devicetree/bindings/display/panel/innolux,g121x1-l03.txt new file mode 100644 index 000000000..649744620 --- /dev/null +++ b/Documentation/devicetree/bindings/display/panel/innolux,g121x1-l03.txt @@ -0,0 +1,7 @@ +Innolux Corporation 12.1" G121X1-L03 XGA (1024x768) TFT LCD panel + +Required properties: +- compatible: should be "innolux,g121x1-l03" + +This binding is compatible with the simple-panel binding, which is specified +in simple-panel.txt in this directory. diff --git a/Documentation/devicetree/bindings/display/panel/kyo,tcg121xglp.txt b/Documentation/devicetree/bindings/display/panel/kyo,tcg121xglp.txt new file mode 100644 index 000000000..a8e940fe7 --- /dev/null +++ b/Documentation/devicetree/bindings/display/panel/kyo,tcg121xglp.txt @@ -0,0 +1,7 @@ +Kyocera Corporation 12.1" XGA (1024x768) TFT LCD panel + +Required properties: +- compatible: should be "kyo,tcg121xglp" + +This binding is compatible with the simple-panel binding, which is specified +in simple-panel.txt in this directory. 
diff --git a/Documentation/devicetree/bindings/display/panel/panasonic,vvx10f034n00.txt b/Documentation/devicetree/bindings/display/panel/panasonic,vvx10f034n00.txt new file mode 100644 index 000000000..37dedf6a6 --- /dev/null +++ b/Documentation/devicetree/bindings/display/panel/panasonic,vvx10f034n00.txt @@ -0,0 +1,20 @@ +Panasonic 10" WUXGA TFT LCD panel + +Required properties: +- compatible: should be "panasonic,vvx10f034n00" +- reg: DSI virtual channel of the peripheral +- power-supply: phandle of the regulator that provides the supply voltage + +Optional properties: +- backlight: phandle of the backlight device attached to the panel + +Example: + + mdss_dsi@fd922800 { + panel@0 { + compatible = "panasonic,vvx10f034n00"; + reg = <0>; + power-supply = <&vreg_vsp>; + backlight = <&lp8566_wled>; + }; + }; diff --git a/Documentation/devicetree/bindings/display/panel/qiaodian,qd43003c0-40.txt b/Documentation/devicetree/bindings/display/panel/qiaodian,qd43003c0-40.txt new file mode 100644 index 000000000..0fbdab89a --- /dev/null +++ b/Documentation/devicetree/bindings/display/panel/qiaodian,qd43003c0-40.txt @@ -0,0 +1,7 @@ +QiaoDian XianShi Corporation 4"3 TFT LCD panel + +Required properties: +- compatible: should be "qiaodian,qd43003c0-40" + +This binding is compatible with the simple-panel binding, which is specified +in simple-panel.txt in this directory. 
diff --git a/Documentation/devicetree/bindings/display/panel/sharp,ls043t1le01.txt b/Documentation/devicetree/bindings/display/panel/sharp,ls043t1le01.txt new file mode 100644 index 000000000..3770a1119 --- /dev/null +++ b/Documentation/devicetree/bindings/display/panel/sharp,ls043t1le01.txt @@ -0,0 +1,22 @@ +Sharp Microelectronics 4.3" qHD TFT LCD panel + +Required properties: +- compatible: should be "sharp,ls043t1le01-qhd" +- reg: DSI virtual channel of the peripheral +- power-supply: phandle of the regulator that provides the supply voltage + +Optional properties: +- backlight: phandle of the backlight device attached to the panel +- reset-gpios: a GPIO spec for the reset pin + +Example: + + mdss_dsi@fd922800 { + panel@0 { + compatible = "sharp,ls043t1le01-qhd"; + reg = <0>; + avdd-supply = <&pm8941_l22>; + backlight = <&pm8941_wled>; + reset-gpios = <&pm8941_gpios 19 GPIO_ACTIVE_HIGH>; + }; + }; diff --git a/Documentation/devicetree/bindings/display/panel/startek,startek-kd050c.txt b/Documentation/devicetree/bindings/display/panel/startek,startek-kd050c.txt new file mode 100644 index 000000000..70cd8d18d --- /dev/null +++ b/Documentation/devicetree/bindings/display/panel/startek,startek-kd050c.txt @@ -0,0 +1,4 @@ +Startek Electronic Technology Co. KD050C 5.0" WVGA TFT LCD panel + +Required properties: +- compatible: should be "startek,startek-kd050c" diff --git a/Documentation/devicetree/bindings/display/rockchip/dw_mipi_dsi_rockchip.txt b/Documentation/devicetree/bindings/display/rockchip/dw_mipi_dsi_rockchip.txt new file mode 100644 index 000000000..1753f0cc6 --- /dev/null +++ b/Documentation/devicetree/bindings/display/rockchip/dw_mipi_dsi_rockchip.txt @@ -0,0 +1,60 @@ +Rockchip specific extensions to the Synopsys Designware MIPI DSI +================================ + +Required properties: +- #address-cells: Should be <1>. +- #size-cells: Should be <0>. +- compatible: "rockchip,rk3288-mipi-dsi", "snps,dw-mipi-dsi". 
+- reg: Represent the physical address range of the controller. +- interrupts: Represent the controller's interrupt to the CPU(s). +- clocks, clock-names: Phandles to the controller's pll reference + clock(ref) and APB clock(pclk), as described in [1]. +- rockchip,grf: this soc should set GRF regs to mux vopl/vopb. +- ports: contain a port node with endpoint definitions as defined in [2]. + For vopb,set the reg = <0> and set the reg = <1> for vopl. + +[1] Documentation/devicetree/bindings/clock/clock-bindings.txt +[2] Documentation/devicetree/bindings/media/video-interfaces.txt + +Example: + mipi_dsi: mipi@ff960000 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "rockchip,rk3288-mipi-dsi", "snps,dw-mipi-dsi"; + reg = <0xff960000 0x4000>; + interrupts = ; + clocks = <&cru SCLK_MIPI_24M>, <&cru PCLK_MIPI_DSI0>; + clock-names = "ref", "pclk"; + rockchip,grf = <&grf>; + status = "okay"; + + ports { + #address-cells = <1>; + #size-cells = <0>; + reg = <1>; + + mipi_in: port { + #address-cells = <1>; + #size-cells = <0>; + mipi_in_vopb: endpoint@0 { + reg = <0>; + remote-endpoint = <&vopb_out_mipi>; + }; + mipi_in_vopl: endpoint@1 { + reg = <1>; + remote-endpoint = <&vopl_out_mipi>; + }; + }; + }; + + panel { + compatible ="boe,tv080wum-nl0"; + reg = <0>; + + enable-gpios = <&gpio7 3 GPIO_ACTIVE_HIGH>; + pinctrl-names = "default"; + pinctrl-0 = <&lcd_en>; + backlight = <&backlight>; + status = "okay"; + }; + }; diff --git a/Documentation/devicetree/bindings/display/rockchip/rockchip-vop.txt b/Documentation/devicetree/bindings/display/rockchip/rockchip-vop.txt index d15351f23..5489b59e3 100644 --- a/Documentation/devicetree/bindings/display/rockchip/rockchip-vop.txt +++ b/Documentation/devicetree/bindings/display/rockchip/rockchip-vop.txt @@ -7,6 +7,7 @@ buffer to an external LCD interface. 
Required properties: - compatible: value should be one of the following "rockchip,rk3288-vop"; + "rockchip,rk3036-vop"; - interrupts: should contain a list of all VOP IP block interrupts in the order: VSYNC, LCD_SYSTEM. The interrupt specifier diff --git a/Documentation/devicetree/bindings/display/simple-framebuffer.txt b/Documentation/devicetree/bindings/display/simple-framebuffer.txt index 4474ef6e0..8c9e9f515 100644 --- a/Documentation/devicetree/bindings/display/simple-framebuffer.txt +++ b/Documentation/devicetree/bindings/display/simple-framebuffer.txt @@ -47,10 +47,14 @@ Required properties: - a8b8g8r8 (32-bit pixels, d[31:24]=a, d[23:16]=b, d[15:8]=g, d[7:0]=r). Optional properties: -- clocks : List of clocks used by the framebuffer. Clocks listed here - are expected to already be configured correctly. The OS must - ensure these clocks are not modified or disabled while the - simple framebuffer remains active. +- clocks : List of clocks used by the framebuffer. +- *-supply : Any number of regulators used by the framebuffer. These should + be named according to the names in the device's design. + + The above resources are expected to already be configured correctly. + The OS must ensure they are not modified or disabled while the simple + framebuffer remains active. + - display : phandle pointing to the primary display hardware node Example: @@ -68,6 +72,7 @@ chosen { stride = <(1600 * 2)>; format = "r5g6b5"; clocks = <&ahb_gates 36>, <&ahb_gates 43>, <&ahb_gates 44>; + lcd-supply = <&reg_dc1sw>; display = <&lcdc0>; }; stdout-path = "display0"; diff --git a/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt b/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt index 09daeef1f..5b902ac8d 100644 --- a/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt +++ b/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt @@ -14,7 +14,14 @@ not described in these device tree bindings.
Required Properties: -- compatible: must contain "renesas,rcar-dmac" +- compatible: "renesas,dmac-<soctype>", "renesas,rcar-dmac" as fallback. + Examples with soctypes are: + - "renesas,dmac-r8a7790" (R-Car H2) + - "renesas,dmac-r8a7791" (R-Car M2-W) + - "renesas,dmac-r8a7792" (R-Car V2H) + - "renesas,dmac-r8a7793" (R-Car M2-N) + - "renesas,dmac-r8a7794" (R-Car E2) + - "renesas,dmac-r8a7795" (R-Car H3) - reg: base address and length of the registers block for the DMAC @@ -35,7 +42,7 @@ Required Properties: Example: R8A7790 (R-Car H2) SYS-DMACs dmac0: dma-controller@e6700000 { - compatible = "renesas,rcar-dmac"; + compatible = "renesas,dmac-r8a7790", "renesas,rcar-dmac"; reg = <0 0xe6700000 0 0x20000>; interrupts = <0 197 IRQ_TYPE_LEVEL_HIGH 0 200 IRQ_TYPE_LEVEL_HIGH @@ -65,7 +72,7 @@ Example: R8A7790 (R-Car H2) SYS-DMACs }; dmac1: dma-controller@e6720000 { - compatible = "renesas,rcar-dmac"; + compatible = "renesas,dmac-r8a7790", "renesas,rcar-dmac"; reg = <0 0xe6720000 0 0x20000>; interrupts = <0 220 IRQ_TYPE_LEVEL_HIGH 0 216 IRQ_TYPE_LEVEL_HIGH diff --git a/Documentation/devicetree/bindings/dma/renesas,usb-dmac.txt b/Documentation/devicetree/bindings/dma/renesas,usb-dmac.txt index 040f36595..e7780a186 100644 --- a/Documentation/devicetree/bindings/dma/renesas,usb-dmac.txt +++ b/Documentation/devicetree/bindings/dma/renesas,usb-dmac.txt @@ -1,7 +1,13 @@ * Renesas USB DMA Controller Device Tree bindings Required Properties: -- compatible: must contain "renesas,usb-dmac" +-compatible: "renesas,<soctype>-usb-dmac", "renesas,usb-dmac" as fallback. + Examples with soctypes are: + - "renesas,r8a7790-usb-dmac" (R-Car H2) + - "renesas,r8a7791-usb-dmac" (R-Car M2-W) + - "renesas,r8a7793-usb-dmac" (R-Car M2-N) + - "renesas,r8a7794-usb-dmac" (R-Car E2) + - "renesas,r8a7795-usb-dmac" (R-Car H3) - reg: base address and length of the registers block for the DMAC - interrupts: interrupt specifiers for the DMAC, one for each entry in interrupt-names.
@@ -15,7 +21,7 @@ Required Properties: Example: R8A7790 (R-Car H2) USB-DMACs usb_dmac0: dma-controller@e65a0000 { - compatible = "renesas,usb-dmac"; + compatible = "renesas,r8a7790-usb-dmac", "renesas,usb-dmac"; reg = <0 0xe65a0000 0 0x100>; interrupts = <0 109 IRQ_TYPE_LEVEL_HIGH 0 109 IRQ_TYPE_LEVEL_HIGH>; diff --git a/Documentation/devicetree/bindings/dma/stm32-dma.txt b/Documentation/devicetree/bindings/dma/stm32-dma.txt new file mode 100644 index 000000000..70cd13f15 --- /dev/null +++ b/Documentation/devicetree/bindings/dma/stm32-dma.txt @@ -0,0 +1,82 @@ +* STMicroelectronics STM32 DMA controller + +The STM32 DMA is a general-purpose direct memory access controller capable of +supporting 8 independent DMA channels. Each channel can have up to 8 requests. + +Required properties: +- compatible: Should be "st,stm32-dma" +- reg: Should contain DMA registers location and length. This should include + all of the per-channel registers. +- interrupts: Should contain all of the per-channel DMA interrupts in + ascending order with respect to the DMA channel index. +- clocks: Should contain the input clock of the DMA instance. +- #dma-cells : Must be <4>. See DMA client paragraph for more details. + +Optional properties: +- resets: Reference to a reset controller asserting the DMA controller +- st,mem2mem: boolean; if defined, it indicates that the controller supports + memory-to-memory transfer + +Example: + + dma2: dma-controller@40026400 { + compatible = "st,stm32-dma"; + reg = <0x40026400 0x400>; + interrupts = <56>, + <57>, + <58>, + <59>, + <60>, + <68>, + <69>, + <70>; + clocks = <&clk_hclk>; + #dma-cells = <4>; + st,mem2mem; + resets = <&rcc 150>; + }; + +* DMA client + +DMA clients connected to the STM32 DMA controller must use the format +described in the dma.txt file, using a five-cell specifier for each +channel: a phandle plus four integer cells. +The four cells in order are: + +1. The channel id +2. The request line number +3. 
A 32bit mask specifying the DMA channel configuration which are device + dependent: + -bit 9: Peripheral Increment Address + 0x0: no address increment between transfers + 0x1: increment address between transfers + -bit 10: Memory Increment Address + 0x0: no address increment between transfers + 0x1: increment address between transfers + -bit 15: Peripheral Increment Offset Size + 0x0: offset size is linked to the peripheral bus width + 0x1: offset size is fixed to 4 (32-bit alignment) + -bit 16-17: Priority level + 0x0: low + 0x1: medium + 0x2: high + 0x3: very high +4. A 32bit mask specifying the DMA FIFO threshold configuration which are device + dependent: + -bit 0-1: Fifo threshold + 0x0: 1/4 full FIFO + 0x1: 1/2 full FIFO + 0x2: 3/4 full FIFO + 0x3: full FIFO + +Example: + + usart1: serial@40011000 { + compatible = "st,stm32-usart", "st,stm32-uart"; + reg = <0x40011000 0x400>; + interrupts = <37>; + clocks = <&clk_pclk2>; + dmas = <&dma2 2 4 0x10400 0x3>, + <&dma2 7 5 0x10200 0x3>; + dma-names = "rx", "tx"; + }; diff --git a/Documentation/devicetree/bindings/dma/ti-dma-crossbar.txt b/Documentation/devicetree/bindings/dma/ti-dma-crossbar.txt index b152a75dc..aead5869a 100644 --- a/Documentation/devicetree/bindings/dma/ti-dma-crossbar.txt +++ b/Documentation/devicetree/bindings/dma/ti-dma-crossbar.txt @@ -14,6 +14,10 @@ The DMA controller node need to have the following poroperties: Optional properties: - ti,dma-safe-map: Safe routing value for unused request lines +- ti,reserved-dma-request-ranges: DMA request ranges which should not be used + when mapping xbar input to DMA request, they are either + allocated to be used by for example the DSP or they are used as + memcpy channels in eDMA.
Notes: When requesting channel via ti,dra7-dma-crossbar, the DMA clinet must request @@ -46,6 +50,8 @@ sdma_xbar: dma-router@4a002b78 { #dma-cells = <1>; dma-requests = <205>; ti,dma-safe-map = <0>; + /* Protect the sDMA request ranges: 10-14 and 100-126 */ + ti,reserved-dma-request-ranges = <10 5>, <100 27>; dma-masters = <&sdma>; }; diff --git a/Documentation/devicetree/bindings/eeprom/eeprom.txt b/Documentation/devicetree/bindings/eeprom/eeprom.txt index 4342c10de..735bc9444 100644 --- a/Documentation/devicetree/bindings/eeprom/eeprom.txt +++ b/Documentation/devicetree/bindings/eeprom/eeprom.txt @@ -2,11 +2,22 @@ EEPROMs (I2C) Required properties: - - compatible : should be "," - If there is no specific driver for , a generic - driver based on is selected. Possible types are: - 24c00, 24c01, 24c02, 24c04, 24c08, 24c16, 24c32, 24c64, - 24c128, 24c256, 24c512, 24c1024, spd + - compatible : should be "<manufacturer>,<type>", like these: + + "atmel,24c00", "atmel,24c01", "atmel,24c02", "atmel,24c04", + "atmel,24c08", "atmel,24c16", "atmel,24c32", "atmel,24c64", + "atmel,24c128", "atmel,24c256", "atmel,24c512", "atmel,24c1024" + + "catalyst,24c32" + + "ramtron,24c64" + + "renesas,r1ex24002" + + If there is no specific driver for <manufacturer>, a generic + driver based on <type> is selected. Possible types are: + "24c00", "24c01", "24c02", "24c04", "24c08", "24c16", "24c32", "24c64", + "24c128", "24c256", "24c512", "24c1024", "spd" - reg : the I2C address of the EEPROM diff --git a/Documentation/devicetree/bindings/extcon/extcon-arizona.txt b/Documentation/devicetree/bindings/extcon/extcon-arizona.txt index e1705fae6..e27341f8a 100644 --- a/Documentation/devicetree/bindings/extcon/extcon-arizona.txt +++ b/Documentation/devicetree/bindings/extcon/extcon-arizona.txt @@ -13,3 +13,63 @@ Optional properties: ARIZONA_ACCDET_MODE_HPR or 2 - Headphone detect mode is set to HPDETR If this node is not mentioned or if the value is unknown, then headphone detection mode is set to HPDETL.
+ + - wlf,use-jd2 : Use the additional JD input along with JD1 for dual pin jack + detection. + - wlf,use-jd2-nopull : Internal pull on JD2 is disabled when used for + jack detection. + - wlf,jd-invert : Invert the polarity of the jack detection switch + + - wlf,micd-software-compare : Use a software comparison to determine mic + presence + - wlf,micd-detect-debounce : Additional software microphone detection + debounce specified in milliseconds. + - wlf,micd-pol-gpio : GPIO specifier for the GPIO controlling the headset + polarity if one exists. + - wlf,micd-bias-start-time : Time allowed for MICBIAS to startup prior to + performing microphone detection, specified as per the ARIZONA_MICD_TIME_XXX + defines. + - wlf,micd-rate : Delay between successive microphone detection measurements, + specified as per the ARIZONA_MICD_TIME_XXX defines. + - wlf,micd-dbtime : Microphone detection hardware debounces specified as the + number of measurements to take, valid values being 2 and 4. + - wlf,micd-timeout-ms : Timeout for microphone detection, specified in + milliseconds. + - wlf,micd-force-micbias : Force MICBIAS continuously on during microphone + detection. + - wlf,micd-configs : Headset polarity configurations (generally used for + detection of CTIA / OMTP headsets), the field can be of variable length + but should always be a multiple of 3 cells long, each three cell group + represents one polarity configuration. + The first cell defines the accessory detection pin, zero will use MICDET1 + and all other values will use MICDET2. + The second cell represents the MICBIAS to be used. + The third cell represents the value of the micd-pol-gpio pin. + + - wlf,gpsw : Settings for the general purpose switch + +Example: + +codec: wm8280@0 { + compatible = "wlf,wm8280"; + reg = <0>; + ... 
+ + wlf,use-jd2; + wlf,use-jd2-nopull; + wlf,jd-invert; + + wlf,micd-software-compare; + wlf,micd-detect-debounce = <0>; + wlf,micd-pol-gpio = <&codec 2 0>; + wlf,micd-rate = ; + wlf,micd-dbtime = <4>; + wlf,micd-timeout-ms = <100>; + wlf,micd-force-micbias; + wlf,micd-configs = < + 0 1 0 /* MICDET1 MICBIAS1 GPIO=low */ + 1 2 1 /* MICDET2 MICBIAS2 GPIO=high */ + >; + + wlf,gpsw = <0>; +}; diff --git a/Documentation/devicetree/bindings/extcon/extcon-max3355.txt b/Documentation/devicetree/bindings/extcon/extcon-max3355.txt new file mode 100644 index 000000000..f2288ea9e --- /dev/null +++ b/Documentation/devicetree/bindings/extcon/extcon-max3355.txt @@ -0,0 +1,21 @@ +Maxim Integrated MAX3355 USB OTG chip +------------------------------------- + +MAX3355 integrates a charge pump and comparators to enable a system with an +integrated USB OTG dual-role transceiver to function as a USB OTG dual-role +device. + +Required properties: +- compatible: should be "maxim,max3355"; +- maxim,shdn-gpios: should contain a phandle and GPIO specifier for the GPIO pin + connected to the MAX3355's SHDN# pin; +- id-gpios: should contain a phandle and GPIO specifier for the GPIO pin + connected to the MAX3355's ID_OUT pin. 
+ +Example: + + usb-otg { + compatible = "maxim,max3355"; + maxim,shdn-gpios = <&gpio2 4 GPIO_ACTIVE_LOW>; + id-gpios = <&gpio5 31 GPIO_ACTIVE_HIGH>; + }; diff --git a/Documentation/devicetree/bindings/gpio/gpio-pca953x.txt b/Documentation/devicetree/bindings/gpio/gpio-pca953x.txt index 13df9933f..6b4a98f74 100644 --- a/Documentation/devicetree/bindings/gpio/gpio-pca953x.txt +++ b/Documentation/devicetree/bindings/gpio/gpio-pca953x.txt @@ -25,6 +25,7 @@ Required properties: ti,tca6416 ti,tca6424 ti,tca9539 + onsemi,pca9654 exar,xra1202 Example: diff --git a/Documentation/devicetree/bindings/gpio/gpio-sx150x.txt b/Documentation/devicetree/bindings/gpio/gpio-sx150x.txt index ba2bb84ee..c809acb9c 100644 --- a/Documentation/devicetree/bindings/gpio/gpio-sx150x.txt +++ b/Documentation/devicetree/bindings/gpio/gpio-sx150x.txt @@ -5,7 +5,8 @@ Required properties: - compatible: should be "semtech,sx1506q", "semtech,sx1508q", - "semtech,sx1509q". + "semtech,sx1509q", + "semtech,sx1502q". - reg: The I2C slave address for this device. diff --git a/Documentation/devicetree/bindings/gpio/gpio-tps65086.txt b/Documentation/devicetree/bindings/gpio/gpio-tps65086.txt new file mode 100644 index 000000000..ba051074b --- /dev/null +++ b/Documentation/devicetree/bindings/gpio/gpio-tps65086.txt @@ -0,0 +1,16 @@ +* TPS65086 GPO Controller bindings + +Required properties: + - compatible : Should be "ti,tps65086-gpio". + - gpio-controller : Marks the device node as a GPIO Controller. + - #gpio-cells : Should be two. The first cell is the pin number + and the second cell is used to specify flags. + See ../gpio/gpio.txt for possible values. 
+ +Example: + + gpio4: gpio { + compatible = "ti,tps65086-gpio"; + gpio-controller; + #gpio-cells = <2>; + }; diff --git a/Documentation/devicetree/bindings/gpio/snps-dwapb-gpio.txt b/Documentation/devicetree/bindings/gpio/snps-dwapb-gpio.txt index dd5d2c039..4d6c8cdc8 100644 --- a/Documentation/devicetree/bindings/gpio/snps-dwapb-gpio.txt +++ b/Documentation/devicetree/bindings/gpio/snps-dwapb-gpio.txt @@ -24,7 +24,7 @@ controller. - #interrupt-cells : Specifies the number of cells needed to encode an interrupt. Shall be set to 2. The first cell defines the interrupt number, the second encodes the triger flags encoded as described in - Documentation/devicetree/bindings/interrupts.txt + Documentation/devicetree/bindings/interrupt-controller/interrupts.txt - interrupt-parent : The parent interrupt controller. - interrupts : The interrupt to the parent controller raised when GPIOs generate the interrupts. diff --git a/Documentation/devicetree/bindings/i2c/i2c-at91.txt b/Documentation/devicetree/bindings/i2c/i2c-at91.txt index 6e81dc153..ef973a034 100644 --- a/Documentation/devicetree/bindings/i2c/i2c-at91.txt +++ b/Documentation/devicetree/bindings/i2c/i2c-at91.txt @@ -3,7 +3,7 @@ I2C for Atmel platforms Required properties : - compatible : Must be "atmel,at91rm9200-i2c", "atmel,at91sam9261-i2c", "atmel,at91sam9260-i2c", "atmel,at91sam9g20-i2c", "atmel,at91sam9g10-i2c", - "atmel,at91sam9x5-i2c" or "atmel,sama5d2-i2c" + "atmel,at91sam9x5-i2c", "atmel,sama5d4-i2c" or "atmel,sama5d2-i2c" - reg: physical base address of the controller and length of memory mapped region. - interrupts: interrupt number to the cpu. @@ -17,6 +17,8 @@ Optional properties: - dma-names: should contain "tx" and "rx". - atmel,fifo-size: maximum number of data the RX and TX FIFOs can store for FIFO capable I2C controllers. +- i2c-sda-hold-time-ns: TWD hold time, only available for "atmel,sama5d4-i2c" + and "atmel,sama5d2-i2c". 
- Child nodes conforming to i2c bus binding Examples : @@ -52,6 +54,7 @@ i2c0: i2c@f8034600 { #size-cells = <0>; clocks = <&flx0>; atmel,fifo-size = <16>; + i2c-sda-hold-time-ns = <336>; wm8731: wm8731@1a { compatible = "wm8731"; diff --git a/Documentation/devicetree/bindings/i2c/i2c-brcmstb.txt b/Documentation/devicetree/bindings/i2c/i2c-brcmstb.txt index d6f724efd..aeceaceba 100644 --- a/Documentation/devicetree/bindings/i2c/i2c-brcmstb.txt +++ b/Documentation/devicetree/bindings/i2c/i2c-brcmstb.txt @@ -2,7 +2,7 @@ Broadcom stb bsc iic master controller Required properties: -- compatible: should be "brcm,brcmstb-i2c" +- compatible: should be "brcm,brcmstb-i2c" or "brcm,brcmper-i2c" - clock-frequency: 32-bit decimal value of iic master clock freqency in Hz valid values are 375000, 390000, 187500, 200000 93750, 97500, 46875 and 50000 diff --git a/Documentation/devicetree/bindings/i2c/i2c-rcar.txt b/Documentation/devicetree/bindings/i2c/i2c-rcar.txt index ea406eb20..95e97223a 100644 --- a/Documentation/devicetree/bindings/i2c/i2c-rcar.txt +++ b/Documentation/devicetree/bindings/i2c/i2c-rcar.txt @@ -20,6 +20,10 @@ Optional properties: propoerty indicates the default frequency 100 kHz. - clocks: clock specifier. +- i2c-scl-falling-time-ns: see i2c.txt +- i2c-scl-internal-delay-ns: see i2c.txt +- i2c-scl-rising-time-ns: see i2c.txt + Examples : i2c0: i2c@e6508000 { diff --git a/Documentation/devicetree/bindings/i2c/i2c.txt b/Documentation/devicetree/bindings/i2c/i2c.txt index 8a99150ac..c8d977ed8 100644 --- a/Documentation/devicetree/bindings/i2c/i2c.txt +++ b/Documentation/devicetree/bindings/i2c/i2c.txt @@ -29,12 +29,38 @@ Optional properties These properties may not be supported by all drivers. However, if a driver wants to support one of the below features, it should adapt the bindings below. -- clock-frequency - frequency of bus clock in Hz. -- wakeup-source - device can be used as a wakeup source. +- clock-frequency + frequency of bus clock in Hz. 
-- interrupts - interrupts used by the device. -- interrupt-names - "irq" and "wakeup" names are recognized by I2C core, - other names are left to individual drivers. +- i2c-scl-falling-time-ns + Number of nanoseconds the SCL signal takes to fall; t(f) in the I2C + specification. + +- i2c-scl-internal-delay-ns + Number of nanoseconds the IP core additionally needs to setup SCL. + +- i2c-scl-rising-time-ns + Number of nanoseconds the SCL signal takes to rise; t(r) in the I2C + specification. + +- i2c-sda-falling-time-ns + Number of nanoseconds the SDA signal takes to fall; t(f) in the I2C + specification. + +- interrupts + interrupts used by the device. + +- interrupt-names + "irq" and "wakeup" names are recognized by I2C core, other names are + left to individual drivers. + +- multi-master + states that there is another master active on this bus. The OS can use + this information to adapt power management to keep the arbitration awake + all the time, for example. + +- wakeup-source + device can be used as a wakeup source. Binding may contain optional "interrupts" property, describing interrupts used by the device. 
I2C core will assign "irq" interrupt (or the very first diff --git a/Documentation/devicetree/bindings/i2c/trivial-devices.txt b/Documentation/devicetree/bindings/i2c/trivial-devices.txt index c50cf13c8..539874490 100644 --- a/Documentation/devicetree/bindings/i2c/trivial-devices.txt +++ b/Documentation/devicetree/bindings/i2c/trivial-devices.txt @@ -20,22 +20,11 @@ adi,adt7476 +/-1C TDM Extended Temp Range I.C adi,adt7490 +/-1C TDM Extended Temp Range I.C adi,adxl345 Three-Axis Digital Accelerometer adi,adxl346 Three-Axis Digital Accelerometer (backward-compatibility value "adi,adxl345" must be listed too) +ams,iaq-core AMS iAQ-Core VOC Sensor at,24c08 i2c serial eeprom (24cxx) -atmel,24c00 i2c serial eeprom (24cxx) -atmel,24c01 i2c serial eeprom (24cxx) -atmel,24c02 i2c serial eeprom (24cxx) -atmel,24c04 i2c serial eeprom (24cxx) -atmel,24c16 i2c serial eeprom (24cxx) -atmel,24c32 i2c serial eeprom (24cxx) -atmel,24c64 i2c serial eeprom (24cxx) -atmel,24c128 i2c serial eeprom (24cxx) -atmel,24c256 i2c serial eeprom (24cxx) -atmel,24c512 i2c serial eeprom (24cxx) -atmel,24c1024 i2c serial eeprom (24cxx) atmel,at97sc3204t i2c trusted platform module (TPM) capella,cm32181 CM32181: Ambient Light Sensor capella,cm3232 CM3232: Ambient Light Sensor -catalyst,24c32 i2c serial eeprom cirrus,cs42l51 Cirrus Logic CS42L51 audio codec dallas,ds1307 64 x 8, Serial, I2C Real-Time Clock dallas,ds1338 I2C RTC with 56-Byte NV RAM @@ -49,11 +38,13 @@ dallas,ds4510 CPU Supervisor with Nonvolatile Memory and Programmable I/O dallas,ds75 Digital Thermometer and Thermostat dlg,da9053 DA9053: flexible system level PMIC with multicore support dlg,da9063 DA9063: system PMIC for quad-core application processors +epson,rx8010 I2C-BUS INTERFACE REAL TIME CLOCK MODULE epson,rx8025 High-Stability. 
I2C-Bus INTERFACE REAL TIME CLOCK MODULE epson,rx8581 I2C-BUS INTERFACE REAL TIME CLOCK MODULE fsl,mag3110 MAG3110: Xtrinsic High Accuracy, 3D Magnetometer fsl,mc13892 MC13892: Power Management Integrated Circuit (PMIC) for i.MX35/51 fsl,mma8450 MMA8450Q: Xtrinsic Low-power, 3-axis Xtrinsic Accelerometer +fsl,mpl3115 MPL3115: Absolute Digital Pressure Sensor fsl,mpr121 MPR121: Proximity Capacitive Touch Sensor Controller fsl,sgtl5000 SGTL5000: Ultra Low-Power Audio Codec gmt,g751 G751: Digital Temperature Sensor and Thermal Watchdog with Two-Wire Interface @@ -80,7 +71,6 @@ ovti,ov5642 OV5642: Color CMOS QSXGA (5-megapixel) Image Sensor with OmniBSI an pericom,pt7c4338 Real-time Clock Module plx,pex8648 48-Lane, 12-Port PCI Express Gen 2 (5.0 GT/s) Switch pulsedlight,lidar-lite-v2 Pulsedlight LIDAR range-finding sensor -ramtron,24c64 i2c serial eeprom (24cxx) ricoh,r2025sd I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC ricoh,r2221tl I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC ricoh,rs5c372a I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC diff --git a/Documentation/devicetree/bindings/iio/accel/mma8452.txt b/Documentation/devicetree/bindings/iio/accel/mma8452.txt index e3c37467d..3c10e8581 100644 --- a/Documentation/devicetree/bindings/iio/accel/mma8452.txt +++ b/Documentation/devicetree/bindings/iio/accel/mma8452.txt @@ -7,13 +7,18 @@ Required properties: * "fsl,mma8453" * "fsl,mma8652" * "fsl,mma8653" + - reg: the I2C address of the chip Optional properties: - interrupt-parent: should be the phandle for the interrupt controller + - interrupts: interrupt mapping for GPIO IRQ + - interrupt-names: should contain "INT1" and/or "INT2", the accelerometer's + interrupt line in use. 
+ Example: mma8453fc@1d { @@ -21,4 +26,5 @@ Example: reg = <0x1d>; interrupt-parent = <&gpio1>; interrupts = <5 0>; + interrupt-names = "INT2"; }; diff --git a/Documentation/devicetree/bindings/iio/adc/imx7d-adc.txt b/Documentation/devicetree/bindings/iio/adc/imx7d-adc.txt new file mode 100644 index 000000000..5c184b940 --- /dev/null +++ b/Documentation/devicetree/bindings/iio/adc/imx7d-adc.txt @@ -0,0 +1,22 @@ +Freescale imx7d ADC bindings + +The devicetree bindings are for the ADC driver written for +imx7d SoC. + +Required properties: +- compatible: Should be "fsl,imx7d-adc" +- reg: Offset and length of the register set for the ADC device +- interrupts: The interrupt number for the ADC device +- clocks: The root clock of the ADC controller +- clock-names: Must contain "adc", matching entry in the clocks property +- vref-supply: The regulator supply ADC reference voltage + +Example: +adc1: adc@30610000 { + compatible = "fsl,imx7d-adc"; + reg = <0x30610000 0x10000>; + interrupts = ; + clocks = <&clks IMX7D_ADC_ROOT_CLK>; + clock-names = "adc"; + vref-supply = <&reg_vcc_3v3_mcu>; +}; diff --git a/Documentation/devicetree/bindings/iio/adc/mcp320x.txt b/Documentation/devicetree/bindings/iio/adc/mcp320x.txt index 2a1f3af30..bcd3ac8e6 100644 --- a/Documentation/devicetree/bindings/iio/adc/mcp320x.txt +++ b/Documentation/devicetree/bindings/iio/adc/mcp320x.txt @@ -10,16 +10,28 @@ must be specified.
Required properties: - compatible: Must be one of the following, depending on the model: - "mcp3001" - "mcp3002" - "mcp3004" - "mcp3008" - "mcp3201" - "mcp3202" - "mcp3204" - "mcp3208" - "mcp3301" + "mcp3001" (DEPRECATED) + "mcp3002" (DEPRECATED) + "mcp3004" (DEPRECATED) + "mcp3008" (DEPRECATED) + "mcp3201" (DEPRECATED) + "mcp3202" (DEPRECATED) + "mcp3204" (DEPRECATED) + "mcp3208" (DEPRECATED) + "mcp3301" (DEPRECATED) + "microchip,mcp3001" + "microchip,mcp3002" + "microchip,mcp3004" + "microchip,mcp3008" + "microchip,mcp3201" + "microchip,mcp3202" + "microchip,mcp3204" + "microchip,mcp3208" + "microchip,mcp3301" + + NOTE: The use of the compatibles with no vendor prefix + is deprecated and only listed because old DT use them. Examples: spi_controller { diff --git a/Documentation/devicetree/bindings/iio/adc/mcp3422.txt b/Documentation/devicetree/bindings/iio/adc/mcp3422.txt index 333139cc0..dcae4ccfc 100644 --- a/Documentation/devicetree/bindings/iio/adc/mcp3422.txt +++ b/Documentation/devicetree/bindings/iio/adc/mcp3422.txt @@ -1,7 +1,8 @@ -* Microchip mcp3422/3/4/6/7/8 chip family (ADC) +* Microchip mcp3421/2/3/4/6/7/8 chip family (ADC) Required properties: - compatible: Should be + "microchip,mcp3421" or "microchip,mcp3422" or "microchip,mcp3423" or "microchip,mcp3424" or diff --git a/Documentation/devicetree/bindings/iio/adc/palmas-gpadc.txt b/Documentation/devicetree/bindings/iio/adc/palmas-gpadc.txt new file mode 100644 index 000000000..4bb9a8606 --- /dev/null +++ b/Documentation/devicetree/bindings/iio/adc/palmas-gpadc.txt @@ -0,0 +1,48 @@ +* Palmas general purpose ADC IP block devicetree bindings + +Channels list: + 0 battery type + 1 battery temp NTC (optional current source) + 2 GP + 3 temp (with ext. diode, optional current source) + 4 GP + 5 GP + 6 VBAT_SENSE + 7 VCC_SENSE + 8 Backup Battery voltage + 9 external charger (VCHG) + 10 VBUS + 11 DC-DC current probe (how does this work?) 
+ 12 internal die temp + 13 internal die temp + 14 USB ID pin voltage + 15 test network + +Required properties: +- compatible : Must be "ti,palmas-gpadc". +- #io-channel-cells: Should be set to <1>. + +Optional sub-nodes: +ti,channel0-current-microamp: Channel 0 current in uA. + Values are rounded to derive 0uA, 5uA, 15uA, 20uA. +ti,channel3-current-microamp: Channel 3 current in uA. + Values are rounded to derive 0uA, 10uA, 400uA, 800uA. +ti,enable-extended-delay: Enable extended delay. + +Example: + +pmic { + compatible = "ti,twl6035-pmic", "ti,palmas-pmic"; + ... + gpadc { + compatible = "ti,palmas-gpadc"; + interrupts = <18 0 + 16 0 + 17 0>; + #io-channel-cells = <1>; + ti,channel0-current-microamp = <5>; + ti,channel3-current-microamp = <10>; + }; + }; + ... +}; diff --git a/Documentation/devicetree/bindings/iio/adc/ti-adc128s052.txt b/Documentation/devicetree/bindings/iio/adc/ti-adc128s052.txt index 15ca6b479..daa2b2c29 100644 --- a/Documentation/devicetree/bindings/iio/adc/ti-adc128s052.txt +++ b/Documentation/devicetree/bindings/iio/adc/ti-adc128s052.txt @@ -1,7 +1,7 @@ -* Texas Instruments' ADC128S052 and ADC122S021 ADC chip +* Texas Instruments' ADC128S052, ADC122S021 and ADC124S021 ADC chip Required properties: - - compatible: Should be "ti,adc128s052" or "ti,adc122s021" + - compatible: Should be "ti,adc128s052", "ti,adc122s021" or "ti,adc124s021" - reg: spi chip select number for the device - vref-supply: The regulator supply for ADC reference voltage diff --git a/Documentation/devicetree/bindings/iio/adc/ti-ads8688.txt b/Documentation/devicetree/bindings/iio/adc/ti-ads8688.txt new file mode 100644 index 000000000..a02337d7e --- /dev/null +++ b/Documentation/devicetree/bindings/iio/adc/ti-ads8688.txt @@ -0,0 +1,20 @@ +* Texas Instruments' ADS8684 and ADS8688 ADC chip + +Required properties: + - compatible: Should be "ti,ads8684" or "ti,ads8688" + - reg: spi chip select number for the device + +Recommended properties: + - spi-max-frequency: Definition as 
per + Documentation/devicetree/bindings/spi/spi-bus.txt + +Optional properties: + - vref-supply: The regulator supply for ADC reference voltage + +Example: +adc@0 { + compatible = "ti,ads8688"; + reg = <0>; + vref-supply = <&vdd_supply>; + spi-max-frequency = <1000000>; +}; diff --git a/Documentation/devicetree/bindings/iio/health/max30100.txt b/Documentation/devicetree/bindings/iio/health/max30100.txt new file mode 100644 index 000000000..f6fbac66a --- /dev/null +++ b/Documentation/devicetree/bindings/iio/health/max30100.txt @@ -0,0 +1,21 @@ +Maxim MAX30100 heart rate and pulse oximeter sensor + +* https://datasheets.maximintegrated.com/en/ds/MAX30100.pdf + +Required properties: + - compatible: must be "maxim,max30100" + - reg: the I2C address of the sensor + - interrupt-parent: should be the phandle for the interrupt controller + - interrupts: the sole interrupt generated by the device + + Refer to interrupt-controller/interrupts.txt for generic + interrupt client node bindings. + +Example: + +max30100@057 { + compatible = "maxim,max30100"; + reg = <57>; + interrupt-parent = <&gpio1>; + interrupts = <16 2>; +}; diff --git a/Documentation/devicetree/bindings/iio/light/us5182d.txt b/Documentation/devicetree/bindings/iio/light/us5182d.txt index 6f0a53014..a61979997 100644 --- a/Documentation/devicetree/bindings/iio/light/us5182d.txt +++ b/Documentation/devicetree/bindings/iio/light/us5182d.txt @@ -7,13 +7,24 @@ Required properties: Optional properties: - upisemi,glass-coef: glass attenuation factor - compensation factor of resolution 1000 for material transmittance. + - upisemi,dark-ths: array of 8 elements containing 16-bit thresholds (adc counts) corresponding to every scale. 
+ - upisemi,upper-dark-gain: 8-bit dark gain compensation factor(4 int and 4 fractional bits - Q4.4) applied when light > threshold + - upisemi,lower-dark-gain: 8-bit dark gain compensation factor(4 int and 4 fractional bits - Q4.4) applied when light < threshold +- upisemi,continuous: This chip has two power modes: one-shot (chip takes one + measurement and then shuts itself down) and continuous ( + chip takes continuous measurements). The one-shot mode is + more power-friendly but the continuous mode may be more + reliable. If this property is specified the continuous + mode will be used instead of the default one-shot one for + raw reads. + If the optional properties are not specified these factors will default to the values in the below example. The glass-coef defaults to no compensation for the covering material. diff --git a/Documentation/devicetree/bindings/iio/st-sensors.txt b/Documentation/devicetree/bindings/iio/st-sensors.txt index d3ccdb190..d4b87cc1e 100644 --- a/Documentation/devicetree/bindings/iio/st-sensors.txt +++ b/Documentation/devicetree/bindings/iio/st-sensors.txt @@ -36,6 +36,7 @@ Accelerometers: - st,lsm303dlm-accel - st,lsm330-accel - st,lsm303agr-accel +- st,lis2dh12-accel Gyroscopes: - st,l3g4200d-gyro diff --git a/Documentation/devicetree/bindings/input/gpio-keys.txt b/Documentation/devicetree/bindings/input/gpio-keys.txt index cf1333d1d..21641236c 100644 --- a/Documentation/devicetree/bindings/input/gpio-keys.txt +++ b/Documentation/devicetree/bindings/input/gpio-keys.txt @@ -6,6 +6,7 @@ Required properties: Optional properties: - autorepeat: Boolean, Enable auto repeat feature of Linux input subsystem. + - label: String, name of the input device. 
Each button (key) is represented as a sub-node of "gpio-keys": Subnode properties: diff --git a/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun67i-sc-nmi.txt b/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun67i-sc-nmi.txt deleted file mode 100644 index d1c5cdabc..000000000 --- a/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun67i-sc-nmi.txt +++ /dev/null @@ -1,27 +0,0 @@ -Allwinner Sunxi NMI Controller -============================== - -Required properties: - -- compatible : should be "allwinner,sun7i-a20-sc-nmi" or - "allwinner,sun6i-a31-sc-nmi" -- reg : Specifies base physical address and size of the registers. -- interrupt-controller : Identifies the node as an interrupt controller -- #interrupt-cells : Specifies the number of cells needed to encode an - interrupt source. The value shall be 2. The first cell is the IRQ number, the - second cell the trigger type as defined in interrupt.txt in this directory. -- interrupt-parent: Specifies the parent interrupt controller. -- interrupts: Specifies the interrupt line (NMI) which is handled by - the interrupt controller in the parent controller's notation. This value - shall be the NMI. 
- -Example: - -sc-nmi-intc@01c00030 { - compatible = "allwinner,sun7i-a20-sc-nmi"; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x01c00030 0x0c>; - interrupt-parent = <&gic>; - interrupts = <0 0 4>; -}; diff --git a/Documentation/devicetree/bindings/interrupt-controller/allwinner,sunxi-nmi.txt b/Documentation/devicetree/bindings/interrupt-controller/allwinner,sunxi-nmi.txt new file mode 100644 index 000000000..81cd36924 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/allwinner,sunxi-nmi.txt @@ -0,0 +1,27 @@ +Allwinner Sunxi NMI Controller +============================== + +Required properties: + +- compatible : should be "allwinner,sun7i-a20-sc-nmi" or + "allwinner,sun6i-a31-sc-nmi" or "allwinner,sun9i-a80-nmi" +- reg : Specifies base physical address and size of the registers. +- interrupt-controller : Identifies the node as an interrupt controller +- #interrupt-cells : Specifies the number of cells needed to encode an + interrupt source. The value shall be 2. The first cell is the IRQ number, the + second cell the trigger type as defined in interrupt.txt in this directory. +- interrupt-parent: Specifies the parent interrupt controller. +- interrupts: Specifies the interrupt line (NMI) which is handled by + the interrupt controller in the parent controller's notation. This value + shall be the NMI. 
+ +Example: + +sc-nmi-intc@01c00030 { + compatible = "allwinner,sun7i-a20-sc-nmi"; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x01c00030 0x0c>; + interrupt-parent = <&gic>; + interrupts = <0 0 4>; +}; diff --git a/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.txt b/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.txt index 7803e77d8..007a5b462 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.txt @@ -24,9 +24,8 @@ Main node required properties: 1 = edge triggered 4 = level triggered - Cells 4 and beyond are reserved for future use. When the 1st cell - has a value of 0 or 1, cells 4 and beyond act as padding, and may be - ignored. It is recommended that padding cells have a value of 0. + Cells 4 and beyond are reserved for future use and must have a value + of 0 if present. - reg : Specifies base physical address(s) and size of the GIC registers, in the following order: diff --git a/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt b/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt index cc56021eb..5a1cb4bc3 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt @@ -18,6 +18,7 @@ Main node required properties: "arm,cortex-a9-gic" "arm,gic-400" "arm,pl390" + "arm,tc11mp-gic" "brcm,brahma-b15-gic" "qcom,msm-8660-qgic" "qcom,msm-qgic2" diff --git a/Documentation/devicetree/bindings/interrupt-controller/hisilicon,mbigen-v2.txt b/Documentation/devicetree/bindings/interrupt-controller/hisilicon,mbigen-v2.txt new file mode 100644 index 000000000..720f7c92e --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/hisilicon,mbigen-v2.txt @@ -0,0 +1,74 @@ +Hisilicon mbigen device tree bindings. 
+======================================= + +Mbigen means: message based interrupt generator. + +MBI is a kind of MSI interrupt only used on non-PCI devices. + +To reduce the wired interrupt number connected to GIC, +Hisilicon designed mbigen to collect and generate interrupt. + + +Non-PCI devices can connect to mbigen and generate the +interrupt by writing ITS register. + +The mbigen chip and devices connected to mbigen have the following properties: + +Mbigen main node required properties: +------------------------------------------- +- compatible: Should be "hisilicon,mbigen-v2" + +- reg: Specifies the base physical address and size of the Mbigen + registers. + +- interrupt-controller: Identifies the node as an interrupt controller + +- msi-parent: Specifies the MSI controller this mbigen uses. + For more detailed information, please refer to the generic msi-parent binding in + Documentation/devicetree/bindings/interrupt-controller/msi.txt. + +- num-pins: the total number of pins implemented in this Mbigen + instance. + +- #interrupt-cells : Specifies the number of cells needed to encode an + interrupt source. The value must be 2. + + The 1st cell is hardware pin number of the interrupt. This number is local to + each mbigen chip and in the range from 0 to the maximum interrupts number + of the mbigen. + + The 2nd cell is the interrupt trigger type. + The value of this cell should be: + 1: rising edge triggered + or + 4: high level triggered + +Examples: + + mbigen_device_gmac:intc { + compatible = "hisilicon,mbigen-v2"; + reg = <0x0 0xc0080000 0x0 0x10000>; + interrupt-controller; + msi-parent = <&its_dsa 0x40b1c>; + num-pins = <9>; + #interrupt-cells = <2>; + }; + +Devices connected to mbigen required properties: +---------------------------------------------------- +-interrupt-parent: Specifies the mbigen device node to which the device is connected. + +-interrupts: Specifies the interrupt source. 
+ For the specific information of each cell in this property,please refer to + the "interrupt-cells" description mentioned above. + +Examples: + gmac0: ethernet@c2080000 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0 0xc2080000 0 0x20000>, + <0 0xc0000000 0 0x1000>; + interrupt-parent = <&mbigen_device_gmac>; + interrupts = <656 1>, + <657 1>; + }; diff --git a/Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt b/Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt index afef6a85a..b8e1674c7 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt @@ -14,6 +14,7 @@ Required properties: "mediatek,mt6582-sysirq" "mediatek,mt6580-sysirq" "mediatek,mt6577-sysirq" + "mediatek,mt2701-sysirq" - interrupt-controller : Identifies the node as an interrupt controller - #interrupt-cells : Use the same format as specified by GIC in Documentation/devicetree/bindings/arm/gic.txt diff --git a/Documentation/devicetree/bindings/interrupt-controller/microchip,pic32-evic.txt b/Documentation/devicetree/bindings/interrupt-controller/microchip,pic32-evic.txt new file mode 100644 index 000000000..c3a1b37c4 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/microchip,pic32-evic.txt @@ -0,0 +1,67 @@ +Microchip PIC32 Interrupt Controller +==================================== + +The Microchip PIC32 contains an Enhanced Vectored Interrupt Controller (EVIC). +It handles all internal and external interrupts. This controller exists outside +of the CPU and is the arbitrator of all interrupts (including interrupts from +the CPU itself) before they are presented to the CPU. + +External interrupts have a software configurable edge polarity. Non external +interrupts have a type and polarity that is determined by the source of the +interrupt. 
+ +Required properties +------------------- + +- compatible: Should be "microchip,pic32mzda-evic" +- reg: Specifies physical base address and size of register range. +- interrupt-controller: Identifies the node as an interrupt controller. +- #interrupt-cells: Specifies the number of cells used to encode an interrupt + source connected to this controller. The value shall be 2 and interrupt + descriptor shall have the following format: + + <hw_irq irq_type> + + hw_irq - represents the hardware interrupt number as in the data sheet. + irq_type - is used to describe the type and polarity of an interrupt. For + internal interrupts use IRQ_TYPE_EDGE_RISING for non persistent interrupts and + IRQ_TYPE_LEVEL_HIGH for persistent interrupts. For external interrupts use + IRQ_TYPE_EDGE_RISING or IRQ_TYPE_EDGE_FALLING to select the desired polarity. + +Optional properties +------------------- +- microchip,external-irqs: u32 array of external interrupts with software + polarity configuration. This array corresponds to the bits in the INTCON + SFR. + +Example +------- + +evic: interrupt-controller@1f810000 { + compatible = "microchip,pic32mzda-evic"; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x1f810000 0x1000>; + microchip,external-irqs = <3 8 13 18 23>; +}; + +Each device/peripheral must request its interrupt line with the associated type +and polarity. + +Internal interrupt DTS snippet +------------------------------ + +device@1f800000 { + ... + interrupts = <113 IRQ_TYPE_LEVEL_HIGH>; + ... +}; + +External interrupt DTS snippet +------------------------------ + +device@1f800000 { + ... + interrupts = <3 IRQ_TYPE_EDGE_RISING>; + ... 
+}; diff --git a/Documentation/devicetree/bindings/interrupt-controller/qca,ath79-misc-intc.txt b/Documentation/devicetree/bindings/interrupt-controller/qca,ath79-misc-intc.txt index ec96b1f01..475ae9bd5 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/qca,ath79-misc-intc.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/qca,ath79-misc-intc.txt @@ -22,7 +22,7 @@ Interrupt Controllers bindings used by client devices. Example: interrupt-controller@18060010 { - compatible = "qca,ar9132-misc-intc", qca,ar7100-misc-intc"; + compatible = "qca,ar9132-misc-intc", "qca,ar7100-misc-intc"; reg = <0x18060010 0x4>; interrupt-parent = <&cpuintc>; diff --git a/Documentation/devicetree/bindings/interrupt-controller/technologic,ts4800.txt b/Documentation/devicetree/bindings/interrupt-controller/technologic,ts4800.txt new file mode 100644 index 000000000..7f15f1b03 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/technologic,ts4800.txt @@ -0,0 +1,16 @@ +TS-4800 FPGA interrupt controller + +TS-4800 FPGA has an internal interrupt controller. When one of the +interrupts is triggered, the SoC is notified, usually using a GPIO as +parent interrupt source. + +Required properties: +- compatible: should be "technologic,ts4800-irqc" +- interrupt-controller: identifies the node as an interrupt controller +- reg: physical base address of the controller and length of memory mapped + region +- #interrupt-cells: specifies the number of cells needed to encode an interrupt + source, should be 1. 
+- interrupt-parent: phandle to the parent interrupt controller this one is + cascaded from +- interrupts: specifies the interrupt line in the interrupt-parent controller diff --git a/Documentation/devicetree/bindings/iommu/renesas,ipmmu-vmsa.txt b/Documentation/devicetree/bindings/iommu/renesas,ipmmu-vmsa.txt index cd29083e1..48ffb38f6 100644 --- a/Documentation/devicetree/bindings/iommu/renesas,ipmmu-vmsa.txt +++ b/Documentation/devicetree/bindings/iommu/renesas,ipmmu-vmsa.txt @@ -7,7 +7,15 @@ connected to the IPMMU through a port called micro-TLB. Required Properties: - - compatible: Must contain "renesas,ipmmu-vmsa". + - compatible: Must contain SoC-specific and generic entries from below. + + - "renesas,ipmmu-r8a73a4" for the R8A73A4 (R-Mobile APE6) IPMMU. + - "renesas,ipmmu-r8a7790" for the R8A7790 (R-Car H2) IPMMU. + - "renesas,ipmmu-r8a7791" for the R8A7791 (R-Car M2-W) IPMMU. + - "renesas,ipmmu-r8a7793" for the R8A7793 (R-Car M2-N) IPMMU. + - "renesas,ipmmu-r8a7794" for the R8A7794 (R-Car E2) IPMMU. + - "renesas,ipmmu-vmsa" for generic R-Car Gen2 VMSA-compatible IPMMU. + - reg: Base address and size of the IPMMU registers. - interrupts: Specifiers for the MMU fault interrupts. 
For instances that support secure mode two interrupts must be specified, for non-secure and @@ -27,7 +35,7 @@ node with the following property: Example: R8A7791 IPMMU-MX and VSP1-D0 bus master ipmmu_mx: mmu@fe951000 { - compatible = "renasas,ipmmu-vmsa"; + compatible = "renesas,ipmmu-r8a7791", "renesas,ipmmu-vmsa"; reg = <0 0xfe951000 0 0x1000>; interrupts = <0 222 IRQ_TYPE_LEVEL_HIGH>, <0 221 IRQ_TYPE_LEVEL_HIGH>; diff --git a/Documentation/devicetree/bindings/media/exynos5-gsc.txt b/Documentation/devicetree/bindings/media/exynos5-gsc.txt index 0604d42f3..5fe9372ab 100644 --- a/Documentation/devicetree/bindings/media/exynos5-gsc.txt +++ b/Documentation/devicetree/bindings/media/exynos5-gsc.txt @@ -7,6 +7,10 @@ Required properties: - reg: should contain G-Scaler physical address location and length. - interrupts: should contain G-Scaler interrupt number +Optional properties: +- samsung,sysreg: handle to syscon used to control the system registers to + set writeback input and destination + Example: gsc_0: gsc@0x13e00000 { diff --git a/Documentation/devicetree/bindings/media/i2c/adp1653.txt b/Documentation/devicetree/bindings/media/i2c/adp1653.txt index 5ce66f210..4cce0de40 100644 --- a/Documentation/devicetree/bindings/media/i2c/adp1653.txt +++ b/Documentation/devicetree/bindings/media/i2c/adp1653.txt @@ -12,12 +12,13 @@ There are two LED outputs available - flash and indicator. One LED is represented by one child node, nodes need to be named "flash" and "indicator". 
Required properties of the LED child node: -- max-microamp : see Documentation/devicetree/bindings/leds/common.txt +- led-max-microamp : see Documentation/devicetree/bindings/leds/common.txt Required properties of the flash LED child node: - flash-max-microamp : see Documentation/devicetree/bindings/leds/common.txt - flash-timeout-us : see Documentation/devicetree/bindings/leds/common.txt +- led-max-microamp : see Documentation/devicetree/bindings/leds/common.txt Example: @@ -29,9 +30,9 @@ Example: flash { flash-timeout-us = <500000>; flash-max-microamp = <320000>; - max-microamp = <50000>; + led-max-microamp = <50000>; }; indicator { - max-microamp = <17500>; + led-max-microamp = <17500>; }; }; diff --git a/Documentation/devicetree/bindings/media/stih407-c8sectpfe.txt b/Documentation/devicetree/bindings/media/stih407-c8sectpfe.txt index d4def767b..cc51b1fd6 100644 --- a/Documentation/devicetree/bindings/media/stih407-c8sectpfe.txt +++ b/Documentation/devicetree/bindings/media/stih407-c8sectpfe.txt @@ -35,7 +35,7 @@ Required properties (tsin (child) node): - tsin-num : tsin id of the InputBlock (must be between 0 to 6) - i2c-bus : phandle to the I2C bus DT node which the demodulators & tuners on this tsin channel are connected. -- rst-gpio : reset gpio for this tsin channel. +- reset-gpios : reset gpio for this tsin channel. 
Optional properties (tsin (child) node): @@ -55,27 +55,27 @@ Example: status = "okay"; reg = <0x08a20000 0x10000>, <0x08a00000 0x4000>; reg-names = "stfe", "stfe-ram"; - interrupts = <0 34 0>, <0 35 0>; + interrupts = <GIC_SPI 34 IRQ_TYPE_NONE>, <GIC_SPI 35 IRQ_TYPE_NONE>; interrupt-names = "stfe-error-irq", "stfe-idle-irq"; - - pinctrl-names = "tsin0-serial", "tsin0-parallel", "tsin3-serial", - "tsin4-serial", "tsin5-serial"; - pinctrl-0 = <&pinctrl_tsin0_serial>; pinctrl-1 = <&pinctrl_tsin0_parallel>; pinctrl-2 = <&pinctrl_tsin3_serial>; pinctrl-3 = <&pinctrl_tsin4_serial_alt3>; pinctrl-4 = <&pinctrl_tsin5_serial_alt1>; - + pinctrl-names = "tsin0-serial", + "tsin0-parallel", + "tsin3-serial", + "tsin4-serial", + "tsin5-serial"; clocks = <&clk_s_c0_flexgen CLK_PROC_STFE>; - clock-names = "stfe"; + clock-names = "c8sectpfe"; /* tsin0 is TSA on NIMA */ tsin0: port@0 { tsin-num = <0>; serial-not-parallel; i2c-bus = <&ssc2>; - rst-gpio = <&pio15 4 0>; + reset-gpios = <&pio15 4 GPIO_ACTIVE_HIGH>; dvb-card = <STV0367_TDA18212_NIMA_1>; }; @@ -83,7 +83,7 @@ Example: tsin-num = <3>; serial-not-parallel; i2c-bus = <&ssc3>; - rst-gpio = <&pio15 7 0>; + reset-gpios = <&pio15 7 GPIO_ACTIVE_HIGH>; dvb-card = <STV0367_TDA18212_NIMB_1>; }; }; diff --git a/Documentation/devicetree/bindings/memory-controllers/ath79-ddr-controller.txt b/Documentation/devicetree/bindings/memory-controllers/ath79-ddr-controller.txt index efe35a065..c81af75bc 100644 --- a/Documentation/devicetree/bindings/memory-controllers/ath79-ddr-controller.txt +++ b/Documentation/devicetree/bindings/memory-controllers/ath79-ddr-controller.txt @@ -1,6 +1,6 @@ Binding for Qualcomm Atheros AR7xxx/AR9xxx DDR controller -The DDR controller of the ARxxx and AR9xxx families provides an interface +The DDR controller of the AR7xxx and AR9xxx families provides an interface to flush the FIFO between various devices and the DDR. This is mainly used by the IRQ controller to flush the FIFO before running the interrupt handler of such devices. 
@@ -11,9 +11,9 @@ Required properties: "qca,[ar7100|ar7240]-ddr-controller" as fallback. On SoC with PCI support "qca,ar7100-ddr-controller" should be used as fallback, otherwise "qca,ar7240-ddr-controller" should be used. -- reg: Base address and size of the controllers memory area -- #qca,ddr-wb-channel-cells: has to be 1, the index of the write buffer - channel +- reg: Base address and size of the controller's memory area +- #qca,ddr-wb-channel-cells: Specifies the number of cells needed to encode + the write buffer channel index, should be 1. Example: diff --git a/Documentation/devicetree/bindings/mfd/arizona.txt b/Documentation/devicetree/bindings/mfd/arizona.txt index 18be0cbfb..9b30011ec 100644 --- a/Documentation/devicetree/bindings/mfd/arizona.txt +++ b/Documentation/devicetree/bindings/mfd/arizona.txt @@ -1,4 +1,4 @@ -Wolfson Arizona class audio SoCs +Cirrus Logic/Wolfson Microelectronics Arizona class audio SoCs These devices are audio SoCs with extensive digital capabilites and a range of analogue I/O. @@ -6,12 +6,14 @@ of analogue I/O. Required properties: - compatible : One of the following chip-specific strings: + "cirrus,cs47l24" "wlf,wm5102" "wlf,wm5110" "wlf,wm8280" "wlf,wm8997" "wlf,wm8998" "wlf,wm1814" + "wlf,wm1831" - reg : I2C slave address when connected using I2C, chip select number when using SPI. @@ -24,7 +26,7 @@ Required properties: - #interrupt-cells: the number of cells to describe an IRQ, this should be 2. The first cell is the IRQ number. The second cell is the flags, encoded as the trigger masks from - Documentation/devicetree/bindings/interrupts.txt + Documentation/devicetree/bindings/interrupt-controller/interrupts.txt - gpio-controller : Indicates this device is a GPIO controller. - #gpio-cells : Must be 2. 
The first cell is the pin number and the @@ -41,10 +43,21 @@ Required properties: - SPKVDD-supply : Speaker driver power supply (wm8997) + - DCVDD-supply : Main power supply (cs47l24, wm1831) + + - MICVDD-supply : Microphone power supply (cs47l24, wm1831) + Optional properties: - wlf,reset : GPIO specifier for the GPIO controlling /RESET + - clocks: Should reference the clocks supplied on MCLK1 and MCLK2 + - clock-names: Should contain two strings: + "mclk1" for the clock supplied on MCLK1, recommended to be a high + quality audio reference clock + "mclk2" for the clock supplied on MCLK2, recommended to be an always on + 32k clock + - wlf,gpio-defaults : A list of GPIO configuration register values. Defines for the appropriate values can found in . If absent, no configuration of these registers is performed. If any entry has @@ -59,6 +72,12 @@ Optional properties: that have not been specified are set to 0 by default. Entries are: (wm5102, wm5110, wm8280, wm8997) (wm8998, wm1814) + - wlf,out-mono : A list of boolean values indicating whether each output is + mono or stereo. Position within the list indicates the output affected + (e.g. First entry in the list corresponds to output 1). A non-zero value + indicates a mono output. If present, the number of values should be less + than or equal to the number of outputs, if fewer values are supplied the + additional outputs will be treated as stereo. - wlf,dmic-ref : DMIC reference voltage source for each input, can be selected from either MICVDD or one of the MICBIAS's, defines @@ -69,6 +88,7 @@ Optional properties: - DCVDD-supply, MICVDD-supply : Power supplies, only need to be specified if they are being externally supplied.
As covered in Documentation/devicetree/bindings/regulator/regulator.txt + (wm5102, wm5110, wm8280, wm8997, wm8998, wm1814) Also see child specific device properties: Regulator - ../regulator/arizona-regulator.txt diff --git a/Documentation/devicetree/bindings/mfd/palmas.txt b/Documentation/devicetree/bindings/mfd/palmas.txt index eda898978..8ae1a32bf 100644 --- a/Documentation/devicetree/bindings/mfd/palmas.txt +++ b/Documentation/devicetree/bindings/mfd/palmas.txt @@ -24,7 +24,7 @@ and also the generic series names - #interrupt-cells : should be set to 2 for IRQ number and flags The first cell is the IRQ number. The second cell is the flags, encoded as the trigger masks from - Documentation/devicetree/bindings/interrupts.txt + Documentation/devicetree/bindings/interrupt-controller/interrupts.txt - interrupt-parent : The parent interrupt controller. Optional properties: diff --git a/Documentation/devicetree/bindings/mfd/s2mpa01.txt b/Documentation/devicetree/bindings/mfd/s2mpa01.txt deleted file mode 100644 index c13d3d8c3..000000000 --- a/Documentation/devicetree/bindings/mfd/s2mpa01.txt +++ /dev/null @@ -1,90 +0,0 @@ - -* Samsung S2MPA01 Voltage and Current Regulator - -The Samsung S2MPA01 is a multi-function device which includes high -efficiency buck converters including Dual-Phase buck converter, various LDOs, -and an RTC. It is interfaced to the host controller using an I2C interface. -Each sub-block is addressed by the host system using different I2C slave -addresses. - -Required properties: -- compatible: Should be "samsung,s2mpa01-pmic". -- reg: Specifies the I2C slave address of the PMIC block. It should be 0x66. - -Optional properties: -- interrupt-parent: Specifies the phandle of the interrupt controller to which - the interrupts from s2mpa01 are delivered to. -- interrupts: An interrupt specifier for the sole interrupt generated by the - device. 
- -Optional nodes: -- regulators: The regulators of s2mpa01 that have to be instantiated should be - included in a sub-node named 'regulators'. Regulator nodes and constraints - included in this sub-node use the standard regulator bindings which are - documented elsewhere. - -Properties for BUCK regulator nodes: -- regulator-ramp-delay: ramp delay in uV/us. May be 6250, 12500 - (default), 25000, or 50000. May be 0 for disabling the ramp delay on - BUCK{1,2,3,4}. - - In the absence of the regulator-ramp-delay property, the default ramp - delay will be used. - - NOTE: Some BUCKs share the ramp rate setting i.e. same ramp value will be set - for a particular group of BUCKs. So provide same regulator-ramp-delay=. - - The following BUCKs share ramp settings: - * 1 and 6 - * 2 and 4 - * 8, 9, and 10 - -The following are the names of the regulators that the s2mpa01 PMIC block -supports. Note: The 'n' in LDOn and BUCKn represents the LDO or BUCK number -as per the datasheet of s2mpa01. - - - LDOn - - valid values for n are 1 to 26 - - Example: LDO1, LD02, LDO26 - - BUCKn - - valid values for n are 1 to 10. 
- - Example: BUCK1, BUCK2, BUCK9 - -Example: - - s2mpa01_pmic@66 { - compatible = "samsung,s2mpa01-pmic"; - reg = <0x66>; - - regulators { - ldo1_reg: LDO1 { - regulator-name = "VDD_ALIVE"; - regulator-min-microvolt = <1000000>; - regulator-max-microvolt = <1000000>; - }; - - ldo2_reg: LDO2 { - regulator-name = "VDDQ_MMC2"; - regulator-min-microvolt = <2800000>; - regulator-max-microvolt = <2800000>; - regulator-always-on; - }; - - buck1_reg: BUCK1 { - regulator-name = "vdd_mif"; - regulator-min-microvolt = <950000>; - regulator-max-microvolt = <1350000>; - regulator-always-on; - regulator-boot-on; - }; - - buck2_reg: BUCK2 { - regulator-name = "vdd_arm"; - regulator-min-microvolt = <950000>; - regulator-max-microvolt = <1350000>; - regulator-always-on; - regulator-boot-on; - regulator-ramp-delay = <50000>; - }; - }; - }; diff --git a/Documentation/devicetree/bindings/mfd/s2mps11.txt b/Documentation/devicetree/bindings/mfd/s2mps11.txt deleted file mode 100644 index 09b94c97f..000000000 --- a/Documentation/devicetree/bindings/mfd/s2mps11.txt +++ /dev/null @@ -1,153 +0,0 @@ - -* Samsung S2MPS11/13/14/15 and S2MPU02 Voltage and Current Regulator - -The Samsung S2MPS11 is a multi-function device which includes voltage and -current regulators, RTC, charger controller and other sub-blocks. It is -interfaced to the host controller using an I2C interface. Each sub-block is -addressed by the host system using different I2C slave addresses. - -Required properties: -- compatible: Should be one of the following - - "samsung,s2mps11-pmic" - - "samsung,s2mps13-pmic" - - "samsung,s2mps14-pmic" - - "samsung,s2mps15-pmic" - - "samsung,s2mpu02-pmic". -- reg: Specifies the I2C slave address of the pmic block. It should be 0x66. - -Optional properties: -- interrupt-parent: Specifies the phandle of the interrupt controller to which - the interrupts from s2mps11 are delivered to. -- interrupts: Interrupt specifiers for interrupt sources. 
-- samsung,s2mps11-wrstbi-ground: Indicates that WRSTBI pin of PMIC is pulled - down. When the system is suspended it will always go down thus triggerring - unwanted buck warm reset (setting buck voltages to default values). -- samsung,s2mps11-acokb-ground: Indicates that ACOKB pin of S2MPS11 PMIC is - connected to the ground so the PMIC must manually set PWRHOLD bit in CTRL1 - register to turn off the power. Usually the ACOKB is pulled up to VBATT so - when PWRHOLD pin goes low, the rising ACOKB will trigger power off. - -Optional nodes: -- clocks: s2mps11, s2mps13, s2mps15 and s5m8767 provide three(AP/CP/BT) buffered 32.768 - KHz outputs, so to register these as clocks with common clock framework - instantiate a sub-node named "clocks". It uses the common clock binding - documented in : - [Documentation/devicetree/bindings/clock/clock-bindings.txt] - The s2mps14 provides two (AP/BT) buffered 32.768 KHz outputs. - - #clock-cells: should be 1. - - - The following is the list of clocks generated by the controller. Each clock - is assigned an identifier and client nodes use this identifier to specify - the clock which they consume. - Clock ID Devices - ---------------------------------------------------------- - 32KhzAP 0 S2MPS11, S2MPS13, S2MPS14, S2MPS15, S5M8767 - 32KhzCP 1 S2MPS11, S2MPS13, S2MPS15, S5M8767 - 32KhzBT 2 S2MPS11, S2MPS13, S2MPS14, S2MPS15, S5M8767 - - - compatible: Should be one of: "samsung,s2mps11-clk", "samsung,s2mps13-clk", - "samsung,s2mps14-clk", "samsung,s5m8767-clk" - The s2msp15 uses the same compatible as s2mps13, as both provides similar clocks. - -- regulators: The regulators of s2mps11 that have to be instantiated should be -included in a sub-node named 'regulators'. Regulator nodes included in this -sub-node should be of the format as listed below. 
- - regulator_name { - [standard regulator constraints....]; - }; - - regulator-ramp-delay for BUCKs = [6250/12500/25000(default)/50000] uV/us - - BUCK[2/3/4/6] supports disabling ramp delay on hardware, so explicitly - regulator-ramp-delay = <0> can be used for them to disable ramp delay. - In the absence of the regulator-ramp-delay property, the default ramp - delay will be used. - -NOTE: Some BUCKs share the ramp rate setting i.e. same ramp value will be set -for a particular group of BUCKs. So provide same regulator-ramp-delay. -Grouping of BUCKs sharing ramp rate setting is as follow : BUCK[1, 6], -BUCK[3, 4], and BUCK[7, 8, 10] - -On S2MPS14 the LDO10, LDO11 and LDO12 can be configured to external control -over GPIO. To turn this feature on this property must be added to the regulator -sub-node: - - samsung,ext-control-gpios: GPIO specifier for one GPIO - controlling this regulator (enable/disable); -Example: - LDO12 { - regulator-name = "V_EMMC_2.8V"; - regulator-min-microvolt = <2800000>; - regulator-max-microvolt = <2800000>; - samsung,ext-control-gpios = <&gpk0 2 0>; - }; - - -The regulator constraints inside the regulator nodes use the standard regulator -bindings which are documented elsewhere. - -The following are the names of the regulators that the s2mps11 pmic block -supports. Note: The 'n' in LDOn and BUCKn represents the LDO or BUCK number -as per the datasheet of s2mps11. 
- - - LDOn - - valid values for n are: - - S2MPS11: 1 to 38 - - S2MPS13: 1 to 40 - - S2MPS14: 1 to 25 - - S2MPS15: 1 to 27 - - S2MPU02: 1 to 28 - - Example: LDO1, LDO2, LDO28 - - BUCKn - - valid values for n are: - - S2MPS11: 1 to 10 - - S2MPS13: 1 to 10 - - S2MPS14: 1 to 5 - - S2MPS15: 1 to 10 - - S2MPU02: 1 to 7 - - Example: BUCK1, BUCK2, BUCK9 - -Example: - - s2mps11_pmic@66 { - compatible = "samsung,s2mps11-pmic"; - reg = <0x66>; - - s2m_osc: clocks { - compatible = "samsung,s2mps11-clk"; - #clock-cells = <1>; - clock-output-names = "xx", "yy", "zz"; - }; - - regulators { - ldo1_reg: LDO1 { - regulator-name = "VDD_ABB_3.3V"; - regulator-min-microvolt = <3300000>; - regulator-max-microvolt = <3300000>; - }; - - ldo2_reg: LDO2 { - regulator-name = "VDD_ALIVE_1.1V"; - regulator-min-microvolt = <1100000>; - regulator-max-microvolt = <1100000>; - regulator-always-on; - }; - - buck1_reg: BUCK1 { - regulator-name = "vdd_mif"; - regulator-min-microvolt = <950000>; - regulator-max-microvolt = <1350000>; - regulator-always-on; - regulator-boot-on; - }; - - buck2_reg: BUCK2 { - regulator-name = "vdd_arm"; - regulator-min-microvolt = <950000>; - regulator-max-microvolt = <1350000>; - regulator-always-on; - regulator-boot-on; - regulator-ramp-delay = <50000>; - }; - }; - }; diff --git a/Documentation/devicetree/bindings/mfd/samsung,sec-core.txt b/Documentation/devicetree/bindings/mfd/samsung,sec-core.txt new file mode 100644 index 000000000..cdd079bfc --- /dev/null +++ b/Documentation/devicetree/bindings/mfd/samsung,sec-core.txt @@ -0,0 +1,88 @@ +Binding for Samsung S2M and S5M family multi-function device +============================================================ + +This is a part of device tree bindings for S2M and S5M family multi-function +devices. + +The Samsung S2MPA01, S2MPS11/13/14/15, S2MPU02 and S5M8767 is a family +of multi-function devices which include voltage and current regulators, RTC, +charger controller, clock outputs and other sub-blocks. 
It is interfaced +to the host controller using an I2C interface. Each sub-block is usually +addressed by the host system using different I2C slave addresses. + + +This document describes bindings for the main device node. Optional sub-blocks +must be sub-nodes of it. Bindings for them can be found in: + - bindings/regulator/samsung,s2mpa01.txt + - bindings/regulator/samsung,s2mps11.txt + - bindings/regulator/samsung,s5m8767.txt + - bindings/clock/samsung,s2mps11.txt + + +Required properties: + - compatible: Should be one of the following + - "samsung,s2mpa01-pmic", + - "samsung,s2mps11-pmic", + - "samsung,s2mps13-pmic", + - "samsung,s2mps14-pmic", + - "samsung,s2mps15-pmic", + - "samsung,s2mpu02-pmic", + - "samsung,s5m8767-pmic". + - reg: Specifies the I2C slave address of the pmic block. It should be 0x66. + +Optional properties: + - interrupt-parent: Specifies the phandle of the interrupt controller to which + the interrupts from s2mps11 are delivered. + - interrupts: Interrupt specifiers for interrupt sources. + - samsung,s2mps11-wrstbi-ground: Indicates that WRSTBI pin of PMIC is pulled + down. When the system is suspended it will always go down thus triggering + unwanted buck warm reset (setting buck voltages to default values). + - samsung,s2mps11-acokb-ground: Indicates that ACOKB pin of S2MPS11 PMIC is + connected to the ground so the PMIC must manually set PWRHOLD bit in CTRL1 + register to turn off the power. Usually the ACOKB is pulled up to VBATT so + when PWRHOLD pin goes low, the rising ACOKB will trigger power off.
+ +Example: + + s2mps11_pmic@66 { + compatible = "samsung,s2mps11-pmic"; + reg = <0x66>; + + s2m_osc: clocks { + compatible = "samsung,s2mps11-clk"; + #clock-cells = <1>; + clock-output-names = "xx", "yy", "zz"; + }; + + regulators { + ldo1_reg: LDO1 { + regulator-name = "VDD_ABB_3.3V"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + }; + + ldo2_reg: LDO2 { + regulator-name = "VDD_ALIVE_1.1V"; + regulator-min-microvolt = <1100000>; + regulator-max-microvolt = <1100000>; + regulator-always-on; + }; + + buck1_reg: BUCK1 { + regulator-name = "vdd_mif"; + regulator-min-microvolt = <950000>; + regulator-max-microvolt = <1350000>; + regulator-always-on; + regulator-boot-on; + }; + + buck2_reg: BUCK2 { + regulator-name = "vdd_arm"; + regulator-min-microvolt = <950000>; + regulator-max-microvolt = <1350000>; + regulator-always-on; + regulator-boot-on; + regulator-ramp-delay = <50000>; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/mfd/syscon.txt b/Documentation/devicetree/bindings/mfd/syscon.txt index fe8150bb3..408f76868 100644 --- a/Documentation/devicetree/bindings/mfd/syscon.txt +++ b/Documentation/devicetree/bindings/mfd/syscon.txt @@ -13,6 +13,10 @@ Required properties: - compatible: Should contain "syscon". - reg: the register region can be accessed from syscon +Optional property: +- reg-io-width: the size (in bytes) of the IO accesses that should be + performed on the device. 
+ Examples: gpr: iomuxc-gpr@020e0000 { compatible = "fsl,imx6q-iomuxc-gpr", "syscon"; diff --git a/Documentation/devicetree/bindings/mips/pic32/microchip,pic32mzda.txt b/Documentation/devicetree/bindings/mips/pic32/microchip,pic32mzda.txt new file mode 100644 index 000000000..1c8dbc45f --- /dev/null +++ b/Documentation/devicetree/bindings/mips/pic32/microchip,pic32mzda.txt @@ -0,0 +1,31 @@ +* Microchip PIC32MZDA Platforms + +PIC32MZDA Starter Kit +Required root node properties: + - compatible = "microchip,pic32mzda-sk", "microchip,pic32mzda" + +CPU nodes: +---------- +A "cpus" node is required. Required properties: + - #address-cells: Must be 1. + - #size-cells: Must be 0. +A CPU sub-node is also required. Required properties: + - device_type: Must be "cpu". + - compatible: Must be "mti,mips14KEc". +Example: + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu0: cpu@0 { + device_type = "cpu"; + compatible = "mti,mips14KEc"; + }; + }; + +Boot protocol +-------------- +In accordance with Unified Hosting Interface Reference Manual (MD01069), the +bootloader must pass the following arguments to the kernel: + - $a0: -2. + - $a1: KSEG0 address of the flattened device-tree blob. diff --git a/Documentation/devicetree/bindings/misc/sram.txt b/Documentation/devicetree/bindings/misc/sram.txt deleted file mode 100644 index 42ee9438b..000000000 --- a/Documentation/devicetree/bindings/misc/sram.txt +++ /dev/null @@ -1,67 +0,0 @@ -Generic on-chip SRAM - -Simple IO memory regions to be managed by the genalloc API. - -Required properties: - -- compatible : mmio-sram - -- reg : SRAM iomem address range - -Reserving sram areas: ---------------------- - -Each child of the sram node specifies a region of reserved memory. Each -child node should use a 'reg' property to specify a specific range of -reserved memory. - -Following the generic-names recommended practice, node names should -reflect the purpose of the node. Unit address (@

) should be -appended to the name. - -Required properties in the sram node: - -- #address-cells, #size-cells : should use the same values as the root node -- ranges : standard definition, should translate from local addresses - within the sram to bus addresses - -Required properties in the area nodes: - -- reg : iomem address range, relative to the SRAM range - -Optional properties in the area nodes: - -- compatible : standard definition, should contain a vendor specific string - in the form ,[-] -- pool : indicates that the particular reserved SRAM area is addressable - and in use by another device or devices -- export : indicates that the reserved SRAM area may be accessed outside - of the kernel, e.g. by bootloader or userspace -- label : the name for the reserved partition, if omitted, the label - is taken from the node name excluding the unit address. - -Example: - -sram: sram@5c000000 { - compatible = "mmio-sram"; - reg = <0x5c000000 0x40000>; /* 256 KiB SRAM at address 0x5c000000 */ - - #adress-cells = <1>; - #size-cells = <1>; - ranges = <0 0x5c000000 0x40000>; - - smp-sram@100 { - compatible = "socvendor,smp-sram"; - reg = <0x100 0x50>; - }; - - device-sram@1000 { - reg = <0x1000 0x1000>; - pool; - }; - - exported@20000 { - reg = <0x20000 0x20000>; - export; - }; -}; diff --git a/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt b/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt index cae29eb57..ff611fa66 100644 --- a/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt +++ b/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt @@ -11,6 +11,7 @@ Required properties: - "renesas,mmcif-r8a7740" for the MMCIF found in r8a7740 SoCs - "renesas,mmcif-r8a7790" for the MMCIF found in r8a7790 SoCs - "renesas,mmcif-r8a7791" for the MMCIF found in r8a7791 SoCs + - "renesas,mmcif-r8a7793" for the MMCIF found in r8a7793 SoCs - "renesas,mmcif-r8a7794" for the MMCIF found in r8a7794 SoCs - clocks: reference to the functional clock diff --git 
a/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt b/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt index 4ff7128ee..c2546ced9 100644 --- a/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt +++ b/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt @@ -45,6 +45,8 @@ Required properties: - #size-cells : <0> Optional properties: +- clock : reference to the clock for the NAND controller +- clock-names : "nand" (required for the above clock) - brcm,nand-has-wp : Some versions of this IP include a write-protect (WP) control bit. It is always available on >= v7.0. Use this property to describe the rare @@ -72,6 +74,12 @@ we define additional 'compatible' properties and associated register resources w and enable registers - reg-names: (required) "nand-int-base" + * "brcm,nand-bcm6368" + - compatible: should contain "brcm,nand-bcm", "brcm,nand-bcm6368" + - reg: (required) the 'NAND_INTR_BASE' register range, with combined status + and enable registers, and boot address registers + - reg-names: (required) "nand-int-base" + * "brcm,nand-iproc" - reg: (required) the "IDM" register range, for interrupt enable and APB bus access endianness configuration, and the "EXT" register range, @@ -148,3 +156,27 @@ nand@f0442800 { }; }; }; + +nand@10000200 { + compatible = "brcm,nand-bcm63168", "brcm,nand-bcm6368", + "brcm,brcmnand-v4.0", "brcm,brcmnand"; + reg = <0x10000200 0x180>, + <0x10000600 0x200>, + <0x100000b0 0x10>; + reg-names = "nand", "nand-cache", "nand-int-base"; + interrupt-parent = <&periph_intc>; + interrupts = <50>; + clocks = <&periph_clk 20>; + clock-names = "nand"; + + #address-cells = <1>; + #size-cells = <0>; + + nand0: nandcs@0 { + compatible = "brcm,nandcs"; + reg = <0>; + nand-on-flash-bbt; + nand-ecc-strength = <1>; + nand-ecc-step-size = <512>; + }; +}; diff --git a/Documentation/devicetree/bindings/mtd/fsl-quadspi.txt b/Documentation/devicetree/bindings/mtd/fsl-quadspi.txt index 862aa2f88..00c587b3d 100644 --- 
a/Documentation/devicetree/bindings/mtd/fsl-quadspi.txt +++ b/Documentation/devicetree/bindings/mtd/fsl-quadspi.txt @@ -2,7 +2,8 @@ Required properties: - compatible : Should be "fsl,vf610-qspi", "fsl,imx6sx-qspi", - "fsl,imx7d-qspi", "fsl,imx6ul-qspi" + "fsl,imx7d-qspi", "fsl,imx6ul-qspi", + "fsl,ls1021-qspi" - reg : the first contains the register location and length, the second contains the memory mapping address and length - reg-names: Should contain the reg names "QuadSPI" and "QuadSPI-memory" diff --git a/Documentation/devicetree/bindings/mtd/ingenic,jz4780-nand.txt b/Documentation/devicetree/bindings/mtd/ingenic,jz4780-nand.txt new file mode 100644 index 000000000..29ea5853c --- /dev/null +++ b/Documentation/devicetree/bindings/mtd/ingenic,jz4780-nand.txt @@ -0,0 +1,86 @@ +* Ingenic JZ4780 NAND/BCH + +This file documents the device tree bindings for NAND flash devices on the +JZ4780. NAND devices are connected to the NEMC controller (described in +memory-controllers/ingenic,jz4780-nemc.txt), and thus NAND device nodes must +be children of the NEMC node. + +Required NAND controller device properties: +- compatible: Should be set to "ingenic,jz4780-nand". +- reg: For each bank with a NAND chip attached, should specify a bank number, + an offset of 0 and a size of 0x1000000 (i.e. the whole NEMC bank). + +Optional NAND controller device properties: +- ingenic,bch-controller: To make use of the hardware BCH controller, this + property must contain a phandle for the BCH controller node. The required + properties for this node are described below. If this is not specified, + software BCH will be used instead. + +Optional children nodes: +- Individual NAND chips are children of the NAND controller node. + +Required children node properties: +- reg: An integer ranging from 1 to 6 representing the CS line to use. + +Optional children node properties: +- nand-ecc-step-size: ECC block size in bytes. +- nand-ecc-strength: ECC strength (max number of correctable bits). 
+- nand-ecc-mode: String, operation mode of the NAND ecc mode. "hw" by default +- nand-on-flash-bbt: boolean to enable on flash bbt option, if not present false +- rb-gpios: GPIO specifier for the busy pin. +- wp-gpios: GPIO specifier for the write protect pin. + +Optional child node of NAND chip nodes: +- partitions: see Documentation/devicetree/bindings/mtd/partition.txt + +Example: + +nemc: nemc@13410000 { + ... + + nandc: nand-controller@1 { + compatible = "ingenic,jz4780-nand"; + reg = <1 0 0x1000000>; /* Bank 1 */ + + #address-cells = <1>; + #size-cells = <0>; + + ingenic,bch-controller = <&bch>; + + nand@1 { + reg = <1>; + + nand-ecc-step-size = <1024>; + nand-ecc-strength = <24>; + nand-ecc-mode = "hw"; + nand-on-flash-bbt; + + rb-gpios = <&gpa 20 GPIO_ACTIVE_LOW>; + wp-gpios = <&gpf 22 GPIO_ACTIVE_LOW>; + + partitions { + #address-cells = <2>; + #size-cells = <2>; + ... + } + }; + }; +}; + +The BCH controller is a separate SoC component used for error correction on +NAND devices. The following is a description of the device properties for a +BCH controller. + +Required BCH properties: +- compatible: Should be set to "ingenic,jz4780-bch". +- reg: Should specify the BCH controller registers location and length. +- clocks: Clock for the BCH controller. + +Example: + +bch: bch@134d0000 { + compatible = "ingenic,jz4780-bch"; + reg = <0x134d0000 0x10000>; + + clocks = <&cgu JZ4780_CLK_BCH>; +}; diff --git a/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt b/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt index 2bee68103..2c91c03e7 100644 --- a/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt +++ b/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt @@ -1,15 +1,61 @@ -* MTD SPI driver for ST M25Pxx (and similar) serial flash chips +* SPI NOR flash: ST M25Pxx (and similar) serial flash chips Required properties: - #address-cells, #size-cells : Must be present if the device has sub-nodes representing partitions. 
- compatible : May include a device-specific string consisting of the - manufacturer and name of the chip. Bear in mind the DT binding - is not Linux-only, but in case of Linux, see the "m25p_ids" - table in drivers/mtd/devices/m25p80.c for the list of supported - chips. + manufacturer and name of the chip. A list of supported chip + names follows. Must also include "jedec,spi-nor" for any SPI NOR flash that can be identified by the JEDEC READ ID opcode (0x9F). + + Supported chip names: + at25df321a + at25df641 + at26df081a + mr25h256 + mx25l4005a + mx25l1606e + mx25l6405d + mx25l12805d + mx25l25635e + n25q064 + n25q128a11 + n25q128a13 + n25q512a + s25fl256s1 + s25fl512s + s25sl12801 + s25fl008k + s25fl064k + sst25vf040b + m25p40 + m25p80 + m25p16 + m25p32 + m25p64 + m25p128 + w25x80 + w25x32 + w25q32 + w25q32dw + w25q80bl + w25q128 + w25q256 + + The following chip names have been used historically to + designate quirky versions of flash chips that do not support the + JEDEC READ ID opcode (0x9F): + m25p05-nonjedec + m25p10-nonjedec + m25p20-nonjedec + m25p40-nonjedec + m25p80-nonjedec + m25p16-nonjedec + m25p32-nonjedec + m25p64-nonjedec + m25p128-nonjedec + - reg : Chip-Select number - spi-max-frequency : Maximum frequency of the SPI bus the chip can operate at diff --git a/Documentation/devicetree/bindings/mtd/mtk-quadspi.txt b/Documentation/devicetree/bindings/mtd/mtk-quadspi.txt new file mode 100644 index 000000000..fb314f098 --- /dev/null +++ b/Documentation/devicetree/bindings/mtd/mtk-quadspi.txt @@ -0,0 +1,41 @@ +* Serial NOR flash controller for MTK MT81xx (and similar) + +Required properties: +- compatible: should be "mediatek,mt8173-nor"; +- reg: physical base address and length of the controller's register +- clocks: the phandle of the clocks needed by the nor controller +- clock-names: the names of the clocks + the clocks should be named "spi" and "sf". 
"spi" is used for spi bus, + and "sf" is used for controller, these are the clocks which the + hardware needs to enable the nor flash and nor flash controller. + See Documentation/devicetree/bindings/clock/clock-bindings.txt for details. +- #address-cells: should be <1> +- #size-cells: should be <0> + +The SPI flash must be a child of the nor_flash node and must have a +compatible property. Also see jedec,spi-nor.txt. + +Required properties: +- compatible: May include a device-specific string consisting of the manufacturer + and name of the chip. Must also include "jedec,spi-nor" for any + SPI NOR flash that can be identified by the JEDEC READ ID opcode (0x9F). +- reg : Chip-Select number + +Example: + +nor_flash: spi@1100d000 { + compatible = "mediatek,mt8173-nor"; + reg = <0 0x1100d000 0 0xe0>; + clocks = <&pericfg CLK_PERI_SPI>, + <&topckgen CLK_TOP_SPINFI_IFR_SEL>; + clock-names = "spi", "sf"; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + + flash@0 { + compatible = "jedec,spi-nor"; + reg = <0>; + }; +}; + diff --git a/Documentation/devicetree/bindings/mtd/partition.txt b/Documentation/devicetree/bindings/mtd/partition.txt index 1c63e4065..81a224da6 100644 --- a/Documentation/devicetree/bindings/mtd/partition.txt +++ b/Documentation/devicetree/bindings/mtd/partition.txt @@ -32,6 +32,8 @@ Optional properties: partition should only be mounted read-only. This is usually used for flash partitions containing early-boot firmware images or data which should not be clobbered.
+- lock : Do not unlock the partition at initialization time (not supported on + all devices) Examples: diff --git a/Documentation/devicetree/bindings/net/brcm,bcmgenet.txt b/Documentation/devicetree/bindings/net/brcm,bcmgenet.txt index 451fef26b..10587bdad 100644 --- a/Documentation/devicetree/bindings/net/brcm,bcmgenet.txt +++ b/Documentation/devicetree/bindings/net/brcm,bcmgenet.txt @@ -68,7 +68,7 @@ ethernet@f0b60000 { phy1: ethernet-phy@1 { max-speed = <1000>; reg = <0x1>; - compatible = "brcm,28nm-gphy", "ethernet-phy-ieee802.3-c22"; + compatible = "ethernet-phy-ieee802.3-c22"; }; }; }; @@ -115,7 +115,7 @@ ethernet@f0ba0000 { phy0: ethernet-phy@0 { max-speed = <1000>; reg = <0x0>; - compatible = "brcm,bcm53125", "ethernet-phy-ieee802.3-c22"; + compatible = "ethernet-phy-ieee802.3-c22"; }; }; }; diff --git a/Documentation/devicetree/bindings/net/cdns-emac.txt b/Documentation/devicetree/bindings/net/cdns-emac.txt deleted file mode 100644 index 4451ee973..000000000 --- a/Documentation/devicetree/bindings/net/cdns-emac.txt +++ /dev/null @@ -1,20 +0,0 @@ -* Cadence EMAC Ethernet controller - -Required properties: -- compatible: Should be "cdns,[-]{emac}" - Use "cdns,at91rm9200-emac" Atmel at91rm9200 SoC. - Use "cdns,zynq-gem" Xilinx Zynq-7xxx SoC. - Or the generic form: "cdns,emac". -- reg: Address and length of the register set for the device -- interrupts: Should contain macb interrupt -- phy-mode: see ethernet.txt file in the same directory. 
- -Examples: - - macb0: ethernet@fffc4000 { - compatible = "cdns,at91rm9200-emac"; - reg = <0xfffc4000 0x4000>; - interrupts = <21>; - phy-mode = "rmii"; - local-mac-address = [3a 0e 03 04 05 06]; - }; diff --git a/Documentation/devicetree/bindings/net/dsa/dsa.txt b/Documentation/devicetree/bindings/net/dsa/dsa.txt index 04e6bef3a..5fdbbcdf8 100644 --- a/Documentation/devicetree/bindings/net/dsa/dsa.txt +++ b/Documentation/devicetree/bindings/net/dsa/dsa.txt @@ -31,6 +31,8 @@ A switch child node has the following optional property: switch. Must be set if the switch can not detect the presence and/or size of a connected EEPROM, otherwise optional. +- reset-gpios : phandle and specifier to a gpio line connected to + reset pin of the switch chip. A switch may have multiple "port" children nodes @@ -114,6 +116,7 @@ Example: #size-cells = <0>; reg = <17 1>; /* MDIO address 17, switch 1 in tree */ mii-bus = <&mii_bus1>; + reset-gpios = <&gpio5 1 GPIO_ACTIVE_LOW>; switch1port0: port@0 { reg = <0>; diff --git a/Documentation/devicetree/bindings/net/hisilicon-hns-dsaf.txt b/Documentation/devicetree/bindings/net/hisilicon-hns-dsaf.txt index 80411b2f0..ecacfa44b 100644 --- a/Documentation/devicetree/bindings/net/hisilicon-hns-dsaf.txt +++ b/Documentation/devicetree/bindings/net/hisilicon-hns-dsaf.txt @@ -4,8 +4,6 @@ Required properties: - compatible: should be "hisilicon,hns-dsaf-v1" or "hisilicon,hns-dsaf-v2". "hisilicon,hns-dsaf-v1" is for hip05. "hisilicon,hns-dsaf-v2" is for Hi1610 and Hi1612. -- dsa-name: dsa fabric name who provide this interface. - should be "dsafX", X is the dsaf id. - mode: dsa fabric mode string. 
only support one of dsaf modes like these: "2port-64vf", "6port-16rss", @@ -26,9 +24,8 @@ Required properties: Example: -dsa: dsa@c7000000 { +dsaf0: dsa@c7000000 { compatible = "hisilicon,hns-dsaf-v1"; - dsa_name = "dsaf0"; mode = "6port-16rss"; interrupt-parent = <&mbigen_dsa>; reg = <0x0 0xC0000000 0x0 0x420000 diff --git a/Documentation/devicetree/bindings/net/hisilicon-hns-mdio.txt b/Documentation/devicetree/bindings/net/hisilicon-hns-mdio.txt index 9c23fdf25..4a7ede965 100644 --- a/Documentation/devicetree/bindings/net/hisilicon-hns-mdio.txt +++ b/Documentation/devicetree/bindings/net/hisilicon-hns-mdio.txt @@ -1,7 +1,12 @@ Hisilicon MDIO bus controller Properties: -- compatible: "hisilicon,mdio","hisilicon,hns-mdio". +- compatible: can be one of: + "hisilicon,hns-mdio" + "hisilicon,mdio" + "hisilicon,hns-mdio" is recommended to be used for hip05 and later SOCs, + while "hisilicon,mdio" is optional for backwards compatibility only on + hip04 Soc. - reg: The base address of the MDIO bus controller register bank. - #address-cells: Must be <1>. - #size-cells: Must be <0>. MDIO addresses have no size component. diff --git a/Documentation/devicetree/bindings/net/hisilicon-hns-nic.txt b/Documentation/devicetree/bindings/net/hisilicon-hns-nic.txt index 41d19be70..e6a9d1c30 100644 --- a/Documentation/devicetree/bindings/net/hisilicon-hns-nic.txt +++ b/Documentation/devicetree/bindings/net/hisilicon-hns-nic.txt @@ -4,8 +4,9 @@ Required properties: - compatible: "hisilicon,hns-nic-v1" or "hisilicon,hns-nic-v2". "hisilicon,hns-nic-v1" is for hip05. "hisilicon,hns-nic-v2" is for Hi1610 and Hi1612. -- ae-name: accelerator name who provides this interface, - is simply a name referring to the name of name in the accelerator node. +- ae-handle: accelerator engine handle for hns, + specifies a reference to the associating hardware driver node. + see Documentation/devicetree/bindings/net/hisilicon-hns-dsaf.txt - port-id: is the index of port provided by DSAF (the accelerator). 
DSAF can connect to 8 PHYs. Port 0 to 1 are both used for adminstration purpose. They are called debug ports. @@ -41,7 +42,7 @@ Example: ethernet@0{ compatible = "hisilicon,hns-nic-v1"; - ae-name = "dsaf0"; + ae-handle = <&dsaf0>; port-id = <0>; local-mac-address = [a2 14 e4 4b 56 76]; }; diff --git a/Documentation/devicetree/bindings/net/ieee802154/adf7242.txt b/Documentation/devicetree/bindings/net/ieee802154/adf7242.txt new file mode 100644 index 000000000..dea5124cd --- /dev/null +++ b/Documentation/devicetree/bindings/net/ieee802154/adf7242.txt @@ -0,0 +1,18 @@ +* ADF7242 IEEE 802.15.4 * + +Required properties: + - compatible: should be "adi,adf7242" + - spi-max-frequency: maximal bus speed (12.5 MHz) + - reg: the chipselect index + - interrupts: the interrupt generated by the device via pin IRQ1. + IRQ_TYPE_LEVEL_HIGH (4) or IRQ_TYPE_EDGE_FALLING (1) + +Example: + + adf7242@0 { + compatible = "adi,adf7242"; + spi-max-frequency = <10000000>; + reg = <0>; + interrupts = <98 IRQ_TYPE_LEVEL_HIGH>; + interrupt-parent = <&gpio3>; + }; diff --git a/Documentation/devicetree/bindings/net/macb.txt b/Documentation/devicetree/bindings/net/macb.txt index b5d79761a..d2e243b1e 100644 --- a/Documentation/devicetree/bindings/net/macb.txt +++ b/Documentation/devicetree/bindings/net/macb.txt @@ -2,15 +2,19 @@ Required properties: - compatible: Should be "cdns,[-]{macb|gem}" + Use "cdns,at91rm9200-emac" Atmel at91rm9200 SoC. Use "cdns,at91sam9260-macb" for Atmel at91sam9 SoCs or the 10/100Mbit IP available on sama5d3 SoCs. + Use "cdns,np4-macb" for NP4 SoC devices. Use "cdns,at32ap7000-macb" for other 10/100 usage or use the generic form: "cdns,macb". Use "cdns,pc302-gem" for Picochip picoXcell pc302 and later devices based on the Cadence GEM, or the generic form: "cdns,gem". Use "atmel,sama5d2-gem" for the GEM IP (10/100) available on Atmel sama5d2 SoCs. Use "atmel,sama5d3-gem" for the Gigabit IP available on Atmel sama5d3 SoCs. 
Use "atmel,sama5d4-gem" for the GEM IP (10/100) available on Atmel sama5d4 SoCs. + Use "cdns,zynq-gem" Xilinx Zynq-7xxx SoC. Use "cdns,zynqmp-gem" for Zynq Ultrascale+ MPSoC. + Or the generic form: "cdns,emac". - reg: Address and length of the register set for the device - interrupts: Should contain macb interrupt - phy-mode: See ethernet.txt file in the same directory. @@ -19,6 +23,9 @@ Required properties: Optional elements: 'tx_clk' - clocks: Phandles to input clocks. +Optional properties for PHY child node: +- reset-gpios : Should specify the gpio for phy reset + Examples: macb0: ethernet@fffc4000 { @@ -29,4 +36,8 @@ Examples: local-mac-address = [3a 0e 03 04 05 06]; clock-names = "pclk", "hclk", "tx_clk"; clocks = <&clkc 30>, <&clkc 30>, <&clkc 13>; + ethernet-phy@1 { + reg = <0x1>; + reset-gpios = <&pioE 6 1>; + }; }; diff --git a/Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt b/Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt index aeea50c84..d0cb86939 100644 --- a/Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt +++ b/Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt @@ -6,12 +6,17 @@ Required properties: - interrupts: interrupt for the device - phy: See ethernet.txt file in the same directory. - phy-mode: See ethernet.txt file in the same directory -- clocks: a pointer to the reference clock for this device. +- clocks: List of clocks for this device. At least one clock is + mandatory for the core clock. If several clocks are given, then the + clock-names property must be used to identify them. Optional properties: - tx-csum-limit: maximum mtu supported by port that allow TX checksum. Value is presented in bytes. If not used, by default 1600B is set for "marvell,armada-370-neta" and 9800B for others. +- clock-names: List of names corresponding to clocks property; shall be + "core" for core clock and "bus" for the optional bus clock. 
+ Example: diff --git a/Documentation/devicetree/bindings/net/mdio-mux-gpio.txt b/Documentation/devicetree/bindings/net/mdio-mux-gpio.txt index 79384113c..694987d3c 100644 --- a/Documentation/devicetree/bindings/net/mdio-mux-gpio.txt +++ b/Documentation/devicetree/bindings/net/mdio-mux-gpio.txt @@ -38,7 +38,6 @@ Example : phy11: ethernet-phy@1 { reg = <1>; - compatible = "marvell,88e1149r"; marvell,reg-init = <3 0x10 0 0x5777>, <3 0x11 0 0x00aa>, <3 0x12 0 0x4105>, @@ -48,7 +47,6 @@ Example : }; phy12: ethernet-phy@2 { reg = <2>; - compatible = "marvell,88e1149r"; marvell,reg-init = <3 0x10 0 0x5777>, <3 0x11 0 0x00aa>, <3 0x12 0 0x4105>, @@ -58,7 +56,6 @@ Example : }; phy13: ethernet-phy@3 { reg = <3>; - compatible = "marvell,88e1149r"; marvell,reg-init = <3 0x10 0 0x5777>, <3 0x11 0 0x00aa>, <3 0x12 0 0x4105>, @@ -68,7 +65,6 @@ Example : }; phy14: ethernet-phy@4 { reg = <4>; - compatible = "marvell,88e1149r"; marvell,reg-init = <3 0x10 0 0x5777>, <3 0x11 0 0x00aa>, <3 0x12 0 0x4105>, @@ -85,7 +81,6 @@ Example : phy21: ethernet-phy@1 { reg = <1>; - compatible = "marvell,88e1149r"; marvell,reg-init = <3 0x10 0 0x5777>, <3 0x11 0 0x00aa>, <3 0x12 0 0x4105>, @@ -95,7 +90,6 @@ Example : }; phy22: ethernet-phy@2 { reg = <2>; - compatible = "marvell,88e1149r"; marvell,reg-init = <3 0x10 0 0x5777>, <3 0x11 0 0x00aa>, <3 0x12 0 0x4105>, @@ -105,7 +99,6 @@ Example : }; phy23: ethernet-phy@3 { reg = <3>; - compatible = "marvell,88e1149r"; marvell,reg-init = <3 0x10 0 0x5777>, <3 0x11 0 0x00aa>, <3 0x12 0 0x4105>, @@ -115,7 +108,6 @@ Example : }; phy24: ethernet-phy@4 { reg = <4>; - compatible = "marvell,88e1149r"; marvell,reg-init = <3 0x10 0 0x5777>, <3 0x11 0 0x00aa>, <3 0x12 0 0x4105>, diff --git a/Documentation/devicetree/bindings/net/mdio-mux.txt b/Documentation/devicetree/bindings/net/mdio-mux.txt index f65606f8d..491f5bd55 100644 --- a/Documentation/devicetree/bindings/net/mdio-mux.txt +++ b/Documentation/devicetree/bindings/net/mdio-mux.txt @@ -47,7 +47,6 @@ Example 
: phy11: ethernet-phy@1 { reg = <1>; - compatible = "marvell,88e1149r"; marvell,reg-init = <3 0x10 0 0x5777>, <3 0x11 0 0x00aa>, <3 0x12 0 0x4105>, @@ -57,7 +56,6 @@ Example : }; phy12: ethernet-phy@2 { reg = <2>; - compatible = "marvell,88e1149r"; marvell,reg-init = <3 0x10 0 0x5777>, <3 0x11 0 0x00aa>, <3 0x12 0 0x4105>, @@ -67,7 +65,6 @@ Example : }; phy13: ethernet-phy@3 { reg = <3>; - compatible = "marvell,88e1149r"; marvell,reg-init = <3 0x10 0 0x5777>, <3 0x11 0 0x00aa>, <3 0x12 0 0x4105>, @@ -77,7 +74,6 @@ Example : }; phy14: ethernet-phy@4 { reg = <4>; - compatible = "marvell,88e1149r"; marvell,reg-init = <3 0x10 0 0x5777>, <3 0x11 0 0x00aa>, <3 0x12 0 0x4105>, @@ -94,7 +90,6 @@ Example : phy21: ethernet-phy@1 { reg = <1>; - compatible = "marvell,88e1149r"; marvell,reg-init = <3 0x10 0 0x5777>, <3 0x11 0 0x00aa>, <3 0x12 0 0x4105>, @@ -104,7 +99,6 @@ Example : }; phy22: ethernet-phy@2 { reg = <2>; - compatible = "marvell,88e1149r"; marvell,reg-init = <3 0x10 0 0x5777>, <3 0x11 0 0x00aa>, <3 0x12 0 0x4105>, @@ -114,7 +108,6 @@ Example : }; phy23: ethernet-phy@3 { reg = <3>; - compatible = "marvell,88e1149r"; marvell,reg-init = <3 0x10 0 0x5777>, <3 0x11 0 0x00aa>, <3 0x12 0 0x4105>, @@ -124,7 +117,6 @@ Example : }; phy24: ethernet-phy@4 { reg = <4>; - compatible = "marvell,88e1149r"; marvell,reg-init = <3 0x10 0 0x5777>, <3 0x11 0 0x00aa>, <3 0x12 0 0x4105>, diff --git a/Documentation/devicetree/bindings/net/mediatek,mt7620-gsw.txt b/Documentation/devicetree/bindings/net/mediatek,mt7620-gsw.txt new file mode 100644 index 000000000..aa6313024 --- /dev/null +++ b/Documentation/devicetree/bindings/net/mediatek,mt7620-gsw.txt @@ -0,0 +1,26 @@ +Mediatek Gigabit Switch +======================= + +The mediatek gigabit switch can be found on Mediatek SoCs (mt7620, mt7621). 
+ +Required properties: +- compatible: Should be "mediatek,mt7620-gsw" or "mediatek,mt7621-gsw" +- reg: Address and length of the register set for the device +- interrupt-parent: Should be the phandle for the interrupt controller + that services interrupts for this device +- interrupts: Should contain the gigabit switch's interrupt +- resets: Should contain the gigabit switch's resets +- reset-names: Should contain the reset name "gsw" + +Example: + +gsw@10110000 { + compatible = "mediatek,mt7620-gsw"; + reg = <0x10110000 8000>; + + resets = <&rstctrl 23>; + reset-names = "gsw"; + + interrupt-parent = <&intc>; + interrupts = <17>; +}; diff --git a/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt b/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt index 692076fda..f9c32adab 100644 --- a/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt +++ b/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt @@ -1,8 +1,9 @@ Micrel KSZ9021/KSZ9031 Gigabit Ethernet PHY -Some boards require special tuning values, particularly when it comes to -clock delays. You can specify clock delay values by adding -micrel-specific properties to an Ethernet OF device node. +Some boards require special tuning values, particularly when it comes +to clock delays. You can specify clock delay values in the PHY OF +device node. Deprecated, but still supported, these properties can +also be added to an Ethernet OF device node.
Note that these settings are applied after any phy-specific fixup from phy_fixup_list (see phy_init_hw() from drivers/net/phy/phy_device.c), @@ -57,16 +58,6 @@ KSZ9031: Examples: - /* Attach to an Ethernet device with autodetected PHY */ - &enet { - rxc-skew-ps = <3000>; - rxdv-skew-ps = <0>; - txc-skew-ps = <3000>; - txen-skew-ps = <0>; - status = "okay"; - }; - - /* Attach to an explicitly-specified PHY */ mdio { phy0: ethernet-phy@0 { rxc-skew-ps = <3000>; diff --git a/Documentation/devicetree/bindings/net/nfc/st95hf.txt b/Documentation/devicetree/bindings/net/nfc/st95hf.txt new file mode 100644 index 000000000..ea3178bc9 --- /dev/null +++ b/Documentation/devicetree/bindings/net/nfc/st95hf.txt @@ -0,0 +1,50 @@ +* STMicroelectronics : NFC Transceiver ST95HF + +ST NFC Transceiver is required to attach with SPI bus. +ST95HF node should be defined in DT as SPI slave device of SPI +master with which ST95HF transceiver is physically connected. +The properties defined below are required to be the part of DT +to include ST95HF transceiver into the platform. + +Required properties: +=================== +- reg: Address of SPI slave "ST95HF transceiver" on SPI master bus. + +- compatible: should be "st,st95hf" for ST95HF NFC transceiver + +- spi-max-frequency: Max. operating SPI frequency for ST95HF + transceiver. + +- enable-gpio: GPIO line to enable ST95HF transceiver. + +- interrupt-parent : Standard way to specify the controller to which + ST95HF transceiver's interrupt is routed. + +- interrupts : Standard way to define ST95HF transceiver's out + interrupt. + +Optional property: +================= +- st95hfvin-supply : This is an optional property. It contains a + phandle to ST95HF transceiver's regulator supply node in DT. 
+ +Example: +======= +spi@9840000 { + reg = <0x9840000 0x110>; + #address-cells = <1>; + #size-cells = <0>; + cs-gpios = <&pio0 4>; + status = "okay"; + + st95hf@0{ + reg = <0>; + compatible = "st,st95hf"; + status = "okay"; + spi-max-frequency = <1000000>; + enable-gpio = <&pio4 0>; + interrupt-parent = <&pio0>; + interrupts = <7 IRQ_TYPE_EDGE_FALLING>; + }; + +}; diff --git a/Documentation/devicetree/bindings/net/phy.txt b/Documentation/devicetree/bindings/net/phy.txt index 525e1658f..bc1c3c8bf 100644 --- a/Documentation/devicetree/bindings/net/phy.txt +++ b/Documentation/devicetree/bindings/net/phy.txt @@ -17,8 +17,7 @@ Optional Properties: "ethernet-phy-ieee802.3-c22" or "ethernet-phy-ieee802.3-c45" for PHYs that implement IEEE802.3 clause 22 or IEEE802.3 clause 45 specifications. If neither of these are specified, the default is to - assume clause 22. The compatible list may also contain other - elements. + assume clause 22. If the phy's identifier is known then the list may contain an entry of the form: "ethernet-phy-idAAAA.BBBB" where @@ -28,6 +27,9 @@ Optional Properties: 4 hex digits. This is the chip vendor OUI bits 19:24, followed by 10 bits of a vendor specific ID. + The compatible list should not contain other values than those + listed here. + - max-speed: Maximum PHY supported speed (10, 100, 1000...) - broken-turn-around: If set, indicates the PHY device does not correctly diff --git a/Documentation/devicetree/bindings/net/ralink,rt2880-net.txt b/Documentation/devicetree/bindings/net/ralink,rt2880-net.txt new file mode 100644 index 000000000..88b095d1f --- /dev/null +++ b/Documentation/devicetree/bindings/net/ralink,rt2880-net.txt @@ -0,0 +1,61 @@ +Ralink Frame Engine Ethernet controller +======================================= + +The Ralink frame engine ethernet controller can be found on Ralink and +Mediatek SoCs (RT288x, RT3x5x, RT366x, RT388x, rt5350, mt7620, mt7621, mt76x8). 
+ +Depending on the SoC, there is a number of ports connected to the CPU port +directly and/or via a (gigabit-)switch. + +* Ethernet controller node + +Required properties: +- compatible: Should be one of "ralink,rt2880-eth", "ralink,rt3050-eth", + "ralink,rt3050-eth", "ralink,rt3883-eth", "ralink,rt5350-eth", + "mediatek,mt7620-eth", "mediatek,mt7621-eth" +- reg: Address and length of the register set for the device +- interrupt-parent: Should be the phandle for the interrupt controller + that services interrupts for this device +- interrupts: Should contain the frame engines interrupt +- resets: Should contain the frame engines resets +- reset-names: Should contain the reset names "fe". If a switch is present + "esw" is also required. + + +* Ethernet port node + +Required properties: +- compatible: Should be "ralink,eth-port" +- reg: The number of the physical port +- phy-handle: reference to the node describing the phy + +Example: + +mdio-bus { + ... + phy0: ethernet-phy@0 { + phy-mode = "mii"; + reg = <0>; + }; +}; + +ethernet@400000 { + compatible = "ralink,rt2880-eth"; + reg = <0x00400000 10000>; + + #address-cells = <1>; + #size-cells = <0>; + + resets = <&rstctrl 18>; + reset-names = "fe"; + + interrupt-parent = <&cpuintc>; + interrupts = <5>; + + port@0 { + compatible = "ralink,eth-port"; + reg = <0>; + phy-handle = <&phy0>; + }; + +}; diff --git a/Documentation/devicetree/bindings/net/ralink,rt3050-esw.txt b/Documentation/devicetree/bindings/net/ralink,rt3050-esw.txt new file mode 100644 index 000000000..2e79bd376 --- /dev/null +++ b/Documentation/devicetree/bindings/net/ralink,rt3050-esw.txt @@ -0,0 +1,32 @@ +Ralink Fast Ethernet Embedded Switch +==================================== + +The ralink fast ethernet embedded switch can be found on Ralink and Mediatek +SoCs (RT3x5x, RT5350, MT76x8). 
+ +Required properties: +- compatible: Should be "ralink,rt3050-esw" +- reg: Address and length of the register set for the device +- interrupt-parent: Should be the phandle for the interrupt controller + that services interrupts for this device +- interrupts: Should contain the embedded switches interrupt +- resets: Should contain the embedded switches resets +- reset-names: Should contain the reset names "esw" + +Optional properties: +- ralink,portmap: can be used to choose if the default switch setup is + llllw or wllll +- ralink,led_polarity: override the active high/low settings of the leds + +Example: + +esw@10110000 { + compatible = "ralink,rt3050-esw"; + reg = <0x10110000 8000>; + + resets = <&rstctrl 23>; + reset-names = "esw"; + + interrupt-parent = <&intc>; + interrupts = <17>; +}; diff --git a/Documentation/devicetree/bindings/net/renesas,ravb.txt b/Documentation/devicetree/bindings/net/renesas,ravb.txt index b486f3f5f..c8ac222ea 100644 --- a/Documentation/devicetree/bindings/net/renesas,ravb.txt +++ b/Documentation/devicetree/bindings/net/renesas,ravb.txt @@ -5,8 +5,18 @@ interface contains. Required properties: - compatible: "renesas,etheravb-r8a7790" if the device is a part of R8A7790 SoC. + "renesas,etheravb-r8a7791" if the device is a part of R8A7791 SoC. + "renesas,etheravb-r8a7792" if the device is a part of R8A7792 SoC. + "renesas,etheravb-r8a7793" if the device is a part of R8A7793 SoC. "renesas,etheravb-r8a7794" if the device is a part of R8A7794 SoC. "renesas,etheravb-r8a7795" if the device is a part of R8A7795 SoC. + "renesas,etheravb-rcar-gen2" for generic R-Car Gen 2 compatible interface. + "renesas,etheravb-rcar-gen3" for generic R-Car Gen 3 compatible interface. + + When compatible with the generic version, nodes must list the + SoC-specific version corresponding to the platform first + followed by the generic version. + - reg: offset and length of (1) the register block and (2) the stream buffer. 
- interrupts: A list of interrupt-specifiers, one for each entry in interrupt-names. @@ -37,7 +47,7 @@ Optional properties: Example: ethernet@e6800000 { - compatible = "renesas,etheravb-r8a7795"; + compatible = "renesas,etheravb-r8a7795", "renesas,etheravb-rcar-gen3"; reg = <0 0xe6800000 0 0x800>, <0 0xe6a00000 0 0x10000>; interrupt-parent = <&gic>; interrupts = , @@ -72,8 +82,8 @@ Example: "ch16", "ch17", "ch18", "ch19", "ch20", "ch21", "ch22", "ch23", "ch24"; - clocks = <&mstp8_clks R8A7795_CLK_ETHERAVB>; - power-domains = <&cpg_clocks>; + clocks = <&cpg CPG_MOD 812>; + power-domains = <&cpg>; phy-mode = "rgmii-id"; phy-handle = <&phy0>; diff --git a/Documentation/devicetree/bindings/net/socfpga-dwmac.txt b/Documentation/devicetree/bindings/net/socfpga-dwmac.txt index 3a9d67951..72d82d684 100644 --- a/Documentation/devicetree/bindings/net/socfpga-dwmac.txt +++ b/Documentation/devicetree/bindings/net/socfpga-dwmac.txt @@ -11,6 +11,8 @@ Required properties: designware version numbers documented in stmmac.txt - altr,sysmgr-syscon : Should be the phandle to the system manager node that encompasses the glue register, the register offset, and the register shift. + - altr,f2h_ptp_ref_clk use f2h_ptp_ref_clk instead of default eosc1 clock + for ptp ref clk. This affects all emacs as the clock is common. 
Optional properties: altr,emac-splitter: Should be the phandle to the emac splitter soft IP node if diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt index f34fc3c81..e862a922b 100644 --- a/Documentation/devicetree/bindings/net/stmmac.txt +++ b/Documentation/devicetree/bindings/net/stmmac.txt @@ -35,18 +35,18 @@ Optional properties: - reset-names: Should contain the reset signal name "stmmaceth", if a reset phandle is given - max-frame-size: See ethernet.txt file in the same directory -- clocks: If present, the first clock should be the GMAC main clock and - the second clock should be peripheral's register interface clock. Further - clocks may be specified in derived bindings. -- clock-names: One name for each entry in the clocks property, the - first one should be "stmmaceth" and the second one should be "pclk". -- clk_ptp_ref: this is the PTP reference clock; in case of the PTP is - available this clock is used for programming the Timestamp Addend Register. - If not passed then the system clock will be used and this is fine on some - platforms. +- clocks: If present, the first clock should be the GMAC main clock + The optional second clock should be peripheral's register interface clock. + The third optional clock should be the ptp reference clock. + Further clocks may be specified in derived bindings. +- clock-names: One name for each entry in the clocks property. + The first one should be "stmmaceth". + The optional second one should be "pclk". + The optional third one should be "clk_ptp_ref". - snps,burst_len: The AXI burst lenth value of the AXI BUS MODE register. - tx-fifo-depth: See ethernet.txt file in the same directory - rx-fifo-depth: See ethernet.txt file in the same directory +- mdio: with compatible = "snps,dwmac-mdio", create and register mdio bus. 
Examples: @@ -65,4 +65,11 @@ Examples: tx-fifo-depth = <16384>; clocks = <&clock>; clock-names = "stmmaceth"; + mdio0 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "snps,dwmac-mdio"; + phy1: ethernet-phy@0 { + }; + }; }; diff --git a/Documentation/devicetree/bindings/opp/opp.txt b/Documentation/devicetree/bindings/opp/opp.txt index 0cb44dc21..601256fe8 100644 --- a/Documentation/devicetree/bindings/opp/opp.txt +++ b/Documentation/devicetree/bindings/opp/opp.txt @@ -45,21 +45,10 @@ Devices supporting OPPs must set their "operating-points-v2" property with phandle to a OPP table in their DT node. The OPP core will use this phandle to find the operating points for the device. -Devices may want to choose OPP tables at runtime and so can provide a list of -phandles here. But only *one* of them should be chosen at runtime. This must be -accompanied by a corresponding "operating-points-names" property, to uniquely -identify the OPP tables. - If required, this can be extended for SoC vendor specfic bindings. Such bindings should be documented as Documentation/devicetree/bindings/power/<vendor>-opp.txt and should have a compatible description like: "operating-points-v2-<vendor>". -Optional properties: -- operating-points-names: Names of OPP tables (required if multiple OPP - tables are present), to uniquely identify them. The same list must be present - for all the CPUs which are sharing clock/voltage rails and hence the OPP - tables. - * OPP Table Node This describes the OPPs belonging to a device. This node can have following @@ -100,6 +89,14 @@ Optional properties: Entries for multiple regulators must be present in the same order as regulators are specified in device's DT node. +- opp-microvolt-<name>: Named opp-microvolt property. This is exactly similar to + the above opp-microvolt property, but allows multiple voltage ranges to be + provided for the same OPP. At runtime, the platform can pick a <name> and + matching opp-microvolt-<name> property will be enabled for all OPPs.
If the + platform doesn't pick a specific <name> or the <name> doesn't match with any + opp-microvolt-<name> properties, then opp-microvolt property shall be used, if + present. + - opp-microamp: The maximum current drawn by the device in microamperes considering system specific parameters (such as transients, process, aging, maximum operating temperature range etc.) as necessary. This may be used to @@ -112,6 +109,9 @@ Optional properties: for few regulators, then this should be marked as zero for them. If it isn't required for any regulator, then this property need not be present. +- opp-microamp-<name>: Named opp-microamp property. Similar to + opp-microvolt-<name> property, but for microamp instead. + - clock-latency-ns: Specifies the maximum possible transition latency (in nanoseconds) for switching to this OPP from any other OPP. @@ -123,6 +123,26 @@ Optional properties: - opp-suspend: Marks the OPP to be used during device suspend. Only one OPP in the table should have this. +- opp-supported-hw: This enables us to select only a subset of OPPs from the + larger OPP table, based on what version of the hardware we are running on. We + still can't have multiple nodes with the same opp-hz value in OPP table. + + It's a user-defined array containing a hierarchy of hardware version numbers, + supported by the OPP. For example: a platform with hierarchy of three levels + of versions (A, B and C), this field should be like <X Y Z>, where X + corresponds to version hierarchy A, Y corresponds to version hierarchy B and Z + corresponds to version hierarchy C. + + Each level of hierarchy is represented by a 32 bit value, and so there can be + only 32 different supported versions per hierarchy. i.e. 1 bit per version. A + value of 0xFFFFFFFF will enable the OPP for all versions for that hierarchy + level. And a value of 0x00000000 will disable the OPP completely, and so we + never want that to happen.
+ + If 32 values aren't sufficient for a version hierarchy, then that version + hierarchy can be contained in multiple 32 bit values. i.e. <X Y Z1 Z2> in the + above example, Z1 & Z2 refer to the version hierarchy Z. + - status: Marks the node enabled/disabled. Example 1: Single cluster Dual-core ARM cortex A9, switch DVFS states together. @@ -157,20 +177,20 @@ Example 1: Single cluster Dual-core ARM cortex A9, switch DVFS states together. compatible = "operating-points-v2"; opp-shared; - opp00 { + opp@1000000000 { opp-hz = /bits/ 64 <1000000000>; opp-microvolt = <970000 975000 985000>; opp-microamp = <70000>; clock-latency-ns = <300000>; opp-suspend; }; - opp01 { + opp@1100000000 { opp-hz = /bits/ 64 <1100000000>; opp-microvolt = <980000 1000000 1010000>; opp-microamp = <80000>; clock-latency-ns = <310000>; }; - opp02 { + opp@1200000000 { opp-hz = /bits/ 64 <1200000000>; opp-microvolt = <1025000>; clock-latency-ns = <290000>; @@ -236,20 +256,20 @@ independently. * independently. */ - opp00 { + opp@1000000000 { opp-hz = /bits/ 64 <1000000000>; opp-microvolt = <970000 975000 985000>; opp-microamp = <70000>; clock-latency-ns = <300000>; opp-suspend; }; - opp01 { + opp@1100000000 { opp-hz = /bits/ 64 <1100000000>; opp-microvolt = <980000 1000000 1010000>; opp-microamp = <80000>; clock-latency-ns = <310000>; }; - opp02 { + opp@1200000000 { opp-hz = /bits/ 64 <1200000000>; opp-microvolt = <1025000>; opp-microamp = <90000; @@ -312,20 +332,20 @@ DVFS state together.
compatible = "operating-points-v2"; opp-shared; - opp00 { + opp@1000000000 { opp-hz = /bits/ 64 <1000000000>; opp-microvolt = <970000 975000 985000>; opp-microamp = <70000>; clock-latency-ns = <300000>; opp-suspend; }; - opp01 { + opp@1100000000 { opp-hz = /bits/ 64 <1100000000>; opp-microvolt = <980000 1000000 1010000>; opp-microamp = <80000>; clock-latency-ns = <310000>; }; - opp02 { + opp@1200000000 { opp-hz = /bits/ 64 <1200000000>; opp-microvolt = <1025000>; opp-microamp = <90000>; @@ -338,20 +358,20 @@ DVFS state together. compatible = "operating-points-v2"; opp-shared; - opp10 { + opp@1300000000 { opp-hz = /bits/ 64 <1300000000>; opp-microvolt = <1045000 1050000 1055000>; opp-microamp = <95000>; clock-latency-ns = <400000>; opp-suspend; }; - opp11 { + opp@1400000000 { opp-hz = /bits/ 64 <1400000000>; opp-microvolt = <1075000>; opp-microamp = <100000>; clock-latency-ns = <400000>; }; - opp12 { + opp@1500000000 { opp-hz = /bits/ 64 <1500000000>; opp-microvolt = <1010000 1100000 1110000>; opp-microamp = <95000>; @@ -378,7 +398,7 @@ Example 4: Handling multiple regulators compatible = "operating-points-v2"; opp-shared; - opp00 { + opp@1000000000 { opp-hz = /bits/ 64 <1000000000>; opp-microvolt = <970000>, /* Supply 0 */ <960000>, /* Supply 1 */ @@ -391,7 +411,7 @@ Example 4: Handling multiple regulators /* OR */ - opp00 { + opp@1000000000 { opp-hz = /bits/ 64 <1000000000>; opp-microvolt = <970000 975000 985000>, /* Supply 0 */ <960000 965000 975000>, /* Supply 1 */ @@ -404,7 +424,7 @@ Example 4: Handling multiple regulators /* OR */ - opp00 { + opp@1000000000 { opp-hz = /bits/ 64 <1000000000>; opp-microvolt = <970000 975000 985000>, /* Supply 0 */ <960000 965000 975000>, /* Supply 1 */ @@ -417,7 +437,8 @@ Example 4: Handling multiple regulators }; }; -Example 5: Multiple OPP tables +Example 5: opp-supported-hw +(example: three level hierarchy of versions: cuts, substrate and process) / { cpus { @@ -426,40 +447,73 @@ Example 5: Multiple OPP tables ... 
cpu-supply = <&cpu_supply> - operating-points-v2 = <&cpu0_opp_table_slow>, <&cpu0_opp_table_fast>; - operating-points-names = "slow", "fast"; + operating-points-v2 = <&cpu0_opp_table_slow>; }; }; - cpu0_opp_table_slow: opp_table_slow { + opp_table { compatible = "operating-points-v2"; status = "okay"; opp-shared; - opp00 { + opp@600000000 { + /* + * Supports all substrate and process versions for 0xF + * cuts, i.e. only first four cuts. + */ + opp-supported-hw = <0xF 0xFFFFFFFF 0xFFFFFFFF>; opp-hz = /bits/ 64 <600000000>; + opp-microvolt = <900000 915000 925000>; ... }; - opp01 { + opp@800000000 { + /* + * Supports: + * - cuts: only one, 6th cut (represented by 6th bit). + * - substrate: supports 16 different substrate versions + * - process: supports 9 different process versions + */ + opp-supported-hw = <0x20 0xff0000ff 0x0000f4f0>; opp-hz = /bits/ 64 <800000000>; + opp-microvolt = <900000 915000 925000>; ... }; }; +}; + +Example 6: opp-microvolt-<name>, opp-microamp-<name>: +(example: device with two possible microvolt ranges: slow and fast) - cpu0_opp_table_fast: opp_table_fast { +/ { + cpus { + cpu@0 { + compatible = "arm,cortex-a7"; + ... + + operating-points-v2 = <&cpu0_opp_table>; + }; + }; + + cpu0_opp_table: opp_table0 { compatible = "operating-points-v2"; - status = "okay"; opp-shared; - opp10 { + opp@1000000000 { opp-hz = /bits/ 64 <1000000000>; - ... + opp-microvolt-slow = <900000 915000 925000>; + opp-microvolt-fast = <970000 975000 985000>; + opp-microamp-slow = <70000>; + opp-microamp-fast = <71000>; }; - opp11 { - opp-hz = /bits/ 64 <1100000000>; - ...
+ opp@1200000000 { + opp-hz = /bits/ 64 <1200000000>; + opp-microvolt-slow = <900000 915000 925000>, /* Supply vcc0 */ + <910000 925000 935000>; /* Supply vcc1 */ + opp-microvolt-fast = <970000 975000 985000>, /* Supply vcc0 */ + <960000 965000 975000>; /* Supply vcc1 */ + opp-microamp = <70000>; /* Will be used for both slow/fast */ }; }; }; diff --git a/Documentation/devicetree/bindings/pci/brcm,iproc-pcie.txt b/Documentation/devicetree/bindings/pci/brcm,iproc-pcie.txt index 45c2a8094..01b88f4e0 100644 --- a/Documentation/devicetree/bindings/pci/brcm,iproc-pcie.txt +++ b/Documentation/devicetree/bindings/pci/brcm,iproc-pcie.txt @@ -1,7 +1,10 @@ * Broadcom iProc PCIe controller with the platform bus interface Required properties: -- compatible: Must be "brcm,iproc-pcie" +- compatible: Must be "brcm,iproc-pcie" for PAXB, or "brcm,iproc-pcie-paxc" + for PAXC. PAXB-based root complex is used for external endpoint devices. + PAXC-based root complex is connected to emulated endpoint devices + internal to the ASIC - reg: base address and length of the PCIe controller I/O register space - #interrupt-cells: set to <1> - interrupt-map-mask and interrupt-map, standard PCI properties to define the @@ -32,6 +35,28 @@ Optional: - brcm,pcie-ob-oarr-size: Some iProc SoCs need the OARR size bit to be set to increase the outbound window size +MSI support (optional): + +For older platforms without MSI integrated in the GIC, iProc PCIe core provides +an event queue based MSI support. The iProc MSI uses host memories to store +MSI posted writes in the event queues + +- msi-parent: Link to the device node of the MSI controller. On newer iProc +platforms, the MSI controller may be gicv2m or gicv3-its. On older iProc +platforms without MSI support in its interrupt controller, one may use the +event queue based MSI support integrated within the iProc PCIe core. 
+ +When the iProc event queue based MSI is used, one needs to define the +following properties in the MSI device node: +- compatible: Must be "brcm,iproc-msi" +- msi-controller: claims itself as an MSI controller +- interrupt-parent: Link to its parent interrupt device +- interrupts: List of interrupt IDs from its parent interrupt device + +Optional properties: +- brcm,pcie-msi-inten: Needs to be present for some older iProc platforms that +require the interrupt enable registers to be set explicitly to enable MSI + Example: pcie0: pcie@18012000 { compatible = "brcm,iproc-pcie"; @@ -58,6 +83,19 @@ Example: brcm,pcie-ob-oarr-size; brcm,pcie-ob-axi-offset = <0x00000000>; brcm,pcie-ob-window-size = <256>; + + msi-parent = <&msi0>; + + /* iProc event queue based MSI */ + msi0: msi@18012000 { + compatible = "brcm,iproc-msi"; + msi-controller; + interrupt-parent = <&gic>; + interrupts = , + , + , + , + }; }; pcie1: pcie@18013000 { diff --git a/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt b/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt index 17c6ed9c6..b721beacf 100644 --- a/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt +++ b/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt @@ -1,4 +1,4 @@ -HiSilicon PCIe host bridge DT description +HiSilicon Hip05 and Hip06 PCIe host bridge DT description HiSilicon PCIe host controller is based on Designware PCI core. It shares common functions with PCIe Designware core driver and inherits @@ -7,8 +7,8 @@ Documentation/devicetree/bindings/pci/designware-pci.txt. Additional properties are described here: -Required properties: -- compatible: Should contain "hisilicon,hip05-pcie". +Required properties +- compatible: Should contain "hisilicon,hip05-pcie" or "hisilicon,hip06-pcie". - reg: Should contain rc_dbi, config registers location and length. 
- reg-names: Must include the following entries: "rc_dbi": controller configuration registers; @@ -20,7 +20,7 @@ Optional properties: - status: Either "ok" or "disabled". - dma-coherent: Present if DMA operations are coherent. -Example: +Hip05 Example (note that Hip06 is the same except compatible): pcie@0xb0080000 { compatible = "hisilicon,hip05-pcie", "snps,dw-pcie"; reg = <0 0xb0080000 0 0x10000>, <0x220 0x00000000 0 0x2000>; diff --git a/Documentation/devicetree/bindings/pci/pci-rcar-gen2.txt b/Documentation/devicetree/bindings/pci/pci-rcar-gen2.txt index 7fab84b33..07a75094c 100644 --- a/Documentation/devicetree/bindings/pci/pci-rcar-gen2.txt +++ b/Documentation/devicetree/bindings/pci/pci-rcar-gen2.txt @@ -8,7 +8,15 @@ OHCI and EHCI controllers. Required properties: - compatible: "renesas,pci-r8a7790" for the R8A7790 SoC; "renesas,pci-r8a7791" for the R8A7791 SoC; - "renesas,pci-r8a7794" for the R8A7794 SoC. + "renesas,pci-r8a7793" for the R8A7793 SoC; + "renesas,pci-r8a7794" for the R8A7794 SoC; + "renesas,pci-rcar-gen2" for a generic R-Car Gen2 compatible device + + + When compatible with the generic version, nodes must list the + SoC-specific version corresponding to the platform first + followed by the generic version. + - reg: A list of physical regions to access the device: the first is the operational registers for the OHCI/EHCI controllers and the second is for the bridge configuration and control registers. @@ -24,10 +32,15 @@ Required properties: - interrupt-map-mask: standard property that helps to define the interrupt mapping. +Optional properties: +- dma-ranges: a single range for the inbound memory region. If not supplied, + defaults to 1GiB at 0x40000000. Note there are hardware restrictions on the + allowed combinations of address and size. 
+ Example SoC configuration: pci0: pci@ee090000 { - compatible = "renesas,pci-r8a7790"; + compatible = "renesas,pci-r8a7790", "renesas,pci-rcar-gen2"; clocks = <&mstp7_clks R8A7790_CLK_EHCI>; reg = <0x0 0xee090000 0x0 0xc00>, <0x0 0xee080000 0x0 0x1100>; @@ -38,6 +51,7 @@ Example SoC configuration: #address-cells = <3>; #size-cells = <2>; #interrupt-cells = <1>; + dma-ranges = <0x42000000 0 0x40000000 0 0x40000000 0 0x40000000>; interrupt-map-mask = <0xff00 0 0 0x7>; interrupt-map = <0x0000 0 0 1 &gic 0 108 IRQ_TYPE_LEVEL_HIGH 0x0800 0 0 1 &gic 0 108 IRQ_TYPE_LEVEL_HIGH diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie.txt b/Documentation/devicetree/bindings/pci/qcom,pcie.txt new file mode 100644 index 000000000..4059a6f89 --- /dev/null +++ b/Documentation/devicetree/bindings/pci/qcom,pcie.txt @@ -0,0 +1,233 @@ +* Qualcomm PCI express root complex + +- compatible: + Usage: required + Value type: + Definition: Value should contain + - "qcom,pcie-ipq8064" for ipq8064 + - "qcom,pcie-apq8064" for apq8064 + - "qcom,pcie-apq8084" for apq8084 + +- reg: + Usage: required + Value type: + Definition: Register ranges as listed in the reg-names property + +- reg-names: + Usage: required + Value type: + Definition: Must include the following entries + - "parf" Qualcomm specific registers + - "dbi" Designware PCIe registers + - "elbi" External local bus interface registers + - "config" PCIe configuration space + +- device_type: + Usage: required + Value type: + Definition: Should be "pci". As specified in designware-pcie.txt + +- #address-cells: + Usage: required + Value type: + Definition: Should be 3. As specified in designware-pcie.txt + +- #size-cells: + Usage: required + Value type: + Definition: Should be 2. 
As specified in designware-pcie.txt + +- ranges: + Usage: required + Value type: + Definition: As specified in designware-pcie.txt + +- interrupts: + Usage: required + Value type: + Definition: MSI interrupt + +- interrupt-names: + Usage: required + Value type: + Definition: Should contain "msi" + +- #interrupt-cells: + Usage: required + Value type: + Definition: Should be 1. As specified in designware-pcie.txt + +- interrupt-map-mask: + Usage: required + Value type: + Definition: As specified in designware-pcie.txt + +- interrupt-map: + Usage: required + Value type: + Definition: As specified in designware-pcie.txt + +- clocks: + Usage: required + Value type: + Definition: List of phandle and clock specifier pairs as listed + in clock-names property + +- clock-names: + Usage: required + Value type: + Definition: Should contain the following entries + - "iface" Configuration AHB clock + +- clock-names: + Usage: required for ipq/apq8064 + Value type: + Definition: Should contain the following entries + - "core" Clocks the pcie hw block + - "phy" Clocks the pcie PHY block +- clock-names: + Usage: required for apq8084 + Value type: + Definition: Should contain the following entries + - "aux" Auxiliary (AUX) clock + - "bus_master" Master AXI clock + - "bus_slave" Slave AXI clock +- resets: + Usage: required + Value type: + Definition: List of phandle and reset specifier pairs as listed + in reset-names property + +- reset-names: + Usage: required for ipq/apq8064 + Value type: + Definition: Should contain the following entries + - "axi" AXI reset + - "ahb" AHB reset + - "por" POR reset + - "pci" PCI reset + - "phy" PHY reset + +- reset-names: + Usage: required for apq8084 + Value type: + Definition: Should contain the following entries + - "core" Core reset + +- power-domains: + Usage: required for apq8084 + Value type: + Definition: A phandle and power domain specifier pair to the + power domain which is responsible for collapsing + and restoring power to the 
peripheral + +- vdda-supply: + Usage: required + Value type: + Definition: A phandle to the core analog power supply + +- vdda_phy-supply: + Usage: required for ipq/apq8064 + Value type: + Definition: A phandle to the analog power supply for PHY + +- vdda_refclk-supply: + Usage: required for ipq/apq8064 + Value type: + Definition: A phandle to the analog power supply for IC which generates + reference clock + +- phys: + Usage: required for apq8084 + Value type: + Definition: List of phandle(s) as listed in phy-names property + +- phy-names: + Usage: required for apq8084 + Value type: + Definition: Should contain "pciephy" + +- -gpios: + Usage: optional + Value type: + Definition: List of phandle and gpio specifier pairs. Should contain + - "perst-gpios" PCIe endpoint reset signal line + - "wake-gpios" PCIe endpoint wake signal line + +* Example for ipq/apq8064 + pcie@1b500000 { + compatible = "qcom,pcie-apq8064", "qcom,pcie-ipq8064", "snps,dw-pcie"; + reg = <0x1b500000 0x1000 + 0x1b502000 0x80 + 0x1b600000 0x100 + 0x0ff00000 0x100000>; + reg-names = "dbi", "elbi", "parf", "config"; + device_type = "pci"; + linux,pci-domain = <0>; + bus-range = <0x00 0xff>; + num-lanes = <1>; + #address-cells = <3>; + #size-cells = <2>; + ranges = <0x81000000 0 0 0x0fe00000 0 0x00100000 /* I/O */ + 0x82000000 0 0 0x08000000 0 0x07e00000>; /* memory */ + interrupts = ; + interrupt-names = "msi"; + #interrupt-cells = <1>; + interrupt-map-mask = <0 0 0 0x7>; + interrupt-map = <0 0 0 1 &intc 0 36 IRQ_TYPE_LEVEL_HIGH>, /* int_a */ + <0 0 0 2 &intc 0 37 IRQ_TYPE_LEVEL_HIGH>, /* int_b */ + <0 0 0 3 &intc 0 38 IRQ_TYPE_LEVEL_HIGH>, /* int_c */ + <0 0 0 4 &intc 0 39 IRQ_TYPE_LEVEL_HIGH>; /* int_d */ + clocks = <&gcc PCIE_A_CLK>, + <&gcc PCIE_H_CLK>, + <&gcc PCIE_PHY_CLK>; + clock-names = "core", "iface", "phy"; + resets = <&gcc PCIE_ACLK_RESET>, + <&gcc PCIE_HCLK_RESET>, + <&gcc PCIE_POR_RESET>, + <&gcc PCIE_PCI_RESET>, + <&gcc PCIE_PHY_RESET>; + reset-names = "axi", "ahb", "por", "pci", 
"phy"; + pinctrl-0 = <&pcie_pins_default>; + pinctrl-names = "default"; + }; + +* Example for apq8084 + pcie0@fc520000 { + compatible = "qcom,pcie-apq8084", "snps,dw-pcie"; + reg = <0xfc520000 0x2000>, + <0xff000000 0x1000>, + <0xff001000 0x1000>, + <0xff002000 0x2000>; + reg-names = "parf", "dbi", "elbi", "config"; + device_type = "pci"; + linux,pci-domain = <0>; + bus-range = <0x00 0xff>; + num-lanes = <1>; + #address-cells = <3>; + #size-cells = <2>; + ranges = <0x81000000 0 0 0xff200000 0 0x00100000 /* I/O */ + 0x82000000 0 0x00300000 0xff300000 0 0x00d00000>; /* memory */ + interrupts = ; + interrupt-names = "msi"; + #interrupt-cells = <1>; + interrupt-map-mask = <0 0 0 0x7>; + interrupt-map = <0 0 0 1 &intc 0 244 IRQ_TYPE_LEVEL_HIGH>, /* int_a */ + <0 0 0 2 &intc 0 245 IRQ_TYPE_LEVEL_HIGH>, /* int_b */ + <0 0 0 3 &intc 0 247 IRQ_TYPE_LEVEL_HIGH>, /* int_c */ + <0 0 0 4 &intc 0 248 IRQ_TYPE_LEVEL_HIGH>; /* int_d */ + clocks = <&gcc GCC_PCIE_0_CFG_AHB_CLK>, + <&gcc GCC_PCIE_0_MSTR_AXI_CLK>, + <&gcc GCC_PCIE_0_SLV_AXI_CLK>, + <&gcc GCC_PCIE_0_AUX_CLK>; + clock-names = "iface", "master_bus", "slave_bus", "aux"; + resets = <&gcc GCC_PCIE_0_BCR>; + reset-names = "core"; + power-domains = <&gcc PCIE0_GDSC>; + vdda-supply = <&pma8084_l3>; + phys = <&pciephy0>; + phy-names = "pciephy"; + perst-gpio = <&tlmm 70 GPIO_ACTIVE_LOW>; + pinctrl-0 = <&pcie0_pins_default>; + pinctrl-names = "default"; + }; diff --git a/Documentation/devicetree/bindings/pci/rcar-pci.txt b/Documentation/devicetree/bindings/pci/rcar-pci.txt index 29d3b989d..6cf99690e 100644 --- a/Documentation/devicetree/bindings/pci/rcar-pci.txt +++ b/Documentation/devicetree/bindings/pci/rcar-pci.txt @@ -1,8 +1,17 @@ * Renesas RCar PCIe interface Required properties: -- compatible: should contain one of the following - "renesas,pcie-r8a7779", "renesas,pcie-r8a7790", "renesas,pcie-r8a7791" +compatible: "renesas,pcie-r8a7779" for the R8A7779 SoC; + "renesas,pcie-r8a7790" for the R8A7790 SoC; + 
"renesas,pcie-r8a7791" for the R8A7791 SoC; + "renesas,pcie-r8a7793" for the R8A7793 SoC; + "renesas,pcie-r8a7795" for the R8A7795 SoC; + "renesas,pcie-rcar-gen2" for a generic R-Car Gen2 compatible device. + + When compatible with the generic version, nodes must list the + SoC-specific version corresponding to the platform first + followed by the generic version. + - reg: base address and length of the pcie controller registers. - #address-cells: set to <3> - #size-cells: set to <2> @@ -25,7 +34,7 @@ Example: SoC specific DT Entry: pcie: pcie@fe000000 { - compatible = "renesas,pcie-r8a7791"; + compatible = "renesas,pcie-r8a7791", "renesas,pcie-rcar-gen2"; reg = <0 0xfe000000 0 0x80000>; #address-cells = <3>; #size-cells = <2>; diff --git a/Documentation/devicetree/bindings/phy/brcm,brcmstb-sata-phy.txt b/Documentation/devicetree/bindings/phy/brcm,brcmstb-sata-phy.txt index 7f81ef901..d87ab7c12 100644 --- a/Documentation/devicetree/bindings/phy/brcm,brcmstb-sata-phy.txt +++ b/Documentation/devicetree/bindings/phy/brcm,brcmstb-sata-phy.txt @@ -2,6 +2,7 @@ Required properties: - compatible: should be one or more of + "brcm,bcm7425-sata-phy" "brcm,bcm7445-sata-phy" "brcm,phy-sata3" - address-cells: should be 1 diff --git a/Documentation/devicetree/bindings/phy/phy-ath79-usb.txt b/Documentation/devicetree/bindings/phy/phy-ath79-usb.txt new file mode 100644 index 000000000..cafe2197d --- /dev/null +++ b/Documentation/devicetree/bindings/phy/phy-ath79-usb.txt @@ -0,0 +1,18 @@ +* Atheros AR71XX/9XXX USB PHY + +Required properties: +- compatible: "qca,ar7100-usb-phy" +- #phys-cells: should be 0 +- reset-names: "usb-phy"[, "usb-suspend-override"] +- resets: references to the reset controllers + +Example: + + usb-phy { + compatible = "qca,ar7100-usb-phy"; + + reset-names = "usb-phy", "usb-suspend-override"; + resets = <&rst 4>, <&rst 3>; + + #phy-cells = <0>; + }; diff --git a/Documentation/devicetree/bindings/phy/phy-hi6220-usb.txt 
b/Documentation/devicetree/bindings/phy/phy-hi6220-usb.txt new file mode 100644 index 000000000..f17a56e21 --- /dev/null +++ b/Documentation/devicetree/bindings/phy/phy-hi6220-usb.txt @@ -0,0 +1,16 @@ +Hisilicon hi6220 usb PHY +----------------------- + +Required properties: +- compatible: should be "hisilicon,hi6220-usb-phy" +- #phy-cells: must be 0 +- hisilicon,peripheral-syscon: phandle of syscon used to control phy. +Refer to phy/phy-bindings.txt for the generic PHY binding properties + +Example: + usb_phy: usbphy { + compatible = "hisilicon,hi6220-usb-phy"; + #phy-cells = <0>; + phy-supply = <&fixed_5v_hub>; + hisilicon,peripheral-syscon = <&sys_ctrl>; + }; diff --git a/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt b/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt new file mode 100644 index 000000000..2390e4e9c --- /dev/null +++ b/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt @@ -0,0 +1,39 @@ +* Renesas R-Car generation 3 USB 2.0 PHY + +This file provides information on what the device node for the R-Car generation +3 USB 2.0 PHY contains. + +Required properties: +- compatible: "renesas,usb2-phy-r8a7795" if the device is a part of an R8A7795 + SoC. +- reg: offset and length of the partial USB 2.0 Host register block. +- reg-names: must be "usb2_host". +- clocks: clock phandle and specifier pair(s). +- #phy-cells: see phy-bindings.txt in the same directory, must be <0>. + +Optional properties: +To use a USB channel where USB 2.0 Host and HSUSB (USB 2.0 Peripheral) are +combined, the device tree node should set HSUSB properties to reg and reg-names +properties. This is because HSUSB has registers to select USB 2.0 host or +peripheral at that channel: +- reg: offset and length of the partial HSUSB register block. +- reg-names: must be "hsusb". +- interrupts: interrupt specifier for the PHY. 
+ +Example (R-Car H3): + + usb-phy@ee080200 { + compatible = "renesas,usb2-phy-r8a7795"; + reg = <0 0xee080200 0 0x700>, <0 0xe6590100 0 0x100>; + reg-names = "usb2_host", "hsusb"; + interrupts = ; + clocks = <&mstp7_clks R8A7795_CLK_EHCI0>, + <&mstp7_clks R8A7795_CLK_HSUSB>; + }; + + usb-phy@ee0a0200 { + compatible = "renesas,usb2-phy-r8a7795"; + reg = <0 0xee0a0200 0 0x700>; + reg-names = "usb2_host"; + clocks = <&mstp7_clks R8A7795_CLK_EHCI0>; + }; diff --git a/Documentation/devicetree/bindings/phy/rockchip-usb-phy.txt b/Documentation/devicetree/bindings/phy/rockchip-usb-phy.txt index 826454ac4..68498d560 100644 --- a/Documentation/devicetree/bindings/phy/rockchip-usb-phy.txt +++ b/Documentation/devicetree/bindings/phy/rockchip-usb-phy.txt @@ -1,7 +1,10 @@ ROCKCHIP USB2 PHY Required properties: - - compatible: rockchip,rk3288-usb-phy + - compatible: matching the soc type, one of + "rockchip,rk3066a-usb-phy" + "rockchip,rk3188-usb-phy" + "rockchip,rk3288-usb-phy" - rockchip,grf : phandle to the syscon managing the "general register files" - #address-cells: should be 1 @@ -21,6 +24,7 @@ required properties: Optional Properties: - clocks : phandle + clock specifier for the phy clocks - clock-names: string, clock name, must be "phyclk" +- #clock-cells: for users of the phy-pll, should be 0 Example: diff --git a/Documentation/devicetree/bindings/phy/sun4i-usb-phy.txt b/Documentation/devicetree/bindings/phy/sun4i-usb-phy.txt index 0cebf7454..95736d77f 100644 --- a/Documentation/devicetree/bindings/phy/sun4i-usb-phy.txt +++ b/Documentation/devicetree/bindings/phy/sun4i-usb-phy.txt @@ -9,6 +9,7 @@ Required properties: * allwinner,sun7i-a20-usb-phy * allwinner,sun8i-a23-usb-phy * allwinner,sun8i-a33-usb-phy + * allwinner,sun8i-h3-usb-phy - reg : a list of offset + length pairs - reg-names : * "phy_ctrl" diff --git a/Documentation/devicetree/bindings/phy/ti-phy.txt b/Documentation/devicetree/bindings/phy/ti-phy.txt index 9cf9446ea..a3b394587 100644 --- 
a/Documentation/devicetree/bindings/phy/ti-phy.txt +++ b/Documentation/devicetree/bindings/phy/ti-phy.txt @@ -31,6 +31,8 @@ OMAP USB2 PHY Required properties: - compatible: Should be "ti,omap-usb2" + Should be "ti,dra7x-usb2-phy2" for the 2nd instance of USB2 PHY + in DRA7x - reg : Address and length of the register set for the device. - #phy-cells: determine the number of cells that should be given in the phandle while referencing this phy. @@ -40,10 +42,14 @@ Required properties: * "wkupclk" - wakeup clock. * "refclk" - reference clock (optional). -Optional properties: +Deprecated properties: - ctrl-module : phandle of the control module used by PHY driver to power on the PHY. +Recommended properties: +- syscon-phy-power : phandle/offset pair. Phandle to the system control + module and the register offset to power on/off the PHY. + This is usually a subnode of ocp2scp to which it is connected. usb2phy@4a0ad080 { @@ -77,14 +83,22 @@ Required properties: * "div-clk" - apll clock Optional properties: - - ctrl-module : phandle of the control module used by PHY driver to power on - the PHY. - id: If there are multiple instance of the same type, in order to differentiate between each instance "id" can be used (e.g., multi-lane PCIe PHY). If "id" is not provided, it is set to default value of '1'. - syscon-pllreset: Handle to system control region that contains the CTRL_CORE_SMA_SW_0 register and register offset to the CTRL_CORE_SMA_SW_0 register that contains the SATA_PLL_SOFT_RESET bit. Only valid for sata_phy. + - syscon-pcs : phandle/offset pair. Phandle to the system control module and the + register offset to write the PCS delay value. + +Deprecated properties: + - ctrl-module : phandle of the control module used by PHY driver to power on + the PHY. + +Recommended properties: + - syscon-phy-power : phandle/offset pair. Phandle to the system control + module and the register offset to power on/off the PHY. 
This is usually a subnode of ocp2scp to which it is connected. diff --git a/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt index b321b2678..9213b27e1 100644 --- a/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt @@ -17,7 +17,10 @@ Required properties: "allwinner,sun8i-a23-pinctrl" "allwinner,sun8i-a23-r-pinctrl" "allwinner,sun8i-a33-pinctrl" + "allwinner,sun9i-a80-pinctrl" + "allwinner,sun9i-a80-r-pinctrl" "allwinner,sun8i-a83t-pinctrl" + "allwinner,sun8i-h3-pinctrl" - reg: Should contain the register physical address and length for the pin controller. diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,cygnus-gpio.txt b/Documentation/devicetree/bindings/pinctrl/brcm,cygnus-gpio.txt deleted file mode 100644 index 16589fb6f..000000000 --- a/Documentation/devicetree/bindings/pinctrl/brcm,cygnus-gpio.txt +++ /dev/null @@ -1,104 +0,0 @@ -Broadcom Cygnus GPIO/PINCONF Controller - -Required properties: - -- compatible: - Must be "brcm,cygnus-ccm-gpio", "brcm,cygnus-asiu-gpio", - "brcm,cygnus-crmu-gpio" or "brcm,iproc-gpio" - -- reg: - Define the base and range of the I/O address space that contains the Cygnus -GPIO/PINCONF controller registers - -- #gpio-cells: - Must be two. The first cell is the GPIO pin number (within the -controller's pin space) and the second cell is used for the following: - bit[0]: polarity (0 for active high and 1 for active low) - -- gpio-controller: - Specifies that the node is a GPIO controller - -Optional properties: - -- interrupts: - Interrupt ID - -- interrupt-controller: - Specifies that the node is an interrupt controller - -- gpio-ranges: - Specifies the mapping between gpio controller and pin-controllers pins. - This requires 4 fields in cells defined as - - 1. Phandle of pin-controller. - 2. GPIO base pin offset. 
- 3 Pin-control base pin offset. - 4. number of gpio pins which are linearly mapped from pin base. - -Supported generic PINCONF properties in child nodes: - -- pins: - The list of pins (within the controller's own pin space) that properties -in the node apply to. Pin names are "gpio-" - -- bias-disable: - Disable pin bias - -- bias-pull-up: - Enable internal pull up resistor - -- bias-pull-down: - Enable internal pull down resistor - -- drive-strength: - Valid drive strength values include 2, 4, 6, 8, 10, 12, 14, 16 (mA) - -Example: - gpio_ccm: gpio@1800a000 { - compatible = "brcm,cygnus-ccm-gpio"; - reg = <0x1800a000 0x50>, - <0x0301d164 0x20>; - #gpio-cells = <2>; - gpio-controller; - interrupts = ; - interrupt-controller; - - touch_pins: touch_pins { - pwr: pwr { - pins = "gpio-0"; - drive-strength = <16>; - }; - - event: event { - pins = "gpio-1"; - bias-pull-up; - }; - }; - }; - - gpio_asiu: gpio@180a5000 { - compatible = "brcm,cygnus-asiu-gpio"; - reg = <0x180a5000 0x668>; - #gpio-cells = <2>; - gpio-controller; - interrupts = ; - interrupt-controller; - gpio-ranges = <&pinctrl 0 42 1>, - <&pinctrl 1 44 3>; - }; - - /* - * Touchscreen that uses the CCM GPIO 0 and 1 - */ - tsc { - ... - ... - gpio-pwr = <&gpio_ccm 0 0>; - gpio-event = <&gpio_ccm 1 0>; - }; - - /* Bluetooth that uses the ASIU GPIO 5, with polarity inverted */ - bluetooth { - ... - ... 
- bcm,rfkill-bank-sel = <&gpio_asiu 5 1> - } diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,iproc-gpio.txt b/Documentation/devicetree/bindings/pinctrl/brcm,iproc-gpio.txt new file mode 100644 index 000000000..e4277921f --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/brcm,iproc-gpio.txt @@ -0,0 +1,109 @@ +Broadcom iProc GPIO/PINCONF Controller + +Required properties: + +- compatible: + Must be "brcm,cygnus-ccm-gpio", "brcm,cygnus-asiu-gpio", + "brcm,cygnus-crmu-gpio" or "brcm,iproc-gpio" + +- reg: + Define the base and range of the I/O address space that contains SoC +GPIO/PINCONF controller registers + +- ngpios: + Total number of in-use slots in GPIO controller + +- #gpio-cells: + Must be two. The first cell is the GPIO pin number (within the +controller's pin space) and the second cell is used for the following: + bit[0]: polarity (0 for active high and 1 for active low) + +- gpio-controller: + Specifies that the node is a GPIO controller + +Optional properties: + +- interrupts: + Interrupt ID + +- interrupt-controller: + Specifies that the node is an interrupt controller + +- gpio-ranges: + Specifies the mapping between gpio controller and pin-controllers pins. + This requires 4 fields in cells defined as - + 1. Phandle of pin-controller. + 2. GPIO base pin offset. + 3 Pin-control base pin offset. + 4. number of gpio pins which are linearly mapped from pin base. + +Supported generic PINCONF properties in child nodes: + +- pins: + The list of pins (within the controller's own pin space) that properties +in the node apply to. 
Pin names are "gpio-" + +- bias-disable: + Disable pin bias + +- bias-pull-up: + Enable internal pull up resistor + +- bias-pull-down: + Enable internal pull down resistor + +- drive-strength: + Valid drive strength values include 2, 4, 6, 8, 10, 12, 14, 16 (mA) + +Example: + gpio_ccm: gpio@1800a000 { + compatible = "brcm,cygnus-ccm-gpio"; + reg = <0x1800a000 0x50>, + <0x0301d164 0x20>; + ngpios = <24>; + #gpio-cells = <2>; + gpio-controller; + interrupts = ; + interrupt-controller; + + touch_pins: touch_pins { + pwr: pwr { + pins = "gpio-0"; + drive-strength = <16>; + }; + + event: event { + pins = "gpio-1"; + bias-pull-up; + }; + }; + }; + + gpio_asiu: gpio@180a5000 { + compatible = "brcm,cygnus-asiu-gpio"; + reg = <0x180a5000 0x668>; + ngpios = <146>; + #gpio-cells = <2>; + gpio-controller; + interrupts = ; + interrupt-controller; + gpio-ranges = <&pinctrl 0 42 1>, + <&pinctrl 1 44 3>; + }; + + /* + * Touchscreen that uses the CCM GPIO 0 and 1 + */ + tsc { + ... + ... + gpio-pwr = <&gpio_ccm 0 0>; + gpio-event = <&gpio_ccm 1 0>; + }; + + /* Bluetooth that uses the ASIU GPIO 5, with polarity inverted */ + bluetooth { + ... + ... + bcm,rfkill-bank-sel = <&gpio_asiu 5 1> + } diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,nsp-gpio.txt b/Documentation/devicetree/bindings/pinctrl/brcm,nsp-gpio.txt new file mode 100644 index 000000000..0844168a6 --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/brcm,nsp-gpio.txt @@ -0,0 +1,80 @@ +Broadcom Northstar plus (NSP) GPIO/PINCONF Controller + +Required properties: +- compatible: + Must be "brcm,nsp-gpio-a" + +- reg: + Should contain the register physical address and length for each of + GPIO base, IO control registers + +- #gpio-cells: + Must be two. 
The first cell is the GPIO pin number (within the + controller's pin space) and the second cell is used for the following: + bit[0]: polarity (0 for active high and 1 for active low) + +- gpio-controller: + Specifies that the node is a GPIO controller + +- ngpios: + Number of gpios supported (58x25 supports 32 and 58x23 supports 24) + +Optional properties: +- interrupts: + Interrupt ID + +- interrupt-controller: + Specifies that the node is an interrupt controller + +- gpio-ranges: + Specifies the mapping between gpio controller and pin-controllers pins. + This requires 4 fields in cells defined as - + 1. Phandle of pin-controller. + 2. GPIO base pin offset. + 3 Pin-control base pin offset. + 4. number of gpio pins which are linearly mapped from pin base. + +Supported generic PINCONF properties in child nodes: +- pins: + The list of pins (within the controller's own pin space) that properties + in the node apply to. Pin names are "gpio-" + +- bias-disable: + Disable pin bias + +- bias-pull-up: + Enable internal pull up resistor + +- bias-pull-down: + Enable internal pull down resistor + +- drive-strength: + Valid drive strength values include 2, 4, 6, 8, 10, 12, 14, 16 (mA) + +Example: + + gpioa: gpio@18000020 { + compatible = "brcm,nsp-gpio-a"; + reg = <0x18000020 0x100>, + <0x1803f1c4 0x1c>; + #gpio-cells = <2>; + gpio-controller; + ngpios = <32>; + gpio-ranges = <&pinctrl 0 0 31>; + interrupt-controller; + interrupts = ; + + /* Hog a few default settings */ + pinctrl-names = "default"; + pinctrl-0 = <&led>; + led: led { + pins = "gpio-1"; + bias-pull-up; + }; + + pwr: pwr { + gpio-hog; + gpios = <3 1>; + output-high; + }; + }; diff --git a/Documentation/devicetree/bindings/pinctrl/lantiq,pinctrl-xway.txt b/Documentation/devicetree/bindings/pinctrl/lantiq,pinctrl-xway.txt index e89b46775..8e5216bcd 100644 --- a/Documentation/devicetree/bindings/pinctrl/lantiq,pinctrl-xway.txt +++ b/Documentation/devicetree/bindings/pinctrl/lantiq,pinctrl-xway.txt @@ -1,7 +1,16 @@ 
Lantiq XWAY pinmux controller Required properties: -- compatible: "lantiq,pinctrl-xway" or "lantiq,pinctrl-xr9" +- compatible: "lantiq,pinctrl-xway", (DEPRECATED: Use "lantiq,danube-pinctrl") + "lantiq,pinctrl-xr9", (DEPRECATED: Use "lantiq,xrx100-pinctrl" or + "lantiq,xrx200-pinctrl") + "lantiq,pinctrl-ase", (DEPRECATED: Use "lantiq,ase-pinctrl") + "lantiq,<chip>-pinctrl", where <chip> is: + "ase" (XWAY AMAZON Family) + "danube" (XWAY DANUBE Family) + "xrx100" (XWAY xRX100 Family) + "xrx200" (XWAY xRX200 Family) + "xrx300" (XWAY xRX300 Family) - reg: Should contain the physical address and length of the gpio/pinmux register range @@ -36,19 +45,87 @@ Required subnode-properties: Valid values for group and function names: +XWAY: (DEPRECATED: Use DANUBE) mux groups: exin0, exin1, exin2, jtag, ebu a23, ebu a24, ebu a25, ebu clk, ebu cs1, ebu wait, nand ale, nand cs1, nand cle, spi, spi_cs1, spi_cs2, spi_cs3, - spi_cs4, spi_cs5, spi_cs6, asc0, asc0 cts rts, stp, nmi , gpt1, gpt2, + spi_cs4, spi_cs5, spi_cs6, asc0, asc0 cts rts, stp, nmi, gpt1, gpt2, gpt3, clkout0, clkout1, clkout2, clkout3, gnt1, gnt2, gnt3, req1, req2, req3 - additional mux groups (XR9 only): - mdio, nand rdy, nand rd, exin3, exin4, gnt4, req4 + functions: + spi, asc, cgu, jtag, exin, stp, gpt, nmi, pci, ebu + +XR9: (DEPRECATED: Use xRX100/xRX200) + mux groups: + exin0, exin1, exin2, exin3, exin4, jtag, ebu a23, ebu a24, ebu a25, + ebu clk, ebu cs1, ebu wait, nand ale, nand cs1, nand cle, nand rdy, + nand rd, spi, spi_cs1, spi_cs2, spi_cs3, spi_cs4, spi_cs5, spi_cs6, + asc0, asc0 cts rts, stp, nmi, gpt1, gpt2, gpt3, clkout0, clkout1, + clkout2, clkout3, gnt1, gnt2, gnt3, gnt4, req1, req2, req3, req4, mdio, + gphy0 led0, gphy0 led1, gphy0 led2, gphy1 led0, gphy1 led1, gphy1 led2 + + functions: + spi, asc, cgu, jtag, exin, stp, gpt, nmi, pci, ebu, mdio, gphy + +AMAZON: + mux groups: + exin0, exin1, exin2, jtag, spi_di, spi_do, spi_clk, spi_cs1, spi_cs2, + spi_cs3, spi_cs4, spi_cs5, spi_cs6, asc, stp, gpt1, gpt2, 
gpt3, clkout0, + clkout1, clkout2, mdio, dfe led0, dfe led1, ephy led0, ephy led1, ephy led2 + + functions: + spi, asc, cgu, jtag, exin, stp, gpt, mdio, ephy, dfe + +DANUBE: + mux groups: + exin0, exin1, exin2, jtag, ebu a23, ebu a24, ebu a25, ebu clk, ebu cs1, + ebu wait, nand ale, nand cs1, nand cle, spi_di, spi_do, spi_clk, spi_cs1, + spi_cs2, spi_cs3, spi_cs4, spi_cs5, spi_cs6, asc0, asc0 cts rts, stp, nmi, + gpt1, gpt2, gpt3, clkout0, clkout1, clkout2, clkout3, gnt1, gnt2, gnt3, + req1, req2, req3, dfe led0, dfe led1 functions: - spi, asc, cgu, jtag, exin, stp, gpt, nmi, pci, ebu, mdio + spi, asc, cgu, jtag, exin, stp, gpt, nmi, pci, ebu, dfe +xRX100: + mux groups: + exin0, exin1, exin2, exin3, exin4, ebu a23, ebu a24, ebu a25, ebu clk, + ebu cs1, ebu wait, nand ale, nand cs1, nand cle, nand rdy, nand rd, + spi_di, spi_do, spi_clk, spi_cs1, spi_cs2, spi_cs3, spi_cs4, spi_cs5, + spi_cs6, asc0, asc0 cts rts, stp, nmi, gpt1, gpt2, gpt3, clkout0, clkout1, + clkout2, clkout3, gnt1, gnt2, gnt3, gnt4, req1, req2, req3, req4, mdio, + dfe led0, dfe led1 + + functions: + spi, asc, cgu, exin, stp, gpt, nmi, pci, ebu, mdio, dfe + +xRX200: + mux groups: + exin0, exin1, exin2, exin3, exin4, ebu a23, ebu a24, ebu a25, ebu clk, + ebu cs1, ebu wait, nand ale, nand cs1, nand cle, nand rdy, nand rd, + spi_di, spi_do, spi_clk, spi_cs1, spi_cs2, spi_cs3, spi_cs4, spi_cs5, + spi_cs6, usif uart_rx, usif uart_tx, usif uart_rts, usif uart_cts, + usif uart_dtr, usif uart_dsr, usif uart_dcd, usif uart_ri, usif spi_di, + usif spi_do, usif spi_clk, usif spi_cs0, usif spi_cs1, usif spi_cs2, + stp, nmi, gpt1, gpt2, gpt3, clkout0, clkout1, clkout2, clkout3, gnt1, + gnt2, gnt3, gnt4, req1, req2, req3, req4, mdio, dfe led0, dfe led1, + gphy0 led0, gphy0 led1, gphy0 led2, gphy1 led0, gphy1 led1, gphy1 led2 + + functions: + spi, usif, cgu, exin, stp, gpt, nmi, pci, ebu, mdio, dfe, gphy + +xRX300: + mux groups: + exin0, exin1, exin2, exin4, nand ale, nand cs0, nand cs1, nand cle, + nand rdy, nand 
rd, nand_d0, nand_d1, nand_d2, nand_d3, nand_d4, nand_d5, + nand_d6, nand_d7, nand_d1, nand wr, nand wp, nand se, spi_di, spi_do, + spi_clk, spi_cs1, spi_cs4, spi_cs6, usif uart_rx, usif uart_tx, + usif spi_di, usif spi_do, usif spi_clk, usif spi_cs0, stp, clkout2, + mdio, dfe led0, dfe led1, ephy0 led0, ephy0 led1, ephy1 led0, ephy1 led1 + + functions: + spi, usif, cgu, exin, stp, ebu, mdio, dfe, ephy Definition of pin configurations: @@ -62,15 +139,32 @@ Optional subnode-properties: 0: none, 1: down, 2: up. - lantiq,open-drain: Boolean, enables open-drain on the defined pin. -Valid values for XWAY pin names: +Valid values for XWAY pin names: (DEPRECATED: Use DANUBE) Pinconf pins can be referenced via the names io0-io31. -Valid values for XR9 pin names: +Valid values for XR9 pin names: (DEPRECATED: Use xRX100/xRX200) Pinconf pins can be referenced via the names io0-io55. +Valid values for AMAZON pin names: + Pinconf pins can be referenced via the names io0-io31. + +Valid values for DANUBE pin names: + Pinconf pins can be referenced via the names io0-io31. + +Valid values for xRX100 pin names: + Pinconf pins can be referenced via the names io0-io55. + +Valid values for xRX200 pin names: + Pinconf pins can be referenced via the names io0-io49. + +Valid values for xRX300 pin names: + Pinconf pins can be referenced via the names io0-io1,io3-io6,io8-io11, + io13-io19,io23-io27,io34-io36, + io42-io43,io48-io61. + Example: gpio: pinmux@E100B10 { - compatible = "lantiq,pinctrl-xway"; + compatible = "lantiq,danube-pinctrl"; pinctrl-names = "default"; pinctrl-0 = <&state_default>; diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-mt65xx.txt b/Documentation/devicetree/bindings/pinctrl/pinctrl-mt65xx.txt index 0480bc31b..9ffb0b276 100644 --- a/Documentation/devicetree/bindings/pinctrl/pinctrl-mt65xx.txt +++ b/Documentation/devicetree/bindings/pinctrl/pinctrl-mt65xx.txt @@ -4,10 +4,11 @@ The Mediatek's Pin controller is used to control SoC pins. 
Required properties: - compatible: value should be one of the following. - (a) "mediatek,mt8135-pinctrl", compatible with mt8135 pinctrl. - (b) "mediatek,mt8173-pinctrl", compatible with mt8173 pinctrl. - (c) "mediatek,mt6397-pinctrl", compatible with mt6397 pinctrl. - (d) "mediatek,mt8127-pinctrl", compatible with mt8127 pinctrl. + "mediatek,mt2701-pinctrl", compatible with mt2701 pinctrl. + "mediatek,mt6397-pinctrl", compatible with mt6397 pinctrl. + "mediatek,mt8127-pinctrl", compatible with mt8127 pinctrl. + "mediatek,mt8135-pinctrl", compatible with mt8135 pinctrl. + "mediatek,mt8173-pinctrl", compatible with mt8173 pinctrl. - pins-are-numbered: Specify the subnodes are using numbered pinmux to specify pins. - gpio-controller : Marks the device node as a gpio controller. diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt new file mode 100644 index 000000000..e312a71b2 --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt @@ -0,0 +1,199 @@ +Qualcomm MSM8996 TLMM block + +This binding describes the Top Level Mode Multiplexer block found in the +MSM8996 platform. + +- compatible: + Usage: required + Value type: <string> + Definition: must be "qcom,msm8996-pinctrl" + +- reg: + Usage: required + Value type: <prop-encoded-array> + Definition: the base address and size of the TLMM register space. + +- interrupts: + Usage: required + Value type: <prop-encoded-array> + Definition: should specify the TLMM summary IRQ. + +- interrupt-controller: + Usage: required + Value type: <none> + Definition: identifies this node as an interrupt controller + +- #interrupt-cells: + Usage: required + Value type: <u32> + Definition: must be 2. Specifying the pin number and flags, as defined + in <dt-bindings/interrupt-controller/irq.h> + +- gpio-controller: + Usage: required + Value type: <none> + Definition: identifies this node as a gpio controller + +- #gpio-cells: + Usage: required + Value type: <u32> + Definition: must be 2. 
Specifying the pin number and flags, as defined + in <dt-bindings/gpio/gpio.h> + +Please refer to ../gpio/gpio.txt and ../interrupt-controller/interrupts.txt for +a general description of GPIO and interrupt bindings. + +Please refer to pinctrl-bindings.txt in this directory for details of the +common pinctrl bindings used by client devices, including the meaning of the +phrase "pin configuration node". + +The pin configuration nodes act as a container for an arbitrary number of +subnodes. Each of these subnodes represents some desired configuration for a +pin, a group, or a list of pins or groups. This configuration can include the +mux function to select on those pin(s)/group(s), and various pin configuration +parameters, such as pull-up, drive strength, etc. + + +PIN CONFIGURATION NODES: + +The name of each subnode is not important; all subnodes should be enumerated +and processed purely based on their content. + +Each subnode only affects those parameters that are explicitly listed. In +other words, a subnode that lists a mux function but no pin configuration +parameters implies no information about any pin configuration parameters. +Similarly, a pin subnode that describes a pullup parameter implies no +information about e.g. the mux function. + + +The following generic properties as defined in pinctrl-bindings.txt are valid +to specify in a pin configuration subnode: + +- pins: + Usage: required + Value type: <string-array> + Definition: List of gpio pins affected by the properties specified in + this subnode. + + Valid pins are: + gpio0-gpio149 + Supports mux, bias and drive-strength + + sdc1_clk, sdc1_cmd, sdc1_data, sdc2_clk, sdc2_cmd, + sdc2_data, sdc1_rclk + Supports bias and drive-strength + +- function: + Usage: required + Value type: <string> + Definition: Specify the alternative function to be configured for the + specified pins. Functions are only valid for gpio pins. 
+ Valid values are: + + blsp_uart1, blsp_spi1, blsp_i2c1, blsp_uim1, atest_tsens, + bimc_dte1, dac_calib0, blsp_spi8, blsp_uart8, blsp_uim8, + qdss_cti_trig_out_b, bimc_dte0, dac_calib1, qdss_cti_trig_in_b, + dac_calib2, atest_tsens2, atest_usb1, blsp_spi10, blsp_uart10, + blsp_uim10, atest_bbrx1, atest_usb13, atest_bbrx0, atest_usb12, + mdp_vsync, edp_lcd, blsp_i2c10, atest_gpsadc1, atest_usb11, + atest_gpsadc0, edp_hot, atest_usb10, m_voc, dac_gpio, atest_char, + cam_mclk, pll_bypassnl, qdss_stm7, blsp_i2c8, qdss_tracedata_b, + pll_reset, qdss_stm6, qdss_stm5, qdss_stm4, atest_usb2, cci_i2c, + qdss_stm3, dac_calib3, atest_usb23, atest_char3, dac_calib4, + qdss_stm2, atest_usb22, atest_char2, qdss_stm1, dac_calib5, + atest_usb21, atest_char1, dbg_out, qdss_stm0, dac_calib6, + atest_usb20, atest_char0, dac_calib10, qdss_stm10, + qdss_cti_trig_in_a, cci_timer4, blsp_spi6, blsp_uart6, blsp_uim6, + blsp2_spi, qdss_stm9, qdss_cti_trig_out_a, dac_calib11, + qdss_stm8, cci_timer0, qdss_stm13, dac_calib7, cci_timer1, + qdss_stm12, dac_calib8, cci_timer2, blsp1_spi, qdss_stm11, + dac_calib9, cci_timer3, cci_async, dac_calib12, blsp_i2c6, + qdss_tracectl_a, dac_calib13, qdss_traceclk_a, dac_calib14, + dac_calib15, hdmi_rcv, dac_calib16, hdmi_cec, pwr_modem, + dac_calib17, hdmi_ddc, pwr_nav, dac_calib18, pwr_crypto, + dac_calib19, hdmi_hot, dac_calib20, dac_calib21, pci_e0, + dac_calib22, dac_calib23, dac_calib24, tsif1_sync, dac_calib25, + sd_write, tsif1_error, blsp_spi2, blsp_uart2, blsp_uim2, + qdss_cti, blsp_i2c2, blsp_spi3, blsp_uart3, blsp_uim3, blsp_i2c3, + uim3, blsp_spi9, blsp_uart9, blsp_uim9, blsp10_spi, blsp_i2c9, + blsp_spi7, blsp_uart7, blsp_uim7, qdss_tracedata_a, blsp_i2c7, + qua_mi2s, gcc_gp1_clk_a, ssc_irq, uim4, blsp_spi11, blsp_uart11, + blsp_uim11, gcc_gp2_clk_a, gcc_gp3_clk_a, blsp_i2c11, cri_trng0, + cri_trng1, cri_trng, qdss_stm18, pri_mi2s, qdss_stm17, blsp_spi4, + blsp_uart4, blsp_uim4, qdss_stm16, qdss_stm15, blsp_i2c4, + qdss_stm14, dac_calib26, 
spkr_i2s, audio_ref, lpass_slimbus, + isense_dbg, tsense_pwm1, tsense_pwm2, btfm_slimbus, ter_mi2s, + qdss_stm22, qdss_stm21, qdss_stm20, qdss_stm19, gcc_gp1_clk_b, + sec_mi2s, blsp_spi5, blsp_uart5, blsp_uim5, gcc_gp2_clk_b, + gcc_gp3_clk_b, blsp_i2c5, blsp_spi12, blsp_uart12, blsp_uim12, + qdss_stm25, qdss_stm31, blsp_i2c12, qdss_stm30, qdss_stm29, + tsif1_clk, qdss_stm28, tsif1_en, tsif1_data, sdc4_cmd, qdss_stm27, + qdss_traceclk_b, tsif2_error, sdc43, vfr_1, qdss_stm26, tsif2_clk, + sdc4_clk, qdss_stm24, tsif2_en, sdc42, qdss_stm23, qdss_tracectl_b, + sd_card, tsif2_data, sdc41, tsif2_sync, sdc40, mdp_vsync_p_b, + ldo_en, mdp_vsync_s_b, ldo_update, blsp11_uart_tx_b, blsp11_uart_rx_b, + blsp11_i2c_sda_b, prng_rosc, blsp11_i2c_scl_b, uim2, uim1, uim_batt, + pci_e2, pa_indicator, adsp_ext, ddr_bist, qdss_tracedata_11, + qdss_tracedata_12, modem_tsync, nav_dr, nav_pps, pci_e1, gsm_tx, + qspi_cs, ssbi2, ssbi1, mss_lte, qspi_clk, qspi0, qspi1, qspi2, qspi3, + gpio + +- bias-disable: + Usage: optional + Value type: <none> + Definition: The specified pins should be configured as no pull. + +- bias-pull-down: + Usage: optional + Value type: <none> + Definition: The specified pins should be configured as pull down. + +- bias-pull-up: + Usage: optional + Value type: <none> + Definition: The specified pins should be configured as pull up. + +- output-high: + Usage: optional + Value type: <none> + Definition: The specified pins are configured in output mode, driven + high. + Not valid for sdc pins. + +- output-low: + Usage: optional + Value type: <none> + Definition: The specified pins are configured in output mode, driven + low. + Not valid for sdc pins. + +- drive-strength: + Usage: optional + Value type: <u32> + Definition: Selects the drive strength for the specified pins, in mA. 
+ Valid values are: 2, 4, 6, 8, 10, 12, 14 and 16 + +Example: + + tlmm: pinctrl@01010000 { + compatible = "qcom,msm8996-pinctrl"; + reg = <0x01010000 0x300000>; + interrupts = <0 208 0>; + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + + uart_console_active: uart_console_active { + mux { + pins = "gpio4", "gpio5"; + function = "blsp_uart8"; + }; + + config { + pins = "gpio4", "gpio5"; + drive-strength = <2>; + bias-disable; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.txt b/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.txt index 1ae63c0ac..a90c812ad 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.txt +++ b/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.txt @@ -14,6 +14,7 @@ PMIC's from Qualcomm. "qcom,pm8917-gpio" "qcom,pm8921-gpio" "qcom,pm8941-gpio" + "qcom,pm8994-gpio" "qcom,pma8084-gpio" - reg: @@ -79,6 +80,7 @@ to specify in a pin configuration subnode: gpio1-gpio38 for pm8917 gpio1-gpio44 for pm8921 gpio1-gpio36 for pm8941 + gpio1-gpio22 for pm8994 gpio1-gpio22 for pma8084 - function: diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,pmic-mpp.txt b/Documentation/devicetree/bindings/pinctrl/qcom,pmic-mpp.txt index d7803a2a9..d74e631e1 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,pmic-mpp.txt +++ b/Documentation/devicetree/bindings/pinctrl/qcom,pmic-mpp.txt @@ -15,6 +15,7 @@ of PMIC's from Qualcomm. "qcom,pm8917-mpp", "qcom,pm8921-mpp", "qcom,pm8941-mpp", + "qcom,pm8994-mpp", "qcom,pma8084-mpp", - reg: diff --git a/Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.txt index 391ef4be8..0cd701b19 100644 --- a/Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.txt @@ -21,7 +21,8 @@ defined as gpio sub-nodes of the pinmux controller. 
Required properties for iomux controller: - compatible: one of "rockchip,rk2928-pinctrl", "rockchip,rk3066a-pinctrl" "rockchip,rk3066b-pinctrl", "rockchip,rk3188-pinctrl" - "rockchip,rk3288-pinctrl", "rockchip,rk3368-pinctrl" + "rockchip,rk3228-pinctrl", "rockchip,rk3288-pinctrl" + "rockchip,rk3368-pinctrl" - rockchip,grf: phandle referencing a syscon providing the "general register files" diff --git a/Documentation/devicetree/bindings/pinctrl/samsung-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/samsung-pinctrl.txt index 9d2a99529..6db16b908 100644 --- a/Documentation/devicetree/bindings/pinctrl/samsung-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/samsung-pinctrl.txt @@ -17,6 +17,7 @@ Required Properties: - "samsung,exynos4x12-pinctrl": for Exynos4x12 compatible pin-controller. - "samsung,exynos5250-pinctrl": for Exynos5250 compatible pin-controller. - "samsung,exynos5260-pinctrl": for Exynos5260 compatible pin-controller. + - "samsung,exynos5410-pinctrl": for Exynos5410 compatible pin-controller. - "samsung,exynos5420-pinctrl": for Exynos5420 compatible pin-controller. - "samsung,exynos7-pinctrl": for Exynos7 compatible pin-controller. 
diff --git a/Documentation/devicetree/bindings/pwm/lpc32xx-pwm.txt b/Documentation/devicetree/bindings/pwm/lpc32xx-pwm.txt index cfe1db3bb..74b5bc5dd 100644 --- a/Documentation/devicetree/bindings/pwm/lpc32xx-pwm.txt +++ b/Documentation/devicetree/bindings/pwm/lpc32xx-pwm.txt @@ -6,7 +6,12 @@ Required properties: Examples: -pwm@0x4005C000 { +pwm@4005c000 { compatible = "nxp,lpc3220-pwm"; - reg = <0x4005C000 0x8>; + reg = <0x4005c000 0x4>; +}; + +pwm@4005c004 { + compatible = "nxp,lpc3220-pwm"; + reg = <0x4005c004 0x4>; }; diff --git a/Documentation/devicetree/bindings/pwm/pwm-omap-dmtimer.txt b/Documentation/devicetree/bindings/pwm/pwm-omap-dmtimer.txt new file mode 100644 index 000000000..5befb538d --- /dev/null +++ b/Documentation/devicetree/bindings/pwm/pwm-omap-dmtimer.txt @@ -0,0 +1,18 @@ +* OMAP PWM for dual-mode timers + +Required properties: +- compatible: Shall contain "ti,omap-dmtimer-pwm". +- ti,timers: phandle to PWM capable OMAP timer. See arm/omap/timer.txt for info + about these timers. +- #pwm-cells: Should be 3. See pwm.txt in this directory for a description of + the cells format. + +Optional properties: +- ti,prescaler: Should be a value between 0 and 7, see the timers datasheet + +Example: + pwm9: dmtimer-pwm@9 { + compatible = "ti,omap-dmtimer-pwm"; + ti,timers = <&timer9>; + #pwm-cells = <3>; + }; diff --git a/Documentation/devicetree/bindings/regulator/lm363x-regulator.txt b/Documentation/devicetree/bindings/regulator/lm363x-regulator.txt new file mode 100644 index 000000000..8f14df9d1 --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/lm363x-regulator.txt @@ -0,0 +1,34 @@ +TI LMU LM363x regulator device tree bindings + +LM363x regulator driver supports LM3631 and LM3632. +LM3631 has five regulators and LM3632 supports three regulators. + +Required property: + - compatible: "ti,lm363x-regulator" + +Optional properties: + LM3632 has external enable pins for two LDOs. + - ti,lcm-en1-gpio: A GPIO specifier for Vpos control pin. 
+ - ti,lcm-en2-gpio: A GPIO specifier for Vneg control pin. + +Child nodes: + LM3631 + - vboost + - vcont + - voref + - vpos + - vneg + + LM3632 + - vboost + - vpos + - vneg + + Optional properties of a child node: + Each sub-node should contain the constraints and initialization. + Please refer to [1]. + +Examples: Please refer to ti-lmu dt-bindings [2]. + +[1] ../regulator/regulator.txt +[2] ../mfd/ti-lmu.txt diff --git a/Documentation/devicetree/bindings/regulator/pv88060.txt b/Documentation/devicetree/bindings/regulator/pv88060.txt new file mode 100644 index 000000000..10a6dadc0 --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/pv88060.txt @@ -0,0 +1,124 @@ +* Powerventure Semiconductor PV88060 Voltage Regulator + +Required properties: +- compatible: "pvs,pv88060". +- reg: I2C slave address, usually 0x49. +- interrupts: the interrupt outputs of the controller +- regulators: A node that houses a sub-node for each regulator within the + device. Each sub-node is identified using the node's name, with valid + values listed below. The content of each sub-node is defined by the + standard binding for regulators; see regulator.txt. + BUCK1, LDO1, LDO2, LDO3, LDO4, LDO5, LDO6, LDO7, SW1, SW2, SW3, SW4, + SW5, and SW6. 
+ +Optional properties: +- Any optional property defined in regulator.txt + +Example + + pmic: pv88060@49 { + compatible = "pvs,pv88060"; + reg = <0x49>; + interrupt-parent = <&gpio>; + interrupts = <24 24>; + + regulators { + BUCK1 { + regulator-name = "buck1"; + regulator-min-microvolt = <2800000>; + regulator-max-microvolt = <4387500>; + regulator-min-microamp = <1496000>; + regulator-max-microamp = <4189000>; + regulator-boot-on; + }; + + LDO1 { + regulator-name = "ldo1"; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <3350000>; + regulator-boot-on; + }; + + LDO2 { + regulator-name = "ldo2"; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <3350000>; + regulator-boot-on; + }; + + LDO3 { + regulator-name = "ldo3"; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <3350000>; + regulator-boot-on; + }; + + LDO4 { + regulator-name = "ldo4"; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <3350000>; + regulator-boot-on; + }; + + LDO5 { + regulator-name = "ldo5"; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <3350000>; + regulator-boot-on; + }; + + LDO6 { + regulator-name = "ldo6"; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <3350000>; + regulator-boot-on; + }; + + LDO7 { + regulator-name = "ldo7"; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <3350000>; + regulator-boot-on; + }; + + SW1 { + regulator-name = "sw1"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + }; + + SW2 { + regulator-name = "sw2"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + regulator-boot-on; + }; + + SW3 { + regulator-name = "sw3"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + regulator-boot-on; + }; + + SW4 { + regulator-name = "sw4"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + regulator-boot-on; + }; + + SW5 { + 
regulator-name = "sw5"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + regulator-boot-on; + }; + + SW6 { + regulator-name = "sw6"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + }; + }; + }; \ No newline at end of file diff --git a/Documentation/devicetree/bindings/regulator/pv88090.txt b/Documentation/devicetree/bindings/regulator/pv88090.txt new file mode 100644 index 000000000..e52b2a95c --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/pv88090.txt @@ -0,0 +1,65 @@ +* Powerventure Semiconductor PV88090 Voltage Regulator + +Required properties: +- compatible: "pvs,pv88090". +- reg: I2C slave address, usually 0x48. +- interrupts: the interrupt outputs of the controller +- regulators: A node that houses a sub-node for each regulator within the + device. Each sub-node is identified using the node's name, with valid + values listed below. The content of each sub-node is defined by the + standard binding for regulators; see regulator.txt. + BUCK1, BUCK2, BUCK3, LDO1, and LDO2. 
+ +Optional properties: +- Any optional property defined in regulator.txt + +Example + + pmic: pv88090@48 { + compatible = "pvs,pv88090"; + reg = <0x48>; + interrupt-parent = <&gpio>; + interrupts = <24 24>; + + regulators { + BUCK1 { + regulator-name = "buck1"; + regulator-min-microvolt = < 600000>; + regulator-max-microvolt = <1393750>; + regulator-min-microamp = < 220000>; + regulator-max-microamp = <7040000>; + regulator-boot-on; + }; + + BUCK2 { + regulator-name = "buck2"; + regulator-min-microvolt = < 600000>; + regulator-max-microvolt = <1393750>; + regulator-min-microamp = <1496000>; + regulator-max-microamp = <4189000>; + }; + + BUCK3 { + regulator-name = "buck3"; + regulator-min-microvolt = <600000>; + regulator-max-microvolt = <1393750>; + regulator-min-microamp = <1496000>; + regulator-max-microamp = <4189000>; + regulator-boot-on; + }; + + LDO1 { + regulator-name = "ldo1"; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <4350000>; + regulator-boot-on; + }; + + LDO2 { + regulator-name = "ldo2"; + regulator-min-microvolt = < 650000>; + regulator-max-microvolt = <2225000>; + regulator-boot-on; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.txt b/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.txt new file mode 100644 index 000000000..1f8d6f84b --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.txt @@ -0,0 +1,159 @@ +QCOM SMD RPM REGULATOR + +The Qualcomm RPM over SMD regulator is modelled as a subdevice of the RPM. +Because SMD is used as the communication transport mechanism, the RPM resides as +a subnode of the SMD. As such, the SMD-RPM regulator requires that the SMD and +RPM nodes be present. + +Please refer to Documentation/devicetree/bindings/soc/qcom/qcom,smd.txt for +information pertaining to the SMD node. + +Please refer to Documentation/devicetree/bindings/soc/qcom/qcom,smd-rpm.txt for +information regarding the RPM node. 
+ +== Regulator + +Regulator nodes are identified by their compatible: + +- compatible: + Usage: required + Value type: + Definition: must be one of: + "qcom,rpm-pm8841-regulators" + "qcom,rpm-pm8916-regulators" + "qcom,rpm-pm8941-regulators" + "qcom,rpm-pma8084-regulators" + +- vdd_s1-supply: +- vdd_s2-supply: +- vdd_s3-supply: +- vdd_s4-supply: +- vdd_s5-supply: +- vdd_s6-supply: +- vdd_s7-supply: +- vdd_s8-supply: + Usage: optional (pm8841 only) + Value type: + Definition: reference to regulator supplying the input pin, as + described in the data sheet + +- vdd_s1-supply: +- vdd_s2-supply: +- vdd_s3-supply: +- vdd_s4-supply: +- vdd_l1_l2_l3-supply: +- vdd_l4_l5_l6-supply: +- vdd_l7-supply: +- vdd_l8_l9_l10_l11_l12_l13_l14_l15_l16_l17_l18-supply: + Usage: optional (pm8916 only) + Value type: + Definition: reference to regulator supplying the input pin, as + described in the data sheet + +- vdd_s1-supply: +- vdd_s2-supply: +- vdd_s3-supply: +- vdd_l1_l3-supply: +- vdd_l2_lvs1_2_3-supply: +- vdd_l4_l11-supply: +- vdd_l5_l7-supply: +- vdd_l6_l12_l14_l15-supply: +- vdd_l8_l16_l18_l19-supply: +- vdd_l9_l10_l17_l22-supply: +- vdd_l13_l20_l23_l24-supply: +- vdd_l21-supply: +- vin_5vs-supply: + Usage: optional (pm8941 only) + Value type: + Definition: reference to regulator supplying the input pin, as + described in the data sheet + +- vdd_s1-supply: +- vdd_s2-supply: +- vdd_s3-supply: +- vdd_s4-supply: +- vdd_s5-supply: +- vdd_s6-supply: +- vdd_s7-supply: +- vdd_s8-supply: +- vdd_s9-supply: +- vdd_s10-supply: +- vdd_s11-supply: +- vdd_s12-supply: +- vdd_l1_l11-supply: +- vdd_l2_l3_l4_l27-supply: +- vdd_l5_l7-supply: +- vdd_l6_l12_l14_l15_l26-supply: +- vdd_l8-supply: +- vdd_l9_l10_l13_l20_l23_l24-supply: +- vdd_l16_l25-supply: +- vdd_l17-supply: +- vdd_l18-supply: +- vdd_l19-supply: +- vdd_l21-supply: +- vdd_l22-supply: + Usage: optional (pma8084 only) + Value type: + Definition: reference to regulator supplying the input pin, as + described in the data sheet + +The 
regulator node houses sub-nodes for each regulator within the device. Each +sub-node is identified using the node's name, with valid values listed for each +of the pmics below. + +pm8841: + s1, s2, s3, s4, s5, s6, s7, s8 + +pm8916: + s1, s2, s3, s4, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, + l14, l15, l16, l17, l18 + +pm8941: + s1, s2, s3, s4, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, + l14, l15, l16, l17, l18, l19, l20, l21, l22, l23, l24, lvs1, lvs2, + lvs3, 5vs1, 5vs2 + +pma8084: + s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, l1, l2, l3, l4, l5, + l6, l7, l8, l9, l10, l11, l12, l13, l14, l15, l16, l17, l18, l19, l20, + l21, l22, l23, l24, l25, l26, l27, lvs1, lvs2, lvs3, lvs4, 5vs1 + +The content of each sub-node is defined by the standard binding for regulators - +see regulator.txt. + += EXAMPLE + + smd { + compatible = "qcom,smd"; + + rpm { + interrupts = <0 168 1>; + qcom,ipc = <&apcs 8 0>; + qcom,smd-edge = <15>; + + rpm_requests { + compatible = "qcom,rpm-msm8974"; + qcom,smd-channels = "rpm_requests"; + + pm8941-regulators { + compatible = "qcom,rpm-pm8941-regulators"; + vdd_l13_l20_l23_l24-supply = <&pm8941_boost>; + + pm8941_s3: s3 { + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + }; + + pm8941_boost: s4 { + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + }; + + pm8941_l20: l20 { + regulator-min-microvolt = <2950000>; + regulator-max-microvolt = <2950000>; + }; + }; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/regulator/s5m8767-regulator.txt b/Documentation/devicetree/bindings/regulator/s5m8767-regulator.txt deleted file mode 100644 index 20191315e..000000000 --- a/Documentation/devicetree/bindings/regulator/s5m8767-regulator.txt +++ /dev/null @@ -1,163 +0,0 @@ -* Samsung S5M8767 Voltage and Current Regulator - -The Samsung S5M8767 is a multi-function device which includes voltage and -current regulators, rtc, charger controller and other 
sub-blocks. It is -interfaced to the host controller using a i2c interface. Each sub-block is -addressed by the host system using different i2c slave address. This document -describes the bindings for 'pmic' sub-block of s5m8767. - -Required properties: -- compatible: Should be "samsung,s5m8767-pmic". -- reg: Specifies the i2c slave address of the pmic block. It should be 0x66. - -- s5m8767,pmic-buck2-dvs-voltage: A set of 8 voltage values in micro-volt (uV) - units for buck2 when changing voltage using gpio dvs. Refer to [1] below - for additional information. - -- s5m8767,pmic-buck3-dvs-voltage: A set of 8 voltage values in micro-volt (uV) - units for buck3 when changing voltage using gpio dvs. Refer to [1] below - for additional information. - -- s5m8767,pmic-buck4-dvs-voltage: A set of 8 voltage values in micro-volt (uV) - units for buck4 when changing voltage using gpio dvs. Refer to [1] below - for additional information. - -- s5m8767,pmic-buck-ds-gpios: GPIO specifiers for three host gpio's used - for selecting GPIO DVS lines. It is one-to-one mapped to dvs gpio lines. - -[1] If none of the 's5m8767,pmic-buck[2/3/4]-uses-gpio-dvs' optional - property is specified, the 's5m8767,pmic-buck[2/3/4]-dvs-voltage' - property should specify atleast one voltage level (which would be a - safe operating voltage). - - If either of the 's5m8767,pmic-buck[2/3/4]-uses-gpio-dvs' optional - property is specified, then all the eight voltage values for the - 's5m8767,pmic-buck[2/3/4]-dvs-voltage' should be specified. - -Optional properties: -- interrupt-parent: Specifies the phandle of the interrupt controller to which - the interrupts from s5m8767 are delivered to. -- interrupts: Interrupt specifiers for two interrupt sources. - - First interrupt specifier is for 'irq1' interrupt. - - Second interrupt specifier is for 'alert' interrupt. -- s5m8767,pmic-buck2-uses-gpio-dvs: 'buck2' can be controlled by gpio dvs. 
-- s5m8767,pmic-buck3-uses-gpio-dvs: 'buck3' can be controlled by gpio dvs. -- s5m8767,pmic-buck4-uses-gpio-dvs: 'buck4' can be controlled by gpio dvs. - -Additional properties required if either of the optional properties are used: - -- s5m8767,pmic-buck234-default-dvs-idx: Default voltage setting selected from - the possible 8 options selectable by the dvs gpios. The value of this - property should be between 0 and 7. If not specified or if out of range, the - default value of this property is set to 0. - -- s5m8767,pmic-buck-dvs-gpios: GPIO specifiers for three host gpio's used - for dvs. The format of the gpio specifier depends in the gpio controller. - -Regulators: The regulators of s5m8767 that have to be instantiated should be -included in a sub-node named 'regulators'. Regulator nodes included in this -sub-node should be of the format as listed below. - - regulator_name { - ldo1_reg: LDO1 { - regulator-name = "VDD_ALIVE_1.0V"; - regulator-min-microvolt = <1100000>; - regulator-max-microvolt = <1100000>; - regulator-always-on; - regulator-boot-on; - op_mode = <1>; /* Normal Mode */ - }; - }; -The above regulator entries are defined in regulator bindings documentation -except these properties: - - op_mode: describes the different operating modes of the LDO's with - power mode change in SOC. The different possible values are, - 0 - always off mode - 1 - on in normal mode - 2 - low power mode - 3 - suspend mode - - s5m8767,pmic-ext-control-gpios: (optional) GPIO specifier for one - GPIO controlling this regulator (enable/disable); This is - valid only for buck9. - -The following are the names of the regulators that the s5m8767 pmic block -supports. Note: The 'n' in LDOn and BUCKn represents the LDO or BUCK number -as per the datasheet of s5m8767. - - - LDOn - - valid values for n are 1 to 28 - - Example: LDO1, LDO2, LDO28 - - BUCKn - - valid values for n are 1 to 9. 
- - Example: BUCK1, BUCK2, BUCK9 - -The bindings inside the regulator nodes use the standard regulator bindings -which are documented elsewhere. - -Example: - - s5m8767_pmic@66 { - compatible = "samsung,s5m8767-pmic"; - reg = <0x66>; - - s5m8767,pmic-buck2-uses-gpio-dvs; - s5m8767,pmic-buck3-uses-gpio-dvs; - s5m8767,pmic-buck4-uses-gpio-dvs; - - s5m8767,pmic-buck-default-dvs-idx = <0>; - - s5m8767,pmic-buck-dvs-gpios = <&gpx0 0 0>, /* DVS1 */ - <&gpx0 1 0>, /* DVS2 */ - <&gpx0 2 0>; /* DVS3 */ - - s5m8767,pmic-buck-ds-gpios = <&gpx2 3 0>, /* SET1 */ - <&gpx2 4 0>, /* SET2 */ - <&gpx2 5 0>; /* SET3 */ - - s5m8767,pmic-buck2-dvs-voltage = <1350000>, <1300000>, - <1250000>, <1200000>, - <1150000>, <1100000>, - <1000000>, <950000>; - - s5m8767,pmic-buck3-dvs-voltage = <1100000>, <1100000>, - <1100000>, <1100000>, - <1000000>, <1000000>, - <1000000>, <1000000>; - - s5m8767,pmic-buck4-dvs-voltage = <1200000>, <1200000>, - <1200000>, <1200000>, - <1200000>, <1200000>, - <1200000>, <1200000>; - - regulators { - ldo1_reg: LDO1 { - regulator-name = "VDD_ABB_3.3V"; - regulator-min-microvolt = <3300000>; - regulator-max-microvolt = <3300000>; - op_mode = <1>; /* Normal Mode */ - }; - - ldo2_reg: LDO2 { - regulator-name = "VDD_ALIVE_1.1V"; - regulator-min-microvolt = <1100000>; - regulator-max-microvolt = <1100000>; - regulator-always-on; - }; - - buck1_reg: BUCK1 { - regulator-name = "VDD_MIF_1.2V"; - regulator-min-microvolt = <950000>; - regulator-max-microvolt = <1350000>; - regulator-always-on; - regulator-boot-on; - }; - - vemmc_reg: BUCK9 { - regulator-name = "VMEM_VDD_2.8V"; - regulator-min-microvolt = <2800000>; - regulator-max-microvolt = <2800000>; - op_mode = <3>; /* Standby Mode */ - s5m8767,pmic-ext-control-gpios = <&gpk0 2 0>; - }; - }; - }; diff --git a/Documentation/devicetree/bindings/regulator/samsung,s2mpa01.txt b/Documentation/devicetree/bindings/regulator/samsung,s2mpa01.txt new file mode 100644 index 000000000..bae3c7f83 --- /dev/null +++ 
b/Documentation/devicetree/bindings/regulator/samsung,s2mpa01.txt @@ -0,0 +1,79 @@ +Binding for Samsung S2MPA01 regulator block +=========================================== + +This is a part of device tree bindings for S2M family multi-function devices. +More information can be found in bindings/mfd/sec-core.txt file. + +The S2MPA01 device provides buck and LDO regulators. + +To register these with regulator framework instantiate under main device node +a sub-node named "regulators" with more sub-nodes for each regulator using the +common regulator binding documented in: + - Documentation/devicetree/bindings/regulator/regulator.txt + + +Names of regulators supported by S2MPA01 device: + - LDOn + - valid values for n are 1 to 26 + - Example: LDO1, LDO2, LDO26 + - BUCKn + - valid values for n are 1 to 10. + - Example: BUCK1, BUCK2, BUCK9 +Note: The 'n' in LDOn and BUCKn represents the LDO or BUCK number +as per the datasheet of device. + + +Optional properties of buck regulator nodes under "regulators" sub-node: + - regulator-ramp-delay: ramp delay in uV/us. May be 6250, 12500 + (default), 25000, or 50000. May be 0 for disabling the ramp delay on + BUCK{1,2,3,4}. + + In the absence of the regulator-ramp-delay property, the default ramp + delay will be used. + + Note: Some bucks share the ramp rate setting i.e. same ramp value + will be set for a particular group of bucks so provide the same + regulator-ramp-delay value for them. + Groups sharing ramp rate: + - buck{1,6}, + - buck{2,4}, + - buck{8,9,10}. 
+ +Example: + + s2mpa01_pmic@66 { + compatible = "samsung,s2mpa01-pmic"; + reg = <0x66>; + + regulators { + ldo1_reg: LDO1 { + regulator-name = "VDD_ALIVE"; + regulator-min-microvolt = <1000000>; + regulator-max-microvolt = <1000000>; + }; + + ldo2_reg: LDO2 { + regulator-name = "VDDQ_MMC2"; + regulator-min-microvolt = <2800000>; + regulator-max-microvolt = <2800000>; + regulator-always-on; + }; + + buck1_reg: BUCK1 { + regulator-name = "vdd_mif"; + regulator-min-microvolt = <950000>; + regulator-max-microvolt = <1350000>; + regulator-always-on; + regulator-boot-on; + }; + + buck2_reg: BUCK2 { + regulator-name = "vdd_arm"; + regulator-min-microvolt = <950000>; + regulator-max-microvolt = <1350000>; + regulator-always-on; + regulator-boot-on; + regulator-ramp-delay = <50000>; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/regulator/samsung,s2mps11.txt b/Documentation/devicetree/bindings/regulator/samsung,s2mps11.txt new file mode 100644 index 000000000..27a48bf1b --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/samsung,s2mps11.txt @@ -0,0 +1,102 @@ +Binding for Samsung S2M family regulator block +============================================== + +This is a part of device tree bindings for S2M family multi-function devices. +More information can be found in bindings/mfd/sec-core.txt file. + +The S2MPS11/13/14/15 and S2MPU02 devices provide buck and LDO regulators. 
+ +To register these with regulator framework instantiate under main device node +a sub-node named "regulators" with more sub-nodes for each regulator using the +common regulator binding documented in: + - Documentation/devicetree/bindings/regulator/regulator.txt + + +Names of regulators supported by different devices: + - LDOn + - valid values for n are: + - S2MPS11: 1 to 38 + - S2MPS13: 1 to 40 + - S2MPS14: 1 to 25 + - S2MPS15: 1 to 27 + - S2MPU02: 1 to 28 + - Example: LDO1, LDO2, LDO28 + - BUCKn + - valid values for n are: + - S2MPS11: 1 to 10 + - S2MPS13: 1 to 10 + - S2MPS14: 1 to 5 + - S2MPS15: 1 to 10 + - S2MPU02: 1 to 7 + - Example: BUCK1, BUCK2, BUCK9 +Note: The 'n' in LDOn and BUCKn represents the LDO or BUCK number +as per the datasheet of device. + + +Optional properties of the nodes under "regulators" sub-node: + - regulator-ramp-delay: ramp delay in uV/us. May be 6250, 12500, + 25000 (default) or 50000. + + Additionally S2MPS11 supports disabling ramp delay for BUCK{2,3,4,6} + by setting it to <0>. + + Note: On S2MPS11 some bucks share the ramp rate setting i.e. same ramp value + will be set for a particular group of bucks so provide the same + regulator-ramp-delay value for them. + Groups sharing ramp rate: + - buck{1,6}, + - buck{3,4}, + - buck{7,8,10}. + + - samsung,ext-control-gpios: On S2MPS14 the LDO10, LDO11 and LDO12 can be + configured to external control over GPIO. 
To turn this feature on this + property must be added to the regulator sub-node: + - samsung,ext-control-gpios: GPIO specifier for one GPIO + controlling this regulator (enable/disable) + Example: + LDO12 { + regulator-name = "V_EMMC_2.8V"; + regulator-min-microvolt = <2800000>; + regulator-max-microvolt = <2800000>; + samsung,ext-control-gpios = <&gpk0 2 0>; + }; + + +Example: + + s2mps11_pmic@66 { + compatible = "samsung,s2mps11-pmic"; + reg = <0x66>; + + regulators { + ldo1_reg: LDO1 { + regulator-name = "VDD_ABB_3.3V"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + }; + + ldo2_reg: LDO2 { + regulator-name = "VDD_ALIVE_1.1V"; + regulator-min-microvolt = <1100000>; + regulator-max-microvolt = <1100000>; + regulator-always-on; + }; + + buck1_reg: BUCK1 { + regulator-name = "vdd_mif"; + regulator-min-microvolt = <950000>; + regulator-max-microvolt = <1350000>; + regulator-always-on; + regulator-boot-on; + }; + + buck2_reg: BUCK2 { + regulator-name = "vdd_arm"; + regulator-min-microvolt = <950000>; + regulator-max-microvolt = <1350000>; + regulator-always-on; + regulator-boot-on; + regulator-ramp-delay = <50000>; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt b/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt new file mode 100644 index 000000000..093edda0c --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt @@ -0,0 +1,145 @@ +Binding for Samsung S5M8767 regulator block +=========================================== + +This is a part of device tree bindings for S5M family multi-function devices. +More information can be found in bindings/mfd/sec-core.txt file. + +The S5M8767 device provides buck and LDO regulators.
+ +To register these with regulator framework instantiate under main device node +a sub-node named "regulators" with more sub-nodes for each regulator using the +common regulator binding documented in: + - Documentation/devicetree/bindings/regulator/regulator.txt + + +Required properties of the main device node (the parent!): + - s5m8767,pmic-buck2-dvs-voltage: A set of 8 voltage values in micro-volt (uV) + units for buck2 when changing voltage using gpio dvs. Refer to [1] below + for additional information. + + - s5m8767,pmic-buck3-dvs-voltage: A set of 8 voltage values in micro-volt (uV) + units for buck3 when changing voltage using gpio dvs. Refer to [1] below + for additional information. + + - s5m8767,pmic-buck4-dvs-voltage: A set of 8 voltage values in micro-volt (uV) + units for buck4 when changing voltage using gpio dvs. Refer to [1] below + for additional information. + + - s5m8767,pmic-buck-ds-gpios: GPIO specifiers for three host gpio's used + for selecting GPIO DVS lines. It is one-to-one mapped to dvs gpio lines. + + [1] If none of the 's5m8767,pmic-buck[2/3/4]-uses-gpio-dvs' optional + property is specified, the 's5m8767,pmic-buck[2/3/4]-dvs-voltage' + property should specify at least one voltage level (which would be a + safe operating voltage). + + If either of the 's5m8767,pmic-buck[2/3/4]-uses-gpio-dvs' optional + property is specified, then all the eight voltage values for the + 's5m8767,pmic-buck[2/3/4]-dvs-voltage' should be specified. + +Optional properties of the main device node (the parent!): + - s5m8767,pmic-buck2-uses-gpio-dvs: 'buck2' can be controlled by gpio dvs. + - s5m8767,pmic-buck3-uses-gpio-dvs: 'buck3' can be controlled by gpio dvs. + - s5m8767,pmic-buck4-uses-gpio-dvs: 'buck4' can be controlled by gpio dvs. + +Additional properties required if either of the optional properties are used: + + - s5m8767,pmic-buck234-default-dvs-idx: Default voltage setting selected from + the possible 8 options selectable by the dvs gpios.
The value of this + property should be between 0 and 7. If not specified or if out of range, the + default value of this property is set to 0. + + - s5m8767,pmic-buck-dvs-gpios: GPIO specifiers for three host gpio's used + for dvs. The format of the gpio specifier depends on the gpio controller. + + +Names of regulators supported by S5M8767 device: + - LDOn + - valid values for n are 1 to 28 + - Example: LDO1, LDO2, LDO28 + - BUCKn + - valid values for n are 1 to 9. + - Example: BUCK1, BUCK2, BUCK9 +Note: The 'n' in LDOn and BUCKn represents the LDO or BUCK number +as per the datasheet of device. + + +Optional properties of the nodes under "regulators" sub-node: + - op_mode: describes the different operating modes of the LDO's with + power mode change in SOC. The different possible values are, + 0 - always off mode + 1 - on in normal mode + 2 - low power mode + 3 - suspend mode + - s5m8767,pmic-ext-control-gpios: (optional) GPIO specifier for one + GPIO controlling this regulator + (enable/disable); This is valid only + for buck9.
+ +Example: + + s5m8767_pmic@66 { + compatible = "samsung,s5m8767-pmic"; + reg = <0x66>; + + s5m8767,pmic-buck2-uses-gpio-dvs; + s5m8767,pmic-buck3-uses-gpio-dvs; + s5m8767,pmic-buck4-uses-gpio-dvs; + + s5m8767,pmic-buck-default-dvs-idx = <0>; + + s5m8767,pmic-buck-dvs-gpios = <&gpx0 0 0>, /* DVS1 */ + <&gpx0 1 0>, /* DVS2 */ + <&gpx0 2 0>; /* DVS3 */ + + s5m8767,pmic-buck-ds-gpios = <&gpx2 3 0>, /* SET1 */ + <&gpx2 4 0>, /* SET2 */ + <&gpx2 5 0>; /* SET3 */ + + s5m8767,pmic-buck2-dvs-voltage = <1350000>, <1300000>, + <1250000>, <1200000>, + <1150000>, <1100000>, + <1000000>, <950000>; + + s5m8767,pmic-buck3-dvs-voltage = <1100000>, <1100000>, + <1100000>, <1100000>, + <1000000>, <1000000>, + <1000000>, <1000000>; + + s5m8767,pmic-buck4-dvs-voltage = <1200000>, <1200000>, + <1200000>, <1200000>, + <1200000>, <1200000>, + <1200000>, <1200000>; + + regulators { + ldo1_reg: LDO1 { + regulator-name = "VDD_ABB_3.3V"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + op_mode = <1>; /* Normal Mode */ + }; + + ldo2_reg: LDO2 { + regulator-name = "VDD_ALIVE_1.1V"; + regulator-min-microvolt = <1100000>; + regulator-max-microvolt = <1100000>; + regulator-always-on; + }; + + buck1_reg: BUCK1 { + regulator-name = "VDD_MIF_1.2V"; + regulator-min-microvolt = <950000>; + regulator-max-microvolt = <1350000>; + regulator-always-on; + regulator-boot-on; + }; + + vemmc_reg: BUCK9 { + regulator-name = "VMEM_VDD_2.8V"; + regulator-min-microvolt = <2800000>; + regulator-max-microvolt = <2800000>; + op_mode = <3>; /* Standby Mode */ + s5m8767,pmic-ext-control-gpios = <&gpk0 2 0>; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/reset/hisilicon,hi6220-reset.txt b/Documentation/devicetree/bindings/reset/hisilicon,hi6220-reset.txt new file mode 100644 index 000000000..e0b185a94 --- /dev/null +++ b/Documentation/devicetree/bindings/reset/hisilicon,hi6220-reset.txt @@ -0,0 +1,34 @@ +Hisilicon System Reset Controller 
+====================================== + +Please also refer to reset.txt in this directory for common reset +controller binding usage. + +The reset controller registers are part of the system-ctl block on +hi6220 SoC. + +Required properties: +- compatible: may be "hisilicon,hi6220-sysctrl" +- reg: should be register base and length as documented in the + datasheet +- #reset-cells: 1, see below + +Example: +sys_ctrl: sys_ctrl@f7030000 { + compatible = "hisilicon,hi6220-sysctrl", "syscon"; + reg = <0x0 0xf7030000 0x0 0x2000>; + #clock-cells = <1>; + #reset-cells = <1>; +}; + +Specifying reset lines connected to IP modules +============================================== +example: + + uart1: serial@..... { + ... + resets = <&sys_ctrl PERIPH_RSTEN3_UART1>; + ... + }; + +The index could be found in <dt-bindings/reset/hisilicon,hi6220-resets.h>. diff --git a/Documentation/devicetree/bindings/rtc/s3c-rtc.txt b/Documentation/devicetree/bindings/rtc/s3c-rtc.txt index ac2fcd6ff..1068ffce9 100644 --- a/Documentation/devicetree/bindings/rtc/s3c-rtc.txt +++ b/Documentation/devicetree/bindings/rtc/s3c-rtc.txt @@ -14,6 +14,10 @@ Required properties: interrupt number is the rtc alarm interrupt and second interrupt number is the rtc tick interrupt. The number of cells representing a interrupt depends on the parent interrupt controller. +- clocks: Must contain a list of phandle and clock specifier for the rtc + and source clocks. +- clock-names: Must contain "rtc" and "rtc_src" entries sorted in the + same order as the clocks property.
Example: @@ -21,4 +25,6 @@ Example: compatible = "samsung,s3c6410-rtc"; reg = <0x10070000 0x100>; interrupts = <44 0 45 0>; + clocks = <&clock CLK_RTC>, <&s2mps11_osc S2MPS11_CLK_AP>; + clock-names = "rtc", "rtc_src"; }; diff --git a/Documentation/devicetree/bindings/scsi/hisilicon-sas.txt b/Documentation/devicetree/bindings/scsi/hisilicon-sas.txt new file mode 100644 index 000000000..f67e761bc --- /dev/null +++ b/Documentation/devicetree/bindings/scsi/hisilicon-sas.txt @@ -0,0 +1,69 @@ +* HiSilicon SAS controller + +The HiSilicon SAS controller supports SAS/SATA. + +Main node required properties: + - compatible : value should be as follows: + (a) "hisilicon,hip05-sas-v1" for v1 hw in hip05 chipset + - sas-addr : array of 8 bytes for host SAS address + - reg : Address and length of the SAS register + - hisilicon,sas-syscon: phandle of syscon used for sas control + - ctrl-reset-reg : offset to controller reset register in ctrl reg + - ctrl-reset-sts-reg : offset to controller reset status register in ctrl reg + - ctrl-clock-ena-reg : offset to controller clock enable register in ctrl reg + - queue-count : number of delivery and completion queues in the controller + - phy-count : number of phys accessible by the controller + - interrupts : Interrupts for phys, completion queues, and fatal + sources; the interrupts are ordered in 3 groups, as follows: + - Phy interrupts + - Completion queue interrupts + - Fatal interrupts + Phy interrupts : Each phy has 3 interrupt sources: + - broadcast + - phyup + - abnormal + The phy interrupts are ordered into groups of 3 per phy + (broadcast, phyup, and abnormal) in increasing order. + Completion queue interrupts : each completion queue has 1 + interrupt source. + The interrupts are ordered in increasing order. 
+ Fatal interrupts : the fatal interrupts are ordered as follows: + - ECC + - AXI bus + +Example: + sas0: sas@c1000000 { + compatible = "hisilicon,hip05-sas-v1"; + sas-addr = [50 01 88 20 16 00 00 0a]; + reg = <0x0 0xc1000000 0x0 0x10000>; + hisilicon,sas-syscon = <&pcie_sas>; + ctrl-reset-reg = <0xa60>; + ctrl-reset-sts-reg = <0x5a30>; + ctrl-clock-ena-reg = <0x338>; + queue-count = <32>; + phy-count = <8>; + dma-coherent; + interrupt-parent = <&mbigen_dsa>; + interrupts = <259 4>,<263 4>,<264 4>,/* phy0 */ + <269 4>,<273 4>,<274 4>,/* phy1 */ + <279 4>,<283 4>,<284 4>,/* phy2 */ + <289 4>,<293 4>,<294 4>,/* phy3 */ + <299 4>,<303 4>,<304 4>,/* phy4 */ + <309 4>,<313 4>,<314 4>,/* phy5 */ + <319 4>,<323 4>,<324 4>,/* phy6 */ + <329 4>,<333 4>,<334 4>,/* phy7 */ + <336 1>,<337 1>,<338 1>,/* cq0-2 */ + <339 1>,<340 1>,<341 1>,/* cq3-5 */ + <342 1>,<343 1>,<344 1>,/* cq6-8 */ + <345 1>,<346 1>,<347 1>,/* cq9-11 */ + <348 1>,<349 1>,<350 1>,/* cq12-14 */ + <351 1>,<352 1>,<353 1>,/* cq15-17 */ + <354 1>,<355 1>,<356 1>,/* cq18-20 */ + <357 1>,<358 1>,<359 1>,/* cq21-23 */ + <360 1>,<361 1>,<362 1>,/* cq24-26 */ + <363 1>,<364 1>,<365 1>,/* cq27-29 */ + <366 1>,<367 1>,/* cq30-31 */ + <376 4>,/* fatal ecc */ + <381 4>;/* fatal axi */ + status = "disabled"; + }; diff --git a/Documentation/devicetree/bindings/serial/8250.txt b/Documentation/devicetree/bindings/serial/8250.txt index 91d5ab0e6..936ab5b87 100644 --- a/Documentation/devicetree/bindings/serial/8250.txt +++ b/Documentation/devicetree/bindings/serial/8250.txt @@ -14,7 +14,6 @@ Required properties: tegra132, or tegra210.
- "nxp,lpc3220-uart" - "ralink,rt2880-uart" - - "ibm,qpace-nwp-serial" - "altr,16550-FIFO32" - "altr,16550-FIFO64" - "altr,16550-FIFO128" diff --git a/Documentation/devicetree/bindings/serial/fsl-imx-uart.txt b/Documentation/devicetree/bindings/serial/fsl-imx-uart.txt index 35ae1fb35..ed94c217c 100644 --- a/Documentation/devicetree/bindings/serial/fsl-imx-uart.txt +++ b/Documentation/devicetree/bindings/serial/fsl-imx-uart.txt @@ -9,7 +9,7 @@ Optional properties: - fsl,uart-has-rtscts : Indicate the uart has rts and cts - fsl,irda-mode : Indicate the uart supports irda mode - fsl,dte-mode : Indicate the uart works in DTE mode. The uart works - is DCE mode by default. + in DCE mode by default. Note: Each uart controller should have an alias correctly numbered in "aliases" node. diff --git a/Documentation/devicetree/bindings/serial/mtk-uart.txt b/Documentation/devicetree/bindings/serial/mtk-uart.txt index 2d47add34..a833a016f 100644 --- a/Documentation/devicetree/bindings/serial/mtk-uart.txt +++ b/Documentation/devicetree/bindings/serial/mtk-uart.txt @@ -2,15 +2,15 @@ Required properties: - compatible should contain: - * "mediatek,mt8135-uart" for MT8135 compatible UARTS + * "mediatek,mt2701-uart" for MT2701 compatible UARTS + * "mediatek,mt6580-uart" for MT6580 compatible UARTS + * "mediatek,mt6582-uart" for MT6582 compatible UARTS + * "mediatek,mt6589-uart" for MT6589 compatible UARTS + * "mediatek,mt6795-uart" for MT6795 compatible UARTS * "mediatek,mt8127-uart" for MT8127 compatible UARTS + * "mediatek,mt8135-uart" for MT8135 compatible UARTS * "mediatek,mt8173-uart" for MT8173 compatible UARTS - * "mediatek,mt6795-uart" for MT6795 compatible UARTS - * "mediatek,mt6589-uart" for MT6589 compatible UARTS - * "mediatek,mt6582-uart" for MT6582 compatible UARTS - * "mediatek,mt6580-uart" for MT6580 compatible UARTS - * "mediatek,mt6577-uart" for all compatible UARTS (MT8173, MT6795, - MT6589, MT6582, MT6580, MT6577) + * "mediatek,mt6577-uart" for MT6577 and all of the 
above - reg: The base address of the UART register bank. diff --git a/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt b/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt index 73f825e5e..401b1b33c 100644 --- a/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt +++ b/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt @@ -2,7 +2,7 @@ Required properties: - - compatible: Must contain one of the following: + - compatible: Must contain one or more of the following: - "renesas,scif-r7s72100" for R7S72100 (RZ/A1H) SCIF compatible UART. - "renesas,scifa-r8a73a4" for R8A73A4 (R-Mobile APE6) SCIFA compatible UART. @@ -15,10 +15,14 @@ Required properties: - "renesas,scifa-r8a7790" for R8A7790 (R-Car H2) SCIFA compatible UART. - "renesas,scifb-r8a7790" for R8A7790 (R-Car H2) SCIFB compatible UART. - "renesas,hscif-r8a7790" for R8A7790 (R-Car H2) HSCIF compatible UART. - - "renesas,scif-r8a7791" for R8A7791 (R-Car M2) SCIF compatible UART. - - "renesas,scifa-r8a7791" for R8A7791 (R-Car M2) SCIFA compatible UART. - - "renesas,scifb-r8a7791" for R8A7791 (R-Car M2) SCIFB compatible UART. - - "renesas,hscif-r8a7791" for R8A7791 (R-Car M2) HSCIF compatible UART. + - "renesas,scif-r8a7791" for R8A7791 (R-Car M2-W) SCIF compatible UART. + - "renesas,scifa-r8a7791" for R8A7791 (R-Car M2-W) SCIFA compatible UART. + - "renesas,scifb-r8a7791" for R8A7791 (R-Car M2-W) SCIFB compatible UART. + - "renesas,hscif-r8a7791" for R8A7791 (R-Car M2-W) HSCIF compatible UART. + - "renesas,scif-r8a7793" for R8A7793 (R-Car M2-N) SCIF compatible UART. + - "renesas,scifa-r8a7793" for R8A7793 (R-Car M2-N) SCIFA compatible UART. + - "renesas,scifb-r8a7793" for R8A7793 (R-Car M2-N) SCIFB compatible UART. + - "renesas,hscif-r8a7793" for R8A7793 (R-Car M2-N) HSCIF compatible UART. - "renesas,scif-r8a7794" for R8A7794 (R-Car E2) SCIF compatible UART. - "renesas,scifa-r8a7794" for R8A7794 (R-Car E2) SCIFA compatible UART. 
- "renesas,scifb-r8a7794" for R8A7794 (R-Car E2) SCIFB compatible UART. @@ -27,6 +31,14 @@ Required properties: - "renesas,hscif-r8a7795" for R8A7795 (R-Car H3) HSCIF compatible UART. - "renesas,scifa-sh73a0" for SH73A0 (SH-Mobile AG5) SCIFA compatible UART. - "renesas,scifb-sh73a0" for SH73A0 (SH-Mobile AG5) SCIFB compatible UART. + - "renesas,rcar-gen1-scif" for R-Car Gen1 SCIF compatible UART, + - "renesas,rcar-gen2-scif" for R-Car Gen2 SCIF compatible UART, + - "renesas,rcar-gen3-scif" for R-Car Gen3 SCIF compatible UART, + - "renesas,rcar-gen2-scifa" for R-Car Gen2 SCIFA compatible UART, + - "renesas,rcar-gen2-scifb" for R-Car Gen2 SCIFB compatible UART, + - "renesas,rcar-gen1-hscif" for R-Car Gen1 HSCIF compatible UART, + - "renesas,rcar-gen2-hscif" for R-Car Gen2 HSCIF compatible UART, + - "renesas,rcar-gen3-hscif" for R-Car Gen3 HSCIF compatible UART, - "renesas,scif" for generic SCIF compatible UART. - "renesas,scifa" for generic SCIFA compatible UART. - "renesas,scifb" for generic SCIFB compatible UART. @@ -34,15 +46,26 @@ Required properties: - "renesas,sci" for generic SCI compatible UART. When compatible with the generic version, nodes must list the - SoC-specific version corresponding to the platform first followed by the - generic version. + SoC-specific version corresponding to the platform first, followed by the + family-specific and/or generic versions. - reg: Base address and length of the I/O registers used by the UART. - interrupts: Must contain an interrupt-specifier for the SCIx interrupt. - clocks: Must contain a phandle and clock-specifier pair for each entry in clock-names. - - clock-names: Must contain "sci_ick" for the SCIx UART interface clock. + - clock-names: Must contain "fck" for the SCIx UART functional clock. + Apart from the divided functional clock, there may be other possible + sources for the sampling clock, depending on SCIx variant. 
+ On (H)SCI(F) and some SCIFA, an additional clock may be specified: + - "hsck" for the optional external clock input (on HSCIF), + - "sck" for the optional external clock input (on other variants). + On UARTs equipped with a Baud Rate Generator for External Clock (BRG) + (some SCIF and HSCIF), additional clocks may be specified: + - "brg_int" for the optional internal clock source for the frequency + divider (typically the (AXI or SHwy) bus clock), + - "scif_clk" for the optional external clock source for the frequency + divider (SCIF_CLK). Note: Each enabled SCIx UART should have an alias correctly numbered in the "aliases" node. @@ -58,12 +81,13 @@ Example: }; scifa0: serial@e6c40000 { - compatible = "renesas,scifa-r8a7790", "renesas,scifa"; + compatible = "renesas,scifa-r8a7790", + "renesas,rcar-gen2-scifa", "renesas,scifa"; reg = <0 0xe6c40000 0 64>; interrupt-parent = <&gic>; interrupts = <0 144 IRQ_TYPE_LEVEL_HIGH>; clocks = <&mstp2_clks R8A7790_CLK_SCIFA0>; - clock-names = "sci_ick"; + clock-names = "fck"; dmas = <&dmac0 0x21>, <&dmac0 0x22>; dma-names = "tx", "rx"; }; diff --git a/Documentation/devicetree/bindings/soc/bcm/raspberrypi,bcm2835-power.txt b/Documentation/devicetree/bindings/soc/bcm/raspberrypi,bcm2835-power.txt new file mode 100644 index 000000000..30942cf79 --- /dev/null +++ b/Documentation/devicetree/bindings/soc/bcm/raspberrypi,bcm2835-power.txt @@ -0,0 +1,47 @@ +Raspberry Pi power domain driver + +Required properties: + +- compatible: Should be "raspberrypi,bcm2835-power". +- firmware: Reference to the RPi firmware device node. +- #power-domain-cells: Should be <1>, as we provide multiple power domains.
+ +The valid defines for power domain are: + + RPI_POWER_DOMAIN_I2C0 + RPI_POWER_DOMAIN_I2C1 + RPI_POWER_DOMAIN_I2C2 + RPI_POWER_DOMAIN_VIDEO_SCALER + RPI_POWER_DOMAIN_VPU1 + RPI_POWER_DOMAIN_HDMI + RPI_POWER_DOMAIN_USB + RPI_POWER_DOMAIN_VEC + RPI_POWER_DOMAIN_JPEG + RPI_POWER_DOMAIN_H264 + RPI_POWER_DOMAIN_V3D + RPI_POWER_DOMAIN_ISP + RPI_POWER_DOMAIN_UNICAM0 + RPI_POWER_DOMAIN_UNICAM1 + RPI_POWER_DOMAIN_CCP2RX + RPI_POWER_DOMAIN_CSI2 + RPI_POWER_DOMAIN_CPI + RPI_POWER_DOMAIN_DSI0 + RPI_POWER_DOMAIN_DSI1 + RPI_POWER_DOMAIN_TRANSPOSER + RPI_POWER_DOMAIN_CCP2TX + RPI_POWER_DOMAIN_CDP + RPI_POWER_DOMAIN_ARM + +Example: + +power: power { + compatible = "raspberrypi,bcm2835-power"; + firmware = <&firmware>; + #power-domain-cells = <1>; +}; + +Example for using power domain: + +&usb { + power-domains = <&power RPI_POWER_DOMAIN_USB>; +}; diff --git a/Documentation/devicetree/bindings/soc/dove/pmu.txt b/Documentation/devicetree/bindings/soc/dove/pmu.txt new file mode 100644 index 000000000..edd40b796 --- /dev/null +++ b/Documentation/devicetree/bindings/soc/dove/pmu.txt @@ -0,0 +1,56 @@ +Device Tree bindings for Marvell PMU + +Required properties: + - compatible: value should be "marvell,dove-pmu". + May also include "simple-bus" if there are child devices, in which + case the ranges node is required. + - reg: two base addresses and sizes of the PM controller and PMU. + - interrupts: single interrupt number for the PMU interrupt + - interrupt-controller: must be specified as the PMU itself is an + interrupt controller. + - #interrupt-cells: must be 1. + - #reset-cells: must be 1. + - domains: sub-node containing domain descriptions + +Optional properties: + - ranges: defines the address mapping for child devices, as per the + standard property of this name. Required when compatible includes + "simple-bus". + +Power domain descriptions are listed as child nodes of the "domains" +sub-node. 
Each domain has the following properties: + +Required properties: + - #power-domain-cells: must be 0. + +Optional properties: + - marvell,pmu_pwr_mask: specifies the mask value for PMU power register + - marvell,pmu_iso_mask: specifies the mask value for PMU isolation register + - resets: points to the reset manager (PMU node) and reset index. + +Example: + + pmu: power-management@d0000 { + compatible = "marvell,dove-pmu"; + reg = <0xd0000 0x8000>, <0xd8000 0x8000>; + interrupts = <33>; + interrupt-controller; + #interrupt-cells = <1>; + #reset-cells = <1>; + + domains { + vpu_domain: vpu-domain { + #power-domain-cells = <0>; + marvell,pmu_pwr_mask = <0x00000008>; + marvell,pmu_iso_mask = <0x00000001>; + resets = <&pmu 16>; + }; + + gpu_domain: gpu-domain { + #power-domain-cells = <0>; + marvell,pmu_pwr_mask = <0x00000004>; + marvell,pmu_iso_mask = <0x00000002>; + resets = <&pmu 18>; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/soc/mediatek/scpsys.txt b/Documentation/devicetree/bindings/soc/mediatek/scpsys.txt index a6c8afc83..e8f15e340 100644 --- a/Documentation/devicetree/bindings/soc/mediatek/scpsys.txt +++ b/Documentation/devicetree/bindings/soc/mediatek/scpsys.txt @@ -21,6 +21,18 @@ Required properties: These are the clocks which hardware needs to be enabled before enabling certain power domains. 
+Optional properties: +- vdec-supply: Power supply for the vdec power domain +- venc-supply: Power supply for the venc power domain +- isp-supply: Power supply for the isp power domain +- mm-supply: Power supply for the mm power domain +- venc_lt-supply: Power supply for the venc_lt power domain +- audio-supply: Power supply for the audio power domain +- usb-supply: Power supply for the usb power domain +- mfg_async-supply: Power supply for the mfg_async power domain +- mfg_2d-supply: Power supply for the mfg_2d power domain +- mfg-supply: Power supply for the mfg power domain + Example: scpsys: scpsys@10006000 { diff --git a/Documentation/devicetree/bindings/soc/qcom,smd-rpm.txt b/Documentation/devicetree/bindings/soc/qcom,smd-rpm.txt deleted file mode 100644 index e27f5c4c5..000000000 --- a/Documentation/devicetree/bindings/soc/qcom,smd-rpm.txt +++ /dev/null @@ -1,117 +0,0 @@ -Qualcomm Resource Power Manager (RPM) over SMD - -This driver is used to interface with the Resource Power Manager (RPM) found in -various Qualcomm platforms. The RPM allows each component in the system to vote -for state of the system resources, such as clocks, regulators and bus -frequencies. - -- compatible: - Usage: required - Value type: - Definition: must be one of: - "qcom,rpm-msm8974" - -- qcom,smd-channels: - Usage: required - Value type: - Definition: Shared Memory channel used for communication with the RPM - -= SUBDEVICES - -The RPM exposes resources to its subnodes. The below bindings specify the set -of valid subnodes that can operate on these resources. 
- -== Regulators - -Regulator nodes are identified by their compatible: - -- compatible: - Usage: required - Value type: - Definition: must be one of: - "qcom,rpm-pm8841-regulators" - "qcom,rpm-pm8941-regulators" - -- vdd_s1-supply: -- vdd_s2-supply: -- vdd_s3-supply: -- vdd_s4-supply: -- vdd_s5-supply: -- vdd_s6-supply: -- vdd_s7-supply: -- vdd_s8-supply: - Usage: optional (pm8841 only) - Value type: - Definition: reference to regulator supplying the input pin, as - described in the data sheet - -- vdd_s1-supply: -- vdd_s2-supply: -- vdd_s3-supply: -- vdd_l1_l3-supply: -- vdd_l2_lvs1_2_3-supply: -- vdd_l4_l11-supply: -- vdd_l5_l7-supply: -- vdd_l6_l12_l14_l15-supply: -- vdd_l8_l16_l18_l19-supply: -- vdd_l9_l10_l17_l22-supply: -- vdd_l13_l20_l23_l24-supply: -- vdd_l21-supply: -- vin_5vs-supply: - Usage: optional (pm8941 only) - Value type: - Definition: reference to regulator supplying the input pin, as - described in the data sheet - -The regulator node houses sub-nodes for each regulator within the device. Each -sub-node is identified using the node's name, with valid values listed for each -of the pmics below. - -pm8841: - s1, s2, s3, s4, s5, s6, s7, s8 - -pm8941: - s1, s2, s3, s4, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, - l14, l15, l16, l17, l18, l19, l20, l21, l22, l23, l24, lvs1, lvs2, - lvs3, 5vs1, 5vs2 - -The content of each sub-node is defined by the standard binding for regulators - -see regulator.txt. 
- -= EXAMPLE - - smd { - compatible = "qcom,smd"; - - rpm { - interrupts = <0 168 1>; - qcom,ipc = <&apcs 8 0>; - qcom,smd-edge = <15>; - - rpm_requests { - compatible = "qcom,rpm-msm8974"; - qcom,smd-channels = "rpm_requests"; - - pm8941-regulators { - compatible = "qcom,rpm-pm8941-regulators"; - vdd_l13_l20_l23_l24-supply = <&pm8941_boost>; - - pm8941_s3: s3 { - regulator-min-microvolt = <1800000>; - regulator-max-microvolt = <1800000>; - }; - - pm8941_boost: s4 { - regulator-min-microvolt = <5000000>; - regulator-max-microvolt = <5000000>; - }; - - pm8941_l20: l20 { - regulator-min-microvolt = <2950000>; - regulator-max-microvolt = <2950000>; - }; - }; - }; - }; - }; - diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,smd-rpm.txt b/Documentation/devicetree/bindings/soc/qcom/qcom,smd-rpm.txt new file mode 100644 index 000000000..a48049ccf --- /dev/null +++ b/Documentation/devicetree/bindings/soc/qcom/qcom,smd-rpm.txt @@ -0,0 +1,58 @@ +Qualcomm Resource Power Manager (RPM) over SMD + +This driver is used to interface with the Resource Power Manager (RPM) found in +various Qualcomm platforms. The RPM allows each component in the system to vote +for state of the system resources, such as clocks, regulators and bus +frequencies. + +The SMD information for the RPM edge should be filled out. See qcom,smd.txt for +the required edge properties. All SMD related properties will reside within the +RPM node itself. + += SUBDEVICES + +The RPM exposes resources to its subnodes. The rpm_requests node must be +present and this subnode may contain children that designate regulator +resources. 
+ +- compatible: + Usage: required + Value type: + Definition: must be one of: + "qcom,rpm-apq8084" + "qcom,rpm-msm8916" + "qcom,rpm-msm8974" + +- qcom,smd-channels: + Usage: required + Value type: + Definition: must be "rpm_requests" + +Refer to Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.txt +for information on the regulator subnodes that can exist under the rpm_requests. + +Example: + + soc { + apcs: syscon@f9011000 { + compatible = "syscon"; + reg = <0xf9011000 0x1000>; + }; + }; + + smd { + compatible = "qcom,smd"; + + rpm { + interrupts = <0 168 1>; + qcom,ipc = <&apcs 8 0>; + qcom,smd-edge = <15>; + + rpm_requests { + compatible = "qcom,rpm-msm8974"; + qcom,smd-channels = "rpm_requests"; + + ... + }; + }; + }; diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,smp2p.txt b/Documentation/devicetree/bindings/soc/qcom/qcom,smp2p.txt new file mode 100644 index 000000000..5cc82b835 --- /dev/null +++ b/Documentation/devicetree/bindings/soc/qcom/qcom,smp2p.txt @@ -0,0 +1,104 @@ +Qualcomm Shared Memory Point 2 Point binding + +The Shared Memory Point to Point (SMP2P) protocol facilitates communication of +a single 32-bit value between two processors. Each value has a single writer +(the local side) and a single reader (the remote side). Values are uniquely +identified in the system by the directed edge (local processor ID to remote +processor ID) and a string identifier. 
+ +- compatible: + Usage: required + Value type: + Definition: must be one of: + "qcom,smp2p" + +- interrupts: + Usage: required + Value type: + Definition: one entry specifying the smp2p notification interrupt + +- qcom,ipc: + Usage: required + Value type: + Definition: three entries specifying the outgoing ipc bit used for + signaling the remote end of the smp2p edge: + - phandle to a syscon node representing the apcs registers + - u32 representing offset to the register within the syscon + - u32 representing the ipc bit within the register + +- qcom,smem: + Usage: required + Value type: + Definition: two identifiers of the inbound and outbound smem items used + for this edge + +- qcom,local-pid: + Usage: required + Value type: + Definition: specifies the identifier of the local endpoint of this edge + +- qcom,remote-pid: + Usage: required + Value type: + Definition: specifies the identifier of the remote endpoint of this edge + += SUBNODES +Each SMP2P pair contains a set of inbound and outbound entries, these are +described in subnodes of the smp2p device node. The node names are not +important.
+ +- qcom,entry-name: + Usage: required + Value type: + Definition: specifies the name of this entry, for inbound entries this + will be used to match against the remotely allocated entry + and for outbound entries this name is used for allocating + entries + +- interrupt-controller: + Usage: required for incoming entries + Value type: + Definition: marks the entry as inbound; the node should be specified + as a two cell interrupt-controller as defined in + "../interrupt-controller/interrupts.txt" + If not specified this node will denote the outgoing entry + +- #interrupt-cells: + Usage: required for incoming entries + Value type: + Definition: must be 2 - denoting the bit in the entry and IRQ flags + +- #qcom,state-cells: + Usage: required for outgoing entries + Value type: + Definition: must be 1 - denoting the bit in the entry + += EXAMPLE +The following example shows the SMP2P setup with the wireless processor, +defined from the 8974 apps processor's point-of-view. It encompasses one +inbound and one outbound entry: + +wcnss-smp2p { + compatible = "qcom,smp2p"; + qcom,smem = <431>, <451>; + + interrupts = <0 143 1>; + + qcom,ipc = <&apcs 8 18>; + + qcom,local-pid = <0>; + qcom,remote-pid = <4>; + + wcnss_smp2p_out: master-kernel { + qcom,entry-name = "master-kernel"; + + #qcom,state-cells = <1>; + }; + + wcnss_smp2p_in: slave-kernel { + qcom,entry-name = "slave-kernel"; + + interrupt-controller; + #interrupt-cells = <2>; + }; +}; diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,smsm.txt b/Documentation/devicetree/bindings/soc/qcom/qcom,smsm.txt new file mode 100644 index 000000000..a6634c708 --- /dev/null +++ b/Documentation/devicetree/bindings/soc/qcom/qcom,smsm.txt @@ -0,0 +1,104 @@ +Qualcomm Shared Memory State Machine + +The Shared Memory State Machine facilitates broadcasting of single bit state +information between the processors in a Qualcomm SoC. Each processor is +assigned 32 bits of state that can be modified. 
A processor can through a +matrix of bitmaps signal subscription of notifications upon changes to a +certain bit owned by a certain remote processor. + +- compatible: + Usage: required + Value type: + Definition: must be one of: + "qcom,smsm" + +- qcom,ipc-N: + Usage: required + Value type: + Definition: three entries specifying the outgoing ipc bit used for + signaling the N:th remote processor + - phandle to a syscon node representing the apcs registers + - u32 representing offset to the register within the syscon + - u32 representing the ipc bit within the register + +- qcom,local-host: + Usage: optional + Value type: + Definition: identifier of the local processor in the list of hosts, or + in other words specifier of the column in the subscription + matrix representing the local processor + defaults to host 0 + +- #address-cells: + Usage: required + Value type: + Definition: must be 1 + +- #size-cells: + Usage: required + Value type: + Definition: must be 0 + += SUBNODES +Each processor's state bits are described by a subnode of the smsm device node. +Nodes can either be flagged as an interrupt-controller to denote a remote +processor's state bits or the local processors bits. The node names are not +important. 
+ +- reg: + Usage: required + Value type: <u32> + Definition: specifies the offset, in words, of the first bit for this + entry + +- #qcom,state-cells: + Usage: required for local entry + Value type: <u32> + Definition: must be 1 - denotes bit number + +- interrupt-controller: + Usage: required for remote entries + Value type: <empty> + Definition: marks the entry as an interrupt-controller and the state bits + to belong to a remote processor + +- #interrupt-cells: + Usage: required for remote entries + Value type: <u32> + Definition: must be 2 - denotes bit number and IRQ flags + +- interrupts: + Usage: required for remote entries + Value type: <prop-encoded-array> + Definition: one entry specifying remote IRQ used by the remote processor + to signal changes of its state bits + + += EXAMPLE +The following example shows the SMSM setup for controlling properties of the +wireless processor, defined from the 8974 apps processor's point-of-view. It +encompasses one outbound entry and the outgoing interrupt for the wireless +processor. + +smsm { + compatible = "qcom,smsm"; + + #address-cells = <1>; + #size-cells = <0>; + + qcom,ipc-3 = <&apcs 8 19>; + + apps_smsm: apps@0 { + reg = <0>; + + #qcom,state-cells = <1>; + }; + + wcnss_smsm: wcnss@7 { + reg = <7>; + interrupts = <0 144 1>; + + interrupt-controller; + #interrupt-cells = <2>; + }; +}; diff --git a/Documentation/devicetree/bindings/soc/sunxi/sram.txt b/Documentation/devicetree/bindings/soc/sunxi/sram.txt deleted file mode 100644 index 067698112..000000000 --- a/Documentation/devicetree/bindings/soc/sunxi/sram.txt +++ /dev/null @@ -1,72 +0,0 @@ -Allwinnner SoC SRAM controllers ------------------------------------------------------ - -The SRAM controller found on most Allwinner devices is represented by -a regular node for the SRAM controller itself, with sub-nodes -reprensenting the SRAM handled by the SRAM controller. 
- -Controller Node ---------------- - -Required properties: -- compatible : "allwinner,sun4i-a10-sram-controller" -- reg : sram controller register offset + length - -SRAM nodes ----------- - -Each SRAM is described using the mmio-sram bindings documented in -Documentation/devicetree/bindings/misc/sram.txt - -Each SRAM will have SRAM sections that are going to be handled by the -SRAM controller as subnodes. These sections are represented following -once again the representation described in the mmio-sram binding. - -The valid sections compatible are: - - allwinner,sun4i-a10-sram-a3-a4 - - allwinner,sun4i-a10-sram-d - -Devices using SRAM sections ---------------------------- - -Some devices need to request to the SRAM controller to map an SRAM for -their exclusive use. - -The relationship between such a device and an SRAM section is -expressed through the allwinner,sram property, that will take a -phandle and an argument. - -This valid values for this argument are: - - 0: CPU - - 1: Device - -Example -------- -sram-controller@01c00000 { - compatible = "allwinner,sun4i-a10-sram-controller"; - reg = <0x01c00000 0x30>; - #address-cells = <1>; - #size-cells = <1>; - ranges; - - sram_a: sram@00000000 { - compatible = "mmio-sram"; - reg = <0x00000000 0xc000>; - #address-cells = <1>; - #size-cells = <1>; - ranges = <0 0x00000000 0xc000>; - - emac_sram: sram-section@8000 { - compatible = "allwinner,sun4i-a10-sram-a3-a4"; - reg = <0x8000 0x4000>; - status = "disabled"; - }; - }; -}; - -emac: ethernet@01c0b000 { - compatible = "allwinner,sun4i-a10-emac"; - ... 
- - allwinner,sram = <&emac_sram 1>; -}; diff --git a/Documentation/devicetree/bindings/soc/ti/wkup_m3_ipc.txt b/Documentation/devicetree/bindings/soc/ti/wkup_m3_ipc.txt new file mode 100644 index 000000000..401550487 --- /dev/null +++ b/Documentation/devicetree/bindings/soc/ti/wkup_m3_ipc.txt @@ -0,0 +1,57 @@ +Wakeup M3 IPC Driver +===================== + +The TI AM33xx and AM43xx family of devices use a small Cortex M3 co-processor +(commonly referred to as Wakeup M3 or CM3) to help with various low power tasks +that cannot be controlled from the MPU, like suspend/resume and certain deep +C-states for CPU Idle. Once the wkup_m3_ipc driver uses the wkup_m3_rproc driver +to boot the wkup_m3, it handles communication with the CM3 using IPC registers +present in the SoC's control module and a mailbox. The wkup_m3_ipc exposes an +API to allow the SoC PM code to execute specific PM tasks. + +Wkup M3 Device Node: +==================== +A wkup_m3_ipc device node is used to represent the IPC registers within an +SoC. + +Required properties: +-------------------- +- compatible: Should be, + "ti,am3352-wkup-m3-ipc" for AM33xx SoCs + "ti,am4372-wkup-m3-ipc" for AM43xx SoCs +- reg: Contains the IPC register address space to communicate + with the Wakeup M3 processor +- interrupts: Contains the interrupt information for the wkup_m3 + interrupt that signals the MPU. +- ti,rproc: phandle to the wkup_m3 rproc node so the IPC driver + can boot it. +- mboxes: phandles used by IPC framework to get correct mbox + channel for communication. Must point to appropriate + mbox_wkupm3 child node. + +Example: +-------- +/* AM33xx */ + l4_wkup: l4_wkup@44c00000 { + ... + + scm: scm@210000 { + compatible = "ti,am3-scm", "simple-bus"; + reg = <0x210000 0x2000>; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0 0x210000 0x2000>; + + ... 
+ + wkup_m3_ipc: wkup_m3_ipc@1324 { + compatible = "ti,am3352-wkup-m3-ipc"; + reg = <0x1324 0x24>; + interrupts = <78>; + ti,rproc = <&wkup_m3>; + mboxes = <&mailbox &mbox_wkupm3>; + }; + + ... + }; + }; diff --git a/Documentation/devicetree/bindings/sound/ak4613.txt b/Documentation/devicetree/bindings/sound/ak4613.txt index 15a919522..1783f9ef0 100644 --- a/Documentation/devicetree/bindings/sound/ak4613.txt +++ b/Documentation/devicetree/bindings/sound/ak4613.txt @@ -7,6 +7,16 @@ Required properties: - compatible : "asahi-kasei,ak4613" - reg : The chip select number on the I2C bus +Optional properties: +- asahi-kasei,in1-single-end : Boolean. Indicate input / output pins are single-ended. +- asahi-kasei,in2-single-end rather than differential. +- asahi-kasei,out1-single-end +- asahi-kasei,out2-single-end +- asahi-kasei,out3-single-end +- asahi-kasei,out4-single-end +- asahi-kasei,out5-single-end +- asahi-kasei,out6-single-end + Example: &i2c { diff --git a/Documentation/devicetree/bindings/sound/atmel-classd.txt b/Documentation/devicetree/bindings/sound/atmel-classd.txt index 0018451c4..549e701cb 100644 --- a/Documentation/devicetree/bindings/sound/atmel-classd.txt +++ b/Documentation/devicetree/bindings/sound/atmel-classd.txt @@ -16,6 +16,10 @@ Required properties: Required elements: "pclk", "gclk" and "aclk". - clocks Please refer to clock-bindings.txt. +- assigned-clocks + Should be <&classd_gclk>. +- assigned-clock-parents + Should be <&audio_pll_pmc>. 
Optional properties: - pinctrl-names, pinctrl-0 @@ -43,6 +47,8 @@ classd: classd@fc048000 { dma-names = "tx"; clocks = <&classd_clk>, <&classd_gclk>, <&audio_pll_pmc>; clock-names = "pclk", "gclk", "aclk"; + assigned-clocks = <&classd_gclk>; + assigned-clock-parents = <&audio_pll_pmc>; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_classd_default>; diff --git a/Documentation/devicetree/bindings/sound/atmel-pdmic.txt b/Documentation/devicetree/bindings/sound/atmel-pdmic.txt new file mode 100644 index 000000000..e0875f17c --- /dev/null +++ b/Documentation/devicetree/bindings/sound/atmel-pdmic.txt @@ -0,0 +1,55 @@ +* Atmel PDMIC driver under ALSA SoC architecture + +Required properties: +- compatible + Should be "atmel,sama5d2-pdmic". +- reg + Should contain PDMIC registers location and length. +- interrupts + Should contain the IRQ line for the PDMIC. +- dmas + One DMA specifier, as described in atmel-dma.txt and dma.txt files. +- dma-names + Must be "rx". +- clock-names + Required elements: + - "pclk" peripheral clock + - "gclk" generated clock +- clocks + Must contain an entry for each required entry in clock-names. + Please refer to clock-bindings.txt. +- atmel,mic-min-freq + The minimal frequency that the microphone supports. +- atmel,mic-max-freq + The maximal frequency that the microphone supports. + +Optional properties: +- pinctrl-names, pinctrl-0 + Please refer to pinctrl-bindings.txt. +- atmel,model + The user-visible name of this sound card. + The default value is "PDMIC". +- atmel,mic-offset + The offset that should be added. + The range is from -32768 to 32767. + The default value is 0. 
+ +Example: + pdmic@f8018000 { + compatible = "atmel,sama5d2-pdmic"; + reg = <0xf8018000 0x124>; + interrupts = <48 IRQ_TYPE_LEVEL_HIGH 7>; + dmas = <&dma0 + (AT91_XDMAC_DT_MEM_IF(0) | AT91_XDMAC_DT_PER_IF(1) + | AT91_XDMAC_DT_PERID(50))>; + dma-names = "rx"; + clocks = <&pdmic_clk>, <&pdmic_gclk>; + clock-names = "pclk", "gclk"; + + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_pdmic_default>; + atmel,model = "PDMIC @ sama5d2_xplained"; + atmel,mic-min-freq = <1000000>; + atmel,mic-max-freq = <3246000>; + atmel,mic-offset = <0x0>; + }; diff --git a/Documentation/devicetree/bindings/sound/da7218.txt b/Documentation/devicetree/bindings/sound/da7218.txt new file mode 100644 index 000000000..5ca5a709b --- /dev/null +++ b/Documentation/devicetree/bindings/sound/da7218.txt @@ -0,0 +1,104 @@ +Dialog Semiconductor DA7218 Audio Codec bindings + +DA7218 is an audio codec with HP detect feature. + +====== + +Required properties: +- compatible : Should be "dlg,da7217" or "dlg,da7218" +- reg: Specifies the I2C slave address + +- VDD-supply: VDD power supply for the device +- VDDMIC-supply: VDDMIC power supply for the device +- VDDIO-supply: VDDIO power supply for the device + (See Documentation/devicetree/bindings/regulator/regulator.txt for further + information relating to regulators) + +Optional properties: +- interrupt-parent: Specifies the phandle of the interrupt controller to which + the IRQs from DA7218 are delivered to. +- interrupts: IRQ line info for DA7218 chip. + (See Documentation/devicetree/bindings/interrupt-controller/interrupts.txt for + further information relating to interrupt properties) +- interrupt-names : Name associated with interrupt line. Should be "wakeup" if + interrupt is to be used to wake system, otherwise "irq" should be used. +- wakeup-source: Flag to indicate this device can wake system (suspend/resume). + +- clocks : phandle and clock specifier for codec MCLK. +- clock-names : Clock name string for 'clocks' attribute, should be "mclk". 
+ +- dlg,micbias1-lvl-millivolt : Voltage (mV) for Mic Bias 1 + [<1200>, <1600>, <1800>, <2000>, <2200>, <2400>, <2600>, <2800>, <3000>] +- dlg,micbias2-lvl-millivolt : Voltage (mV) for Mic Bias 2 + [<1200>, <1600>, <1800>, <2000>, <2200>, <2400>, <2600>, <2800>, <3000>] +- dlg,mic1-amp-in-sel : Mic1 input source type + ["diff", "se_p", "se_n"] +- dlg,mic2-amp-in-sel : Mic2 input source type + ["diff", "se_p", "se_n"] +- dlg,dmic1-data-sel : DMIC1 channel select based on clock edge. + ["lrise_rfall", "lfall_rrise"] +- dlg,dmic1-samplephase : When to sample audio from DMIC1. + ["on_clkedge", "between_clkedge"] +- dlg,dmic1-clkrate-hz : DMIC1 clock frequency (Hz). + [<1500000>, <3000000>] +- dlg,dmic2-data-sel : DMIC2 channel select based on clock edge. + ["lrise_rfall", "lfall_rrise"] +- dlg,dmic2-samplephase : When to sample audio from DMIC2. + ["on_clkedge", "between_clkedge"] +- dlg,dmic2-clkrate-hz : DMIC2 clock frequency (Hz). + [<1500000>, <3000000>] +- dlg,hp-diff-single-supply : Boolean flag, use single supply for HP + (DA7217 only) + +====== + +Optional Child node - 'da7218_hpldet' (DA7218 only): + +Optional properties: +- dlg,jack-rate-us : Time between jack detect measurements (us) + [<5>, <10>, <20>, <40>, <80>, <160>, <320>, <640>] +- dlg,jack-debounce : Number of debounce measurements taken for jack detect + [<0>, <2>, <3>, <4>] +- dlg,jack-threshold-pct : Threshold level for jack detection (% of VDD) + [<84>, <88>, <92>, <96>] +- dlg,comp-inv : Boolean flag, invert comparator output +- dlg,hyst : Boolean flag, enable hysteresis +- dlg,discharge : Boolean flag, auto discharge of Mic Bias on jack removal + +====== + +Example: + + codec: da7218@1a { + compatible = "dlg,da7218"; + reg = <0x1a>; + interrupt-parent = <&gpio6>; + interrupts = <11 IRQ_TYPE_LEVEL_HIGH>; + wakeup-source; + + VDD-supply = <&reg_audio>; + VDDMIC-supply = <&reg_audio>; + VDDIO-supply = <&reg_audio>; + + clocks = <&clks 201>; + clock-names = "mclk"; + + dlg,micbias1-lvl-millivolt = <2600>; 
+ dlg,micbias2-lvl-millivolt = <2600>; + dlg,mic1-amp-in-sel = "diff"; + dlg,mic2-amp-in-sel = "diff"; + + dlg,dmic1-data-sel = "lrise_rfall"; + dlg,dmic1-samplephase = "on_clkedge"; + dlg,dmic1-clkrate-hz = <3000000>; + dlg,dmic2-data-sel = "lrise_rfall"; + dlg,dmic2-samplephase = "on_clkedge"; + dlg,dmic2-clkrate-hz = <3000000>; + + da7218_hpldet { + dlg,jack-rate-us = <40>; + dlg,jack-debounce = <2>; + dlg,jack-threshold-pct = <84>; + dlg,hyst; + }; + }; diff --git a/Documentation/devicetree/bindings/sound/da7219.txt b/Documentation/devicetree/bindings/sound/da7219.txt index 1b7030911..cf6168182 100644 --- a/Documentation/devicetree/bindings/sound/da7219.txt +++ b/Documentation/devicetree/bindings/sound/da7219.txt @@ -28,13 +28,15 @@ Optional properties: - clocks : phandle and clock specifier for codec MCLK. - clock-names : Clock name string for 'clocks' attribute, should be "mclk". -- dlg,ldo-lvl : Required internal LDO voltage (mV) level for digital engine - [<1050>, <1100>, <1200>, <1400>] - dlg,micbias-lvl : Voltage (mV) for Mic Bias - [<1800>, <2000>, <2200>, <2400>, <2600>] + [<1600>, <1800>, <2000>, <2200>, <2400>, <2600>] - dlg,mic-amp-in-sel : Mic input source type ["diff", "se_p", "se_n"] +Deprecated properties: +- dlg,ldo-lvl : Required internal LDO voltage (mV) level for digital engine + (LDO unavailable in production HW so property no longer required). + ====== Child node - 'da7219_aad': diff --git a/Documentation/devicetree/bindings/sound/fsl,asrc.txt b/Documentation/devicetree/bindings/sound/fsl,asrc.txt index b93362a57..3e26a9478 100644 --- a/Documentation/devicetree/bindings/sound/fsl,asrc.txt +++ b/Documentation/devicetree/bindings/sound/fsl,asrc.txt @@ -25,6 +25,11 @@ Required properties: "mem" Peripheral access clock to access registers. "ipg" Peripheral clock to driver module. "asrck_<0-f>" Clock sources for input and output clock. 
+ "spba" The spba clock is required when ASRC is placed as a + bus slave of the Shared Peripheral Bus and when two + or more bus masters (CPU, DMA or DSP) try to access + it. This property is optional depending on the SoC + design. - big-endian : If this property is absent, the little endian mode will be in use as default. Otherwise, the big endian diff --git a/Documentation/devicetree/bindings/sound/fsl,esai.txt b/Documentation/devicetree/bindings/sound/fsl,esai.txt index d3b6b5f48..cd3ee5d84 100644 --- a/Documentation/devicetree/bindings/sound/fsl,esai.txt +++ b/Documentation/devicetree/bindings/sound/fsl,esai.txt @@ -27,6 +27,11 @@ Required properties: derive HCK, SCK and FS. "fsys" The system clock derived from ahb clock used to derive HCK, SCK and FS. + "spba" The spba clock is required when ESAI is placed as a + bus slave of the Shared Peripheral Bus and when two + or more bus masters (CPU, DMA or DSP) try to access + it. This property is optional depending on the SoC + design. - fsl,fifo-depth : The number of elements in the transmit and receive FIFOs. This number is the maximum allowed value for diff --git a/Documentation/devicetree/bindings/sound/fsl,spdif.txt b/Documentation/devicetree/bindings/sound/fsl,spdif.txt index b5ee32ee3..4ca39ddc0 100644 --- a/Documentation/devicetree/bindings/sound/fsl,spdif.txt +++ b/Documentation/devicetree/bindings/sound/fsl,spdif.txt @@ -27,6 +27,11 @@ Required properties: Transceiver Clock Diagram" of SoC reference manual. It can also be referred to TxClk_Source bit of register SPDIF_STC. + "spba" The spba clock is required when SPDIF is placed as a + bus slave of the Shared Peripheral Bus and when two + or more bus masters (CPU, DMA or DSP) try to access + it. This property is optional depending on the SoC + design. 
- big-endian : If this property is absent, the native endian mode will be in use as default, or the big endian mode diff --git a/Documentation/devicetree/bindings/sound/fsl-asoc-card.txt b/Documentation/devicetree/bindings/sound/fsl-asoc-card.txt index ce55c0a6f..4da41bf18 100644 --- a/Documentation/devicetree/bindings/sound/fsl-asoc-card.txt +++ b/Documentation/devicetree/bindings/sound/fsl-asoc-card.txt @@ -30,6 +30,8 @@ The compatible list for this generic sound card currently: "fsl,imx-audio-sgtl5000" (compatible with Documentation/devicetree/bindings/sound/imx-audio-sgtl5000.txt) + "fsl,imx-audio-wm8960" + Required properties: - compatible : Contains one of entries in the compatible list. diff --git a/Documentation/devicetree/bindings/sound/img,i2s-in.txt b/Documentation/devicetree/bindings/sound/img,i2s-in.txt new file mode 100644 index 000000000..423265cfc --- /dev/null +++ b/Documentation/devicetree/bindings/sound/img,i2s-in.txt @@ -0,0 +1,47 @@ +Imagination Technologies I2S Input Controller + +Required Properties: + + - compatible : Compatible list, must contain "img,i2s-in" + + - #sound-dai-cells : Must be equal to 0 + + - reg : Offset and length of the register set for the device + + - clocks : Contains an entry for each entry in clock-names + + - clock-names : Must include the following entry: + "sys" The system clock + + - dmas: Contains an entry for each entry in dma-names. + + - dma-names: Must include the following entry: + "rx" Single DMA channel used by all active I2S channels + + - img,i2s-channels : Number of I2S channels instantiated in the I2S in block + +Optional Properties: + + - interrupts : Contains the I2S in interrupts. Depending on + the configuration, there may be no interrupts, one interrupt, + or an interrupt per I2S channel. 
For the case where there is + one interrupt per channel, the interrupts should be listed + in ascending channel order + + - resets: Contains a phandle to the I2S in reset signal + + - reset-names: Contains the reset signal name "rst" + +Example: + +i2s_in: i2s-in@18100800 { + compatible = "img,i2s-in"; + reg = <0x18100800 0x200>; + interrupts = ; + dmas = <&mdc 30 0xffffffff 0>; + dma-names = "rx"; + clocks = <&cr_periph SYS_CLK_I2S_IN>; + clock-names = "sys"; + img,i2s-channels = <6>; + #sound-dai-cells = <0>; +}; diff --git a/Documentation/devicetree/bindings/sound/img,i2s-out.txt b/Documentation/devicetree/bindings/sound/img,i2s-out.txt new file mode 100644 index 000000000..0159415b3 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/img,i2s-out.txt @@ -0,0 +1,51 @@ +Imagination Technologies I2S Output Controller + +Required Properties: + + - compatible : Compatible list, must contain "img,i2s-out" + + - #sound-dai-cells : Must be equal to 0 + + - reg : Offset and length of the register set for the device + + - clocks : Contains an entry for each entry in clock-names + + - clock-names : Must include the following entries: + "sys" The system clock + "ref" The reference clock + + - dmas: Contains an entry for each entry in dma-names. + + - dma-names: Must include the following entry: + "tx" Single DMA channel used by all active I2S channels + + - img,i2s-channels : Number of I2S channels instantiated in the I2S out block + + - resets: Contains a phandle to the I2S out reset signal + + - reset-names: Contains the reset signal name "rst" + +Optional Properties: + + - interrupts : Contains the I2S out interrupts. Depending on + the configuration, there may be no interrupts, one interrupt, + or an interrupt per I2S channel. 
For the case where there is + one interrupt per channel, the interrupts should be listed + in ascending channel order + +Example: + +i2s_out: i2s-out@18100A00 { + compatible = "img,i2s-out"; + reg = <0x18100A00 0x200>; + interrupts = ; + dmas = <&mdc 23 0xffffffff 0>; + dma-names = "tx"; + clocks = <&cr_periph SYS_CLK_I2S_OUT>, + <&clk_core CLK_I2S>; + clock-names = "sys", "ref"; + img,i2s-channels = <6>; + resets = <&pistachio_reset PISTACHIO_RESET_I2S_OUT>; + reset-names = "rst"; + #sound-dai-cells = <0>; +}; diff --git a/Documentation/devicetree/bindings/sound/img,parallel-out.txt b/Documentation/devicetree/bindings/sound/img,parallel-out.txt new file mode 100644 index 000000000..a3015d2a0 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/img,parallel-out.txt @@ -0,0 +1,44 @@ +Imagination Technologies Parallel Output Controller + +Required Properties: + + - compatible : Compatible list, must contain "img,parallel-out". + + - #sound-dai-cells : Must be equal to 0 + + - reg : Offset and length of the register set for the device. + + - dmas: Contains an entry for each entry in dma-names. + + - dma-names: Must include the following entry: + "tx" + + - clocks : Contains an entry for each entry in clock-names. 
+ + - clock-names : Includes the following entries: + "sys" The system clock + "ref" The reference clock + + - resets: Contains a phandle to the parallel out reset signal + + - reset-names: Contains the reset signal name "rst" + +Optional Properties: + + - interrupts : Contains the parallel out interrupt, if present + +Example: + +parallel_out: parallel-out@18100C00 { + compatible = "img,parallel-out"; + reg = <0x18100C00 0x100>; + interrupts = ; + dmas = <&mdc 16 0xffffffff 0>; + dma-names = "tx"; + clocks = <&cr_periph SYS_CLK_PAUD_OUT>, + <&clk_core CLK_AUDIO_DAC>; + clock-names = "sys", "ref"; + resets = <&pistachio_reset PISTACHIO_RESET_PRL_OUT>; + reset-names = "rst"; + #sound-dai-cells = <0>; +}; diff --git a/Documentation/devicetree/bindings/sound/img,pistachio-internal-dac.txt b/Documentation/devicetree/bindings/sound/img,pistachio-internal-dac.txt new file mode 100644 index 000000000..4cc18fc04 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/img,pistachio-internal-dac.txt @@ -0,0 +1,18 @@ +Pistachio internal DAC DT bindings + +Required properties: + + - compatible: "img,pistachio-internal-dac" + + - img,cr-top : Must contain a phandle to the top level control syscon + node which contains the internal dac control registers + + - VDD-supply : Digital power supply regulator (+1.8V or +3.3V) + +Examples: + +internal_dac: internal-dac { + compatible = "img,pistachio-internal-dac"; + img,cr-top = <&cr_top>; + VDD-supply = <&supply3v3>; +}; diff --git a/Documentation/devicetree/bindings/sound/img,spdif-in.txt b/Documentation/devicetree/bindings/sound/img,spdif-in.txt new file mode 100644 index 000000000..aab9a81f7 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/img,spdif-in.txt @@ -0,0 +1,41 @@ +Imagination Technologies SPDIF Input Controller + +Required Properties: + + - compatible : Compatible list, must contain "img,spdif-in" + + - #sound-dai-cells : Must be equal to 0 + + - reg : Offset and length of the register set for the device + 
+ - dmas: Contains an entry for each entry in dma-names. + + - dma-names: Must include the following entry: + "rx" + + - clocks : Contains an entry for each entry in clock-names + + - clock-names : Includes the following entries: + "sys" The system clock + +Optional Properties: + + - resets: Should contain a phandle to the spdif in reset signal, if any + + - reset-names: Should contain the reset signal name "rst", if a + reset phandle is given + + - interrupts : Contains the spdif in interrupt, if present + +Example: + +spdif_in: spdif-in@18100E00 { + compatible = "img,spdif-in"; + reg = <0x18100E00 0x100>; + interrupts = ; + dmas = <&mdc 15 0xffffffff 0>; + dma-names = "rx"; + clocks = <&cr_periph SYS_CLK_SPDIF_IN>; + clock-names = "sys"; + #sound-dai-cells = <0>; +}; diff --git a/Documentation/devicetree/bindings/sound/img,spdif-out.txt b/Documentation/devicetree/bindings/sound/img,spdif-out.txt new file mode 100644 index 000000000..470a5191e --- /dev/null +++ b/Documentation/devicetree/bindings/sound/img,spdif-out.txt @@ -0,0 +1,44 @@ +Imagination Technologies SPDIF Output Controller + +Required Properties: + + - compatible : Compatible list, must contain "img,spdif-out" + + - #sound-dai-cells : Must be equal to 0 + + - reg : Offset and length of the register set for the device + + - dmas: Contains an entry for each entry in dma-names. + + - dma-names: Must include the following entry: + "tx" + + - clocks : Contains an entry for each entry in clock-names. 
+ + - clock-names : Includes the following entries: + "sys" The system clock + "ref" The reference clock + + - resets: Contains a phandle to the spdif out reset signal + + - reset-names: Contains the reset signal name "rst" + +Optional Properties: + + - interrupts : Contains the spdif out interrupt, if present + +Example: + +spdif_out: spdif-out@18100D00 { + compatible = "img,spdif-out"; + reg = <0x18100D00 0x100>; + interrupts = ; + dmas = <&mdc 14 0xffffffff 0>; + dma-names = "tx"; + clocks = <&cr_periph SYS_CLK_SPDIF_OUT>, + <&clk_core CLK_SPDIF>; + clock-names = "sys", "ref"; + resets = <&pistachio_reset PISTACHIO_RESET_SPDIF_OUT>; + reset-names = "rst"; + #sound-dai-cells = <0>; +}; diff --git a/Documentation/devicetree/bindings/sound/inno-rk3036.txt b/Documentation/devicetree/bindings/sound/inno-rk3036.txt new file mode 100644 index 000000000..758de8e27 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/inno-rk3036.txt @@ -0,0 +1,20 @@ +Inno audio codec for RK3036 + +Inno audio codec is integrated inside RK3036 SoC. + +Required properties: +- compatible : Should be "rockchip,rk3036-codec". +- reg : The registers of codec. +- clock-names : Should be "acodec_pclk". +- clocks : The clock of codec. +- rockchip,grf : The phandle of grf device node. + +Example: + + acodec: acodec-ana@20030000 { + compatible = "rockchip,rk3036-codec"; + reg = <0x20030000 0x4000>; + rockchip,grf = <&grf>; + clock-names = "acodec_pclk"; + clocks = <&cru ACLK_VCODEC>; + }; diff --git a/Documentation/devicetree/bindings/sound/pcm1792a.txt b/Documentation/devicetree/bindings/sound/pcm1792a.txt deleted file mode 100644 index 970ba1ed5..000000000 --- a/Documentation/devicetree/bindings/sound/pcm1792a.txt +++ /dev/null @@ -1,18 +0,0 @@ -Texas Instruments pcm1792a DT bindings - -This driver supports the SPI bus. 
- -Required properties: - - - compatible: "ti,pcm1792a" - -For required properties on SPI, please consult -Documentation/devicetree/bindings/spi/spi-bus.txt - -Examples: - - codec_spi: 1792a@0 { - compatible = "ti,pcm1792a"; - spi-max-frequency = <600000>; - }; - diff --git a/Documentation/devicetree/bindings/sound/pcm179x.txt b/Documentation/devicetree/bindings/sound/pcm179x.txt new file mode 100644 index 000000000..4ae70d346 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/pcm179x.txt @@ -0,0 +1,18 @@ +Texas Instruments pcm179x DT bindings + +This driver supports the SPI bus. + +Required properties: + + - compatible: "ti,pcm1792a" + +For required properties on SPI, please consult +Documentation/devicetree/bindings/spi/spi-bus.txt + +Examples: + + codec_spi: 1792a@0 { + compatible = "ti,pcm1792a"; + spi-max-frequency = <600000>; + }; + diff --git a/Documentation/devicetree/bindings/sound/renesas,rsnd.txt b/Documentation/devicetree/bindings/sound/renesas,rsnd.txt index c57cbd657..8ee0fa91e 100644 --- a/Documentation/devicetree/bindings/sound/renesas,rsnd.txt +++ b/Documentation/devicetree/bindings/sound/renesas,rsnd.txt @@ -7,8 +7,11 @@ Required properties: "renesas,rcar_sound-gen3" if generation3 Examples with soctypes are: - "renesas,rcar_sound-r8a7778" (R-Car M1A) + - "renesas,rcar_sound-r8a7779" (R-Car H1) - "renesas,rcar_sound-r8a7790" (R-Car H2) - "renesas,rcar_sound-r8a7791" (R-Car M2-W) + - "renesas,rcar_sound-r8a7793" (R-Car M2-N) + - "renesas,rcar_sound-r8a7794" (R-Car E2) - "renesas,rcar_sound-r8a7795" (R-Car H3) - reg : Should contain the register physical address. required register is @@ -34,6 +37,8 @@ Required properties: see below for detail. 
- #sound-dai-cells : it must be 0 if your system is using single DAI it must be 1 if your system is using multi DAI + +Optional properties: - #clock-cells : it must be 0 if your system has audio_clkout it must be 1 if your system has audio_clkout0/1/2/3 - clock-frequency : for all audio_clkout0/1/2/3 @@ -244,3 +249,80 @@ rcar_sound: sound@ec500000 { }; }; }; + +Example: simple sound card + + rsnd_ak4643: sound { + compatible = "simple-audio-card"; + + simple-audio-card,format = "left_j"; + simple-audio-card,bitclock-master = <&sndcodec>; + simple-audio-card,frame-master = <&sndcodec>; + + sndcpu: simple-audio-card,cpu { + sound-dai = <&rcar_sound>; + }; + + sndcodec: simple-audio-card,codec { + sound-dai = <&ak4643>; + clocks = <&audio_clock>; + }; + }; + +&rcar_sound { + pinctrl-0 = <&sound_pins &sound_clk_pins>; + pinctrl-names = "default"; + + /* Single DAI */ + #sound-dai-cells = <0>; + + status = "okay"; + + rcar_sound,dai { + dai0 { + playback = <&ssi0 &src2 &dvc0>; + capture = <&ssi1 &src3 &dvc1>; + }; + }; +}; + +&ssi1 { + shared-pin; +}; + +Example: simple sound card for TDM + + rsnd_tdm: sound { + compatible = "simple-audio-card"; + + simple-audio-card,format = "left_j"; + simple-audio-card,bitclock-master = <&sndcodec>; + simple-audio-card,frame-master = <&sndcodec>; + + sndcpu: simple-audio-card,cpu { + sound-dai = <&rcar_sound>; + dai-tdm-slot-num = <6>; + }; + + sndcodec: simple-audio-card,codec { + sound-dai = <&xxx>; + }; + }; + +Example: simple sound card for Multi channel + +&rcar_sound { + pinctrl-0 = <&sound_pins &sound_clk_pins>; + pinctrl-names = "default"; + + /* Single DAI */ + #sound-dai-cells = <0>; + + status = "okay"; + + rcar_sound,dai { + dai0 { + playback = <&ssi0 &ssi1 &ssi2 &src0 &dvc0>; + }; + }; +}; diff --git a/Documentation/devicetree/bindings/sound/renesas,rsrc-card.txt b/Documentation/devicetree/bindings/sound/renesas,rsrc-card.txt index 962748a8d..2b2caa281 100644 --- 
a/Documentation/devicetree/bindings/sound/renesas,rsrc-card.txt +++ b/Documentation/devicetree/bindings/sound/renesas,rsrc-card.txt @@ -4,8 +4,8 @@ Renesas Sampling Rate Convert Sound Card specifies audio DAI connections of SoC Required properties: -- compatible : "renesas,rsrc-card," - Examples with soctypes are: +- compatible : "renesas,rsrc-card{,}" + Examples with boards are: - "renesas,rsrc-card" - "renesas,rsrc-card,lager" - "renesas,rsrc-card,koelsch" diff --git a/Documentation/devicetree/bindings/sound/rockchip-i2s.txt b/Documentation/devicetree/bindings/sound/rockchip-i2s.txt index 2267d249c..b7f3a9325 100644 --- a/Documentation/devicetree/bindings/sound/rockchip-i2s.txt +++ b/Documentation/devicetree/bindings/sound/rockchip-i2s.txt @@ -19,6 +19,7 @@ Required properties: - clock-names: should contain followings: - "i2s_hclk": clock for I2S BUS - "i2s_clk" : clock for I2S controller +- rockchip,playback-channels: max playback channels, if not set, 8 channels default. - rockchip,capture-channels: max capture channels, if not set, 2 channels default. Example for rk3288 I2S controller: @@ -31,5 +32,6 @@ i2s@ff890000 { dma-names = "tx", "rx"; clock-names = "i2s_hclk", "i2s_clk"; clocks = <&cru HCLK_I2S0>, <&cru SCLK_I2S0>; + rockchip,playback-channels = <8>; rockchip,capture-channels = <2>; }; diff --git a/Documentation/devicetree/bindings/sound/rt5616.txt b/Documentation/devicetree/bindings/sound/rt5616.txt new file mode 100644 index 000000000..efc48c651 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/rt5616.txt @@ -0,0 +1,26 @@ +RT5616 audio CODEC + +This device supports I2C only. + +Required properties: + +- compatible : "realtek,rt5616". + +- reg : The I2C address of the device. 
+ +Pins on the device (for linking into audio routes) for RT5616: + + * IN1P + * IN2P + * IN2N + * LOUTL + * LOUTR + * HPOL + * HPOR + +Example: + +codec: rt5616@1b { + compatible = "realtek,rt5616"; + reg = <0x1b>; +}; diff --git a/Documentation/devicetree/bindings/sound/rt5651.txt b/Documentation/devicetree/bindings/sound/rt5651.txt new file mode 100644 index 000000000..387523309 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/rt5651.txt @@ -0,0 +1,41 @@ +RT5651 audio CODEC + +This device supports I2C only. + +Required properties: + +- compatible : "realtek,rt5651". + +- reg : The I2C address of the device. + +Optional properties: + +- realtek,in2-differential + Boolean. Indicates that the MIC2 input is differential, rather than single-ended. + +- realtek,dmic-en + Boolean. Present if the DMIC is used. + +Pins on the device (for linking into audio routes) for RT5651: + + * DMIC L1 + * DMIC R1 + * IN1P + * IN2P + * IN2N + * IN3P + * HPOL + * HPOR + * LOUTL + * LOUTR + * PDML + * PDMR + +Example: + +codec: rt5651@1a { + compatible = "realtek,rt5651"; + reg = <0x1a>; + realtek,dmic-en; + realtek,in2-differential; +}; diff --git a/Documentation/devicetree/bindings/sound/rt5659.txt b/Documentation/devicetree/bindings/sound/rt5659.txt new file mode 100644 index 000000000..5f79e7fde --- /dev/null +++ b/Documentation/devicetree/bindings/sound/rt5659.txt @@ -0,0 +1,75 @@ +RT5659/RT5658 audio CODEC + +This device supports I2C only. + +Required properties: + +- compatible : One of "realtek,rt5659" or "realtek,rt5658". + +- reg : The I2C address of the device. + +- interrupts : The CODEC's interrupt output. + +Optional properties: + +- realtek,in1-differential +- realtek,in3-differential +- realtek,in4-differential + Boolean. Indicates that the MIC1/3/4 inputs are differential, rather than single-ended. 
+ +- realtek,dmic1-data-pin + 0: dmic1 is not used + 1: using IN2N pin as dmic1 data pin + 2: using GPIO5 pin as dmic1 data pin + 3: using GPIO9 pin as dmic1 data pin + 4: using GPIO11 pin as dmic1 data pin + +- realtek,dmic2-data-pin + 0: dmic2 is not used + 1: using IN2P pin as dmic2 data pin + 2: using GPIO6 pin as dmic2 data pin + 3: using GPIO10 pin as dmic2 data pin + 4: using GPIO12 pin as dmic2 data pin + +- realtek,jd-src + 0: No JD is used + 1: using JD3 as JD source + +- realtek,ldo1-en-gpios : The GPIO that controls the CODEC's LDO1_EN pin. +- realtek,reset-gpios : The GPIO that controls the CODEC's RESET pin. + +Pins on the device (for linking into audio routes) for RT5659/RT5658: + + * DMIC L1 + * DMIC R1 + * DMIC L2 + * DMIC R2 + * IN1P + * IN1N + * IN2P + * IN2N + * IN3P + * IN3N + * IN4P + * IN4N + * HPOL + * HPOR + * SPOL + * SPOR + * LOUTL + * LOUTR + * MONOOUT + * PDML + * PDMR + * SPDIF + +Example: + +rt5659 { + compatible = "realtek,rt5659"; + reg = <0x1b>; + interrupt-parent = <&gpio>; + interrupts = ; + realtek,ldo1-en-gpios = + <&gpio TEGRA_GPIO(V, 3) GPIO_ACTIVE_HIGH>; +}; diff --git a/Documentation/devicetree/bindings/sound/rt5677.txt b/Documentation/devicetree/bindings/sound/rt5677.txt index f07078997..1b3c13d20 100644 --- a/Documentation/devicetree/bindings/sound/rt5677.txt +++ b/Documentation/devicetree/bindings/sound/rt5677.txt @@ -18,7 +18,7 @@ Required properties: Optional properties: - realtek,pow-ldo2-gpio : The GPIO that controls the CODEC's POW_LDO2 pin. -- realtek,reset-gpio : The GPIO that controls the CODEC's RESET pin. +- realtek,reset-gpio : The GPIO that controls the CODEC's RESET pin. Active low. 
- realtek,in1-differential - realtek,in2-differential diff --git a/Documentation/devicetree/bindings/sound/sun4i-codec.txt b/Documentation/devicetree/bindings/sound/sun4i-codec.txt index c92966bd5..0dce690f7 100644 --- a/Documentation/devicetree/bindings/sound/sun4i-codec.txt +++ b/Documentation/devicetree/bindings/sound/sun4i-codec.txt @@ -14,6 +14,9 @@ Required properties: - "apb": the parent APB clock for this controller - "codec": the parent module clock +Optional properties: +- allwinner,pa-gpios: gpio to enable external amplifier + Example: codec: codec@01c22c00 { #sound-dai-cells = <0>; diff --git a/Documentation/devicetree/bindings/sound/ti,pcm3168a.txt b/Documentation/devicetree/bindings/sound/ti,pcm3168a.txt new file mode 100644 index 000000000..5d9cb84c6 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/ti,pcm3168a.txt @@ -0,0 +1,48 @@ +Texas Instruments pcm3168a DT bindings + +This driver supports both SPI and I2C bus access for this codec + +Required properties: + + - compatible: "ti,pcm3168a" + + - clocks : Contains an entry for each entry in clock-names + + - clock-names : Includes the following entries: + "scki" The system clock + + - VDD1-supply : Digital power supply regulator 1 (+3.3V) + + - VDD2-supply : Digital power supply regulator 2 (+3.3V) + + - VCCAD1-supply : ADC power supply regulator 1 (+5V) + + - VCCAD2-supply : ADC power supply regulator 2 (+5V) + + - VCCDA1-supply : DAC power supply regulator 1 (+5V) + + - VCCDA2-supply : DAC power supply regulator 2 (+5V) + +For required properties on SPI/I2C, consult SPI/I2C device tree documentation + +Examples: + +i2c0: i2c0@0 { + + ... 
+ + pcm3168a: audio-codec@44 { + compatible = "ti,pcm3168a"; + reg = <0x44>; + clocks = <&clk_core CLK_AUDIO>; + clock-names = "scki"; + VDD1-supply = <&supply3v3>; + VDD2-supply = <&supply3v3>; + VCCAD1-supply = <&supply5v0>; + VCCAD2-supply = <&supply5v0>; + VCCDA1-supply = <&supply5v0>; + VCCDA2-supply = <&supply5v0>; + pinctrl-names = "default"; + pinctrl-0 = <&dac_clk_pin>; + }; +}; diff --git a/Documentation/devicetree/bindings/sound/wlf,wm8974.txt b/Documentation/devicetree/bindings/sound/wlf,wm8974.txt new file mode 100644 index 000000000..01d3a7c83 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/wlf,wm8974.txt @@ -0,0 +1,15 @@ +WM8974 audio CODEC + +This device supports both I2C and SPI (configured with pin strapping +on the board). + +Required properties: + - compatible: "wlf,wm8974" + - reg: the I2C address or SPI chip select number of the device + +Examples: + +codec: wm8974@1a { + compatible = "wlf,wm8974"; + reg = <0x1a>; +}; diff --git a/Documentation/devicetree/bindings/sound/wm8994.txt b/Documentation/devicetree/bindings/sound/wm8994.txt index e045e90a0..68c4e8d96 100644 --- a/Documentation/devicetree/bindings/sound/wm8994.txt +++ b/Documentation/devicetree/bindings/sound/wm8994.txt @@ -30,7 +30,7 @@ Optional properties: - #interrupt-cells: the number of cells to describe an IRQ, this should be 2. The first cell is the IRQ number. The second cell is the flags, encoded as the trigger masks from - Documentation/devicetree/bindings/interrupts.txt + Documentation/devicetree/bindings/interrupt-controller/interrupts.txt - clocks : A list of up to two phandle and clock specifier pairs - clock-names : A list of clock names sorted in the same order as clocks. 
diff --git a/Documentation/devicetree/bindings/spi/sh-msiof.txt b/Documentation/devicetree/bindings/spi/sh-msiof.txt index 705075da2..aa005c1d1 100644 --- a/Documentation/devicetree/bindings/spi/sh-msiof.txt +++ b/Documentation/devicetree/bindings/spi/sh-msiof.txt @@ -10,6 +10,7 @@ Required properties: "renesas,msiof-r8a7792" (R-Car V2H) "renesas,msiof-r8a7793" (R-Car M2-N) "renesas,msiof-r8a7794" (R-Car E2) + "renesas,msiof-sh73a0" (SH-Mobile AG5) - reg : A list of offsets and lengths of the register sets for the device. If only one register set is present, it is to be used diff --git a/Documentation/devicetree/bindings/spi/spi-mt65xx.txt b/Documentation/devicetree/bindings/spi/spi-mt65xx.txt index ce363c923..e43f4cf4c 100644 --- a/Documentation/devicetree/bindings/spi/spi-mt65xx.txt +++ b/Documentation/devicetree/bindings/spi/spi-mt65xx.txt @@ -2,9 +2,10 @@ Binding for MTK SPI controller Required properties: - compatible: should be one of the following. - - mediatek,mt8173-spi: for mt8173 platforms - - mediatek,mt8135-spi: for mt8135 platforms + - mediatek,mt2701-spi: for mt2701 platforms - mediatek,mt6589-spi: for mt6589 platforms + - mediatek,mt8135-spi: for mt8135 platforms + - mediatek,mt8173-spi: for mt8173 platforms - #address-cells: should be 1. @@ -29,10 +30,10 @@ Required properties: muxes clock, and "spi-clk" for the clock gate. Optional properties: --cs-gpios: see spi-bus.txt, only required for MT8173. +-cs-gpios: see spi-bus.txt. - mediatek,pad-select: specify which pins group(ck/mi/mo/cs) spi - controller used. This is a array, the element value should be 0~3, + controller used. This is an array, the element value should be 0~3, only required for MT8173. 0: specify GPIO69,70,71,72 for spi pins. 1: specify GPIO102,103,104,105 for spi pins. 
diff --git a/Documentation/devicetree/bindings/spi/ti_qspi.txt b/Documentation/devicetree/bindings/spi/ti_qspi.txt index 601a36053..cc8304aa6 100644 --- a/Documentation/devicetree/bindings/spi/ti_qspi.txt +++ b/Documentation/devicetree/bindings/spi/ti_qspi.txt @@ -15,14 +15,32 @@ Recommended properties: - spi-max-frequency: Definition as per Documentation/devicetree/bindings/spi/spi-bus.txt +Optional properties: +- syscon-chipselects: Handle to system control region that contains QSPI + chipselect register and offset of that register. + Example: +For am4372: qspi: qspi@4b300000 { - compatible = "ti,dra7xxx-qspi"; - reg = <0x47900000 0x100>, <0x30000000 0x3ffffff>; + compatible = "ti,am4372-qspi"; + reg = <0x47900000 0x100>, <0x30000000 0x4000000>; reg-names = "qspi_base", "qspi_mmap"; #address-cells = <1>; #size-cells = <0>; spi-max-frequency = <25000000>; ti,hwmods = "qspi"; }; + +For dra7xx: +qspi: qspi@4b300000 { + compatible = "ti,dra7xxx-qspi"; + reg = <0x4b300000 0x100>, + <0x5c000000 0x4000000>; + reg-names = "qspi_base", "qspi_mmap"; + syscon-chipselects = <&scm_conf 0x558>; + #address-cells = <1>; + #size-cells = <0>; + spi-max-frequency = <48000000>; + ti,hwmods = "qspi"; +}; diff --git a/Documentation/devicetree/bindings/sram/rockchip-pmu-sram.txt b/Documentation/devicetree/bindings/sram/rockchip-pmu-sram.txt new file mode 100644 index 000000000..6b42fda30 --- /dev/null +++ b/Documentation/devicetree/bindings/sram/rockchip-pmu-sram.txt @@ -0,0 +1,16 @@ +Rockchip SRAM for pmu: +------------------------------ + +The sram of pmu is used to store the function of resume from maskrom(the 1st +level loader). This is a common use of the "pmu-sram" because it keeps power +even in low power states in the system. 
+ +Required node properties: +- compatible : should be "rockchip,rk3288-pmu-sram" +- reg : physical base address and the size of the registers window + +Example: + sram@ff720000 { + compatible = "rockchip,rk3288-pmu-sram", "mmio-sram"; + reg = <0xff720000 0x1000>; + }; diff --git a/Documentation/devicetree/bindings/sram/rockchip-smp-sram.txt b/Documentation/devicetree/bindings/sram/rockchip-smp-sram.txt new file mode 100644 index 000000000..800701ecf --- /dev/null +++ b/Documentation/devicetree/bindings/sram/rockchip-smp-sram.txt @@ -0,0 +1,30 @@ +Rockchip SRAM for smp bringup: +------------------------------ + +Rockchip's smp-capable SoCs use the first part of the sram for the bringup +of the cores. Once the core gets powered up it executes the code that is +residing at the very beginning of the sram. + +Therefore a reserved section sub-node has to be added to the mmio-sram +declaration. + +Required sub-node properties: +- compatible : should be "rockchip,rk3066-smp-sram" + +The rest of the properties should follow the generic mmio-sram description +found in Documentation/devicetree/bindings/sram/sram.txt + +Example: + + sram: sram@10080000 { + compatible = "mmio-sram"; + reg = <0x10080000 0x10000>; + #address-cells = <1>; + #size-cells = <1>; + ranges; + + smp-sram@10080000 { + compatible = "rockchip,rk3066-smp-sram"; + reg = <0x10080000 0x50>; + }; + }; diff --git a/Documentation/devicetree/bindings/sram/samsung-sram.txt b/Documentation/devicetree/bindings/sram/samsung-sram.txt new file mode 100644 index 000000000..6bc474b2b --- /dev/null +++ b/Documentation/devicetree/bindings/sram/samsung-sram.txt @@ -0,0 +1,38 @@ +Samsung Exynos SYSRAM for SMP bringup: +------------------------------------ + +Samsung SMP-capable Exynos SoCs use part of the SYSRAM for the bringup +of the secondary cores. Once the core gets powered up it executes the +code that is residing at some specific location of the SYSRAM. 
+ +Therefore reserved section sub-nodes have to be added to the mmio-sram +declaration. These nodes are of two types depending upon secure or +non-secure execution environment. + +Required sub-node properties: +- compatible : depending upon boot mode, should be + "samsung,exynos4210-sysram" : for Secure SYSRAM + "samsung,exynos4210-sysram-ns" : for Non-secure SYSRAM + +The rest of the properties should follow the generic mmio-sram description +found in Documentation/devicetree/bindings/sram/sram.txt + +Example: + + sysram@02020000 { + compatible = "mmio-sram"; + reg = <0x02020000 0x54000>; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0 0x02020000 0x54000>; + + smp-sysram@0 { + compatible = "samsung,exynos4210-sysram"; + reg = <0x0 0x1000>; + }; + + smp-sysram@53000 { + compatible = "samsung,exynos4210-sysram-ns"; + reg = <0x53000 0x1000>; + }; + }; diff --git a/Documentation/devicetree/bindings/sram/sram.txt b/Documentation/devicetree/bindings/sram/sram.txt new file mode 100644 index 000000000..42ee9438b --- /dev/null +++ b/Documentation/devicetree/bindings/sram/sram.txt @@ -0,0 +1,67 @@ +Generic on-chip SRAM + +Simple IO memory regions to be managed by the genalloc API. + +Required properties: + +- compatible : mmio-sram + +- reg : SRAM iomem address range + +Reserving sram areas: +--------------------- + +Each child of the sram node specifies a region of reserved memory. Each +child node should use a 'reg' property to specify a specific range of +reserved memory. + +Following the generic-names recommended practice, node names should +reflect the purpose of the node. Unit address (@<address>
) should be +appended to the name. + +Required properties in the sram node: + +- #address-cells, #size-cells : should use the same values as the root node +- ranges : standard definition, should translate from local addresses + within the sram to bus addresses + +Required properties in the area nodes: + +- reg : iomem address range, relative to the SRAM range + +Optional properties in the area nodes: + +- compatible : standard definition, should contain a vendor specific string + in the form <vendor>,[<device>-]<usage> +- pool : indicates that the particular reserved SRAM area is addressable + and in use by another device or devices +- export : indicates that the reserved SRAM area may be accessed outside + of the kernel, e.g. by bootloader or userspace +- label : the name for the reserved partition, if omitted, the label + is taken from the node name excluding the unit address. + +Example: + +sram: sram@5c000000 { + compatible = "mmio-sram"; + reg = <0x5c000000 0x40000>; /* 256 KiB SRAM at address 0x5c000000 */ + + #address-cells = <1>; + #size-cells = <1>; + ranges = <0 0x5c000000 0x40000>; + + smp-sram@100 { + compatible = "socvendor,smp-sram"; + reg = <0x100 0x50>; + }; + + device-sram@1000 { + reg = <0x1000 0x1000>; + pool; + }; + + exported@20000 { + reg = <0x20000 0x20000>; + export; + }; +}; diff --git a/Documentation/devicetree/bindings/sram/sunxi-sram.txt b/Documentation/devicetree/bindings/sram/sunxi-sram.txt new file mode 100644 index 000000000..8d5665468 --- /dev/null +++ b/Documentation/devicetree/bindings/sram/sunxi-sram.txt @@ -0,0 +1,72 @@ +Allwinner SoC SRAM controllers +----------------------------------------------------- + +The SRAM controller found on most Allwinner devices is represented by +a regular node for the SRAM controller itself, with sub-nodes +representing the SRAM handled by the SRAM controller. 
+ +Controller Node +--------------- + +Required properties: +- compatible : "allwinner,sun4i-a10-sram-controller" +- reg : sram controller register offset + length + +SRAM nodes +---------- + +Each SRAM is described using the mmio-sram bindings documented in +Documentation/devicetree/bindings/sram/sram.txt + +Each SRAM will have SRAM sections that are going to be handled by the +SRAM controller as subnodes. These sections are represented following +once again the representation described in the mmio-sram binding. + +The valid section compatibles are: + - allwinner,sun4i-a10-sram-a3-a4 + - allwinner,sun4i-a10-sram-d + +Devices using SRAM sections +--------------------------- + +Some devices need to request to the SRAM controller to map an SRAM for +their exclusive use. + +The relationship between such a device and an SRAM section is +expressed through the allwinner,sram property, that will take a +phandle and an argument. + +The valid values for this argument are: + - 0: CPU + - 1: Device + +Example +------- +sram-controller@01c00000 { + compatible = "allwinner,sun4i-a10-sram-controller"; + reg = <0x01c00000 0x30>; + #address-cells = <1>; + #size-cells = <1>; + ranges; + + sram_a: sram@00000000 { + compatible = "mmio-sram"; + reg = <0x00000000 0xc000>; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0 0x00000000 0xc000>; + + emac_sram: sram-section@8000 { + compatible = "allwinner,sun4i-a10-sram-a3-a4"; + reg = <0x8000 0x4000>; + status = "disabled"; + }; + }; +}; + +emac: ethernet@01c0b000 { + compatible = "allwinner,sun4i-a10-emac"; + ... 
+ + allwinner,sram = <&emac_sram 1>; +}; diff --git a/Documentation/devicetree/bindings/staging/ion/hi6220-ion.txt b/Documentation/devicetree/bindings/staging/ion/hi6220-ion.txt new file mode 100644 index 000000000..c59e27c63 --- /dev/null +++ b/Documentation/devicetree/bindings/staging/ion/hi6220-ion.txt @@ -0,0 +1,31 @@ +Hi6220 SoC ION +=================================================================== +Required properties: +- compatible : "hisilicon,hi6220-ion" +- list of the ION heaps + - heap name : maybe heap_sys_user@0 + - heap id : id should be unique in the system. + - heap base : base ddr address of the heap,0 means that + it is dynamic. + - heap size : memory size and 0 means it is dynamic. + - heap type : the heap type of the heap, please also + see the define in ion.h(drivers/staging/android/uapi/ion.h) +------------------------------------------------------------------- +Example: + hi6220-ion { + compatible = "hisilicon,hi6220-ion"; + heap_sys_user@0 { + heap-name = "sys_user"; + heap-id = <0x0>; + heap-base = <0x0>; + heap-size = <0x0>; + heap-type = "ion_system"; + }; + heap_sys_contig@0 { + heap-name = "sys_contig"; + heap-id = <0x1>; + heap-base = <0x0>; + heap-size = <0x0>; + heap-type = "ion_system_contig"; + }; + }; diff --git a/Documentation/devicetree/bindings/thermal/qoriq-thermal.txt b/Documentation/devicetree/bindings/thermal/qoriq-thermal.txt new file mode 100644 index 000000000..66223d561 --- /dev/null +++ b/Documentation/devicetree/bindings/thermal/qoriq-thermal.txt @@ -0,0 +1,63 @@ +* Thermal Monitoring Unit (TMU) on Freescale QorIQ SoCs + +Required properties: +- compatible : Must include "fsl,qoriq-tmu". The version of the device is + determined by the TMU IP Block Revision Register (IPBRR0) at + offset 0x0BF8. + Table of correspondences between IPBRR0 values and example chips: + Value Device + ---------- ----- + 0x01900102 T1040 +- reg : Address range of TMU registers. +- interrupts : Contains the interrupt for TMU. 
+- fsl,tmu-range : The values to be programmed into TTRnCR, as specified by + the SoC reference manual. The first cell is TTR0CR, the second is + TTR1CR, etc. +- fsl,tmu-calibration : A list of cell pairs containing temperature + calibration data, as specified by the SoC reference manual. + The first cell of each pair is the value to be written to TTCFGR, + and the second is the value to be written to TSCFGR. + +Example: + +tmu@f0000 { + compatible = "fsl,qoriq-tmu"; + reg = <0xf0000 0x1000>; + interrupts = <18 2 0 0>; + fsl,tmu-range = <0x000a0000 0x00090026 0x0008004a 0x0001006a>; + fsl,tmu-calibration = <0x00000000 0x00000025 + 0x00000001 0x00000028 + 0x00000002 0x0000002d + 0x00000003 0x00000031 + 0x00000004 0x00000036 + 0x00000005 0x0000003a + 0x00000006 0x00000040 + 0x00000007 0x00000044 + 0x00000008 0x0000004a + 0x00000009 0x0000004f + 0x0000000a 0x00000054 + + 0x00010000 0x0000000d + 0x00010001 0x00000013 + 0x00010002 0x00000019 + 0x00010003 0x0000001f + 0x00010004 0x00000025 + 0x00010005 0x0000002d + 0x00010006 0x00000033 + 0x00010007 0x00000043 + 0x00010008 0x0000004b + 0x00010009 0x00000053 + + 0x00020000 0x00000010 + 0x00020001 0x00000017 + 0x00020002 0x0000001f + 0x00020003 0x00000029 + 0x00020004 0x00000031 + 0x00020005 0x0000003c + 0x00020006 0x00000042 + 0x00020007 0x0000004d + 0x00020008 0x00000056 + + 0x00030000 0x00000012 + 0x00030001 0x0000001d>; +}; diff --git a/Documentation/devicetree/bindings/thermal/rcar-thermal.txt b/Documentation/devicetree/bindings/thermal/rcar-thermal.txt index 332e625f6..e5ee3f159 100644 --- a/Documentation/devicetree/bindings/thermal/rcar-thermal.txt +++ b/Documentation/devicetree/bindings/thermal/rcar-thermal.txt @@ -1,8 +1,9 @@ * Renesas R-Car Thermal Required properties: -- compatible : "renesas,thermal-", "renesas,rcar-thermal" - as fallback. +- compatible : "renesas,thermal-", + "renesas,rcar-gen2-thermal" (with thermal-zone) or + "renesas,rcar-thermal" (without thermal-zone) as fallback. 
Examples with soctypes are: - "renesas,thermal-r8a73a4" (R-Mobile APE6) - "renesas,thermal-r8a7779" (R-Car H1) @@ -36,3 +37,35 @@ thermal@e61f0000 { 0xe61f0300 0x38>; interrupts = <0 69 IRQ_TYPE_LEVEL_HIGH>; }; + +Example (with thermal-zone): + +thermal-zones { + cpu_thermal: cpu-thermal { + polling-delay-passive = <1000>; + polling-delay = <5000>; + + thermal-sensors = <&thermal>; + + trips { + cpu-crit { + temperature = <115000>; + hysteresis = <0>; + type = "critical"; + }; + }; + cooling-maps { + }; + }; +}; + +thermal: thermal@e61f0000 { + compatible = "renesas,thermal-r8a7790", + "renesas,rcar-gen2-thermal", + "renesas,rcar-thermal"; + reg = <0 0xe61f0000 0 0x14>, <0 0xe61f0100 0 0x38>; + interrupts = <0 69 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&mstp5_clks R8A7790_CLK_THERMAL>; + power-domains = <&cpg_clocks>; + #thermal-sensor-cells = <0>; +}; diff --git a/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt b/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt index 0dfa60d88..08efe6bc2 100644 --- a/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt +++ b/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt @@ -2,8 +2,10 @@ Required properties: - compatible : should be "rockchip,<name>-tsadc" + "rockchip,rk3228-tsadc": found on RK3228 SoCs "rockchip,rk3288-tsadc": found on RK3288 SoCs "rockchip,rk3368-tsadc": found on RK3368 SoCs + "rockchip,rk3399-tsadc": found on RK3399 SoCs - reg : physical base address of the controller and length of memory mapped region. - interrupts : The interrupt number to the cpu. 
The interrupt specifier format diff --git a/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt b/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt index 64083bc56..8ff54eb46 100644 --- a/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt +++ b/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt @@ -3,6 +3,7 @@ Mediatek MT6577, MT6572 and MT6589 Timers Required properties: - compatible should contain: + * "mediatek,mt2701-timer" for MT2701 compatible timers * "mediatek,mt6580-timer" for MT6580 compatible timers * "mediatek,mt6589-timer" for MT6589 compatible timers * "mediatek,mt8127-timer" for MT8127 compatible timers diff --git a/Documentation/devicetree/bindings/usb/dwc2.txt b/Documentation/devicetree/bindings/usb/dwc2.txt index fd132cbee..221368207 100644 --- a/Documentation/devicetree/bindings/usb/dwc2.txt +++ b/Documentation/devicetree/bindings/usb/dwc2.txt @@ -4,6 +4,7 @@ Platform DesignWare HS OTG USB 2.0 controller Required properties: - compatible : One of: - brcm,bcm2835-usb: The DWC2 USB controller instance in the BCM2835 SoC. + - hisilicon,hi6220-usb: The DWC2 USB controller instance in the hi6220 SoC. 
- rockchip,rk3066-usb: The DWC2 USB controller instance in the rk3066 Soc; - "rockchip,rk3188-usb", "rockchip,rk3066-usb", "snps,dwc2": for rk3188 Soc; - "rockchip,rk3288-usb", "rockchip,rk3066-usb", "snps,dwc2": for rk3288 Soc; diff --git a/Documentation/devicetree/bindings/usb/dwc3-xilinx.txt b/Documentation/devicetree/bindings/usb/dwc3-xilinx.txt new file mode 100644 index 000000000..30361b32a --- /dev/null +++ b/Documentation/devicetree/bindings/usb/dwc3-xilinx.txt @@ -0,0 +1,33 @@ +Xilinx SuperSpeed DWC3 USB SoC controller + +Required properties: +- compatible: Should contain "xlnx,zynqmp-dwc3" +- clocks: A list of phandles for the clocks listed in clock-names +- clock-names: Should contain the following: + "bus_clk" Master/Core clock, have to be >= 125 MHz for SS + operation and >= 60MHz for HS operation + + "ref_clk" Clock source to core during PHY power down + +Required child node: +A child node must exist to represent the core DWC3 IP block. The name of +the node is not important. The content of the node is defined in dwc3.txt. 
+ +Example device node: + + usb@0 { + #address-cells = <0x2>; + #size-cells = <0x1>; + status = "okay"; + compatible = "xlnx,zynqmp-dwc3"; + clock-names = "bus_clk", "ref_clk"; + clocks = <&clk125>, <&clk125>; + ranges; + + dwc3@fe200000 { + compatible = "snps,dwc3"; + reg = <0x0 0xfe200000 0x40000>; + interrupts = <0x0 0x41 0x4>; + dr_mode = "host"; + }; + }; diff --git a/Documentation/devicetree/bindings/usb/mt8173-xhci.txt b/Documentation/devicetree/bindings/usb/mt8173-xhci.txt new file mode 100644 index 000000000..b3a7ffa48 --- /dev/null +++ b/Documentation/devicetree/bindings/usb/mt8173-xhci.txt @@ -0,0 +1,51 @@ +MT8173 xHCI + +The device node for Mediatek SOC USB3.0 host controller + +Required properties: + - compatible : should contain "mediatek,mt8173-xhci" + - reg : specifies physical base address and size of the registers, + the first one for MAC, the second for IPPC + - interrupts : interrupt used by the controller + - power-domains : a phandle to USB power domain node to control USB's + mtcmos + - vusb33-supply : regulator of USB avdd3.3v + + - clocks : a list of phandle + clock-specifier pairs, one for each + entry in clock-names + - clock-names : must contain + "sys_ck": for clock of xHCI MAC + "wakeup_deb_p0": for USB wakeup debounce clock of port0 + "wakeup_deb_p1": for USB wakeup debounce clock of port1 + + - phys : a list of phandle + phy specifier pairs + +Optional properties: + - mediatek,wakeup-src : 1: ip sleep wakeup mode; 2: line state wakeup + mode; + - mediatek,syscon-wakeup : phandle to syscon used to access USB wakeup + control register, it depends on "mediatek,wakeup-src". 
+ - vbus-supply : reference to the VBUS regulator; + - usb3-lpm-capable : supports USB3.0 LPM + +Example: +usb30: usb@11270000 { + compatible = "mediatek,mt8173-xhci"; + reg = <0 0x11270000 0 0x1000>, + <0 0x11280700 0 0x0100>; + interrupts = <GIC_SPI 115 IRQ_TYPE_LEVEL_LOW>; + power-domains = <&scpsys MT8173_POWER_DOMAIN_USB>; + clocks = <&topckgen CLK_TOP_USB30_SEL>, + <&pericfg CLK_PERI_USB0>, + <&pericfg CLK_PERI_USB1>; + clock-names = "sys_ck", + "wakeup_deb_p0", + "wakeup_deb_p1"; + phys = <&phy_port0 PHY_TYPE_USB3>, + <&phy_port1 PHY_TYPE_USB2>; + vusb33-supply = <&mt6397_vusb_reg>; + vbus-supply = <&usb_p1_vbus>; + usb3-lpm-capable; + mediatek,syscon-wakeup = <&pericfg>; + mediatek,wakeup-src = <1>; +}; diff --git a/Documentation/devicetree/bindings/usb/octeon-usb.txt b/Documentation/devicetree/bindings/usb/octeon-usb.txt new file mode 100644 index 000000000..205c8d24d --- /dev/null +++ b/Documentation/devicetree/bindings/usb/octeon-usb.txt @@ -0,0 +1,62 @@ +OCTEON/OCTEON+ USB BLOCK + +1) Main node + + Required properties: + + - compatible: must be "cavium,octeon-5750-usbn" + + - reg: specifies the physical base address of the USBN block and + the length of the memory mapped region. + + - #address-cells: specifies the number of cells needed to encode an + address. The value must be 2. + + - #size-cells: specifies the number of cells used to represent the size + of an address. The value must be 2. + + - ranges: specifies the translation between child address space and parent + address space. + + - clock-frequency: speed of the USB reference clock. Allowed values are + 12000000, 24000000 or 48000000. + + - cavium,refclk-type: type of the USB reference clock. Allowed values are + "crystal" or "external". + + - refclk-frequency: deprecated, use "clock-frequency". + + - refclk-type: deprecated, use "cavium,refclk-type". + +2) Child node + + The main node must have one child node which describes the built-in + USB controller. 
+ + Required properties: + + - compatible: must be "cavium,octeon-5750-usbc" + + - reg: specifies the physical base address of the USBC block and + the length of the memory mapped region. + + - interrupts: specifies the interrupt number for the USB controller. + +3) Example: + + usbn: usbn@1180068000000 { + compatible = "cavium,octeon-5750-usbn"; + reg = <0x11800 0x68000000 0x0 0x1000>; + ranges; /* Direct mapping */ + #address-cells = <2>; + #size-cells = <2>; + clock-frequency = <12000000>; + cavium,refclk-type = "crystal"; + + usbc@16f0010000000 { + compatible = "cavium,octeon-5750-usbc"; + reg = <0x16f00 0x10000000 0x0 0x80000>; + interrupts = <0 56>; + }; + }; + diff --git a/Documentation/devicetree/bindings/usb/renesas_usb3.txt b/Documentation/devicetree/bindings/usb/renesas_usb3.txt new file mode 100644 index 000000000..8d52766f0 --- /dev/null +++ b/Documentation/devicetree/bindings/usb/renesas_usb3.txt @@ -0,0 +1,23 @@ +Renesas Electronics USB3.0 Peripheral driver + +Required properties: + - compatible: Must contain one of the following: + - "renesas,r8a7795-usb3-peri" + - reg: Base address and length of the register for the USB3.0 Peripheral + - interrupts: Interrupt specifier for the USB3.0 Peripheral + - clocks: clock phandle and specifier pair + +Example: + usb3_peri0: usb@ee020000 { + compatible = "renesas,r8a7795-usb3-peri"; + reg = <0 0xee020000 0 0x400>; + interrupts = ; + clocks = <&cpg CPG_MOD 328>; + }; + + usb3_peri1: usb@ee060000 { + compatible = "renesas,r8a7795-usb3-peri"; + reg = <0 0xee060000 0 0x400>; + interrupts = ; + clocks = <&cpg CPG_MOD 327>; + }; diff --git a/Documentation/devicetree/bindings/usb/renesas_usbhs.txt b/Documentation/devicetree/bindings/usb/renesas_usbhs.txt index 7d48f63db..b6040563e 100644 --- a/Documentation/devicetree/bindings/usb/renesas_usbhs.txt +++ b/Documentation/devicetree/bindings/usb/renesas_usbhs.txt @@ -1,11 +1,21 @@ Renesas Electronics USBHS driver Required properties: - - compatible: Must contain one of 
the following: - - "renesas,usbhs-r8a7790" - - "renesas,usbhs-r8a7791" - - "renesas,usbhs-r8a7794" - - "renesas,usbhs-r8a7795" + - compatible: Must contain one or more of the following: + + - "renesas,usbhs-r8a7790" for r8a7790 (R-Car H2) compatible device + - "renesas,usbhs-r8a7791" for r8a7791 (R-Car M2-W) compatible device + - "renesas,usbhs-r8a7792" for r8a7792 (R-Car V2H) compatible device + - "renesas,usbhs-r8a7793" for r8a7793 (R-Car M2-N) compatible device + - "renesas,usbhs-r8a7794" for r8a7794 (R-Car E2) compatible device + - "renesas,usbhs-r8a7795" for r8a7795 (R-Car H3) compatible device + - "renesas,rcar-gen2-usbhs" for R-Car Gen2 compatible device + - "renesas,rcar-gen3-usbhs" for R-Car Gen3 compatible device + + When compatible with the generic version, nodes must list the + SoC-specific version corresponding to the platform first followed + by the generic version. + - reg: Base address and length of the register for the USBHS - interrupts: Interrupt specifier for the USBHS - clocks: A list of phandle + clock specifier pairs @@ -22,7 +32,7 @@ Optional properties: Example: usbhs: usb@e6590000 { - compatible = "renesas,usbhs-r8a7790"; + compatible = "renesas,usbhs-r8a7790", "renesas,rcar-gen2-usbhs"; reg = <0 0xe6590000 0 0x100>; interrupts = <0 107 IRQ_TYPE_LEVEL_HIGH>; clocks = <&mstp7_clks R8A7790_CLK_HSUSB>; diff --git a/Documentation/devicetree/bindings/usb/usb-xhci.txt b/Documentation/devicetree/bindings/usb/usb-xhci.txt index 86f67f088..082573289 100644 --- a/Documentation/devicetree/bindings/usb/usb-xhci.txt +++ b/Documentation/devicetree/bindings/usb/usb-xhci.txt @@ -3,8 +3,8 @@ USB xHCI controllers Required properties: - compatible: should be one of "generic-xhci", "marvell,armada-375-xhci", "marvell,armada-380-xhci", - "renesas,xhci-r8a7790", "renesas,xhci-r8a7791" (deprecated: - "xhci-platform"). + "renesas,xhci-r8a7790", "renesas,xhci-r8a7791", "renesas,xhci-r8a7793", + "renesas,xhci-r8a7795" (deprecated: "xhci-platform"). 
- reg: should contain address and length of the standard XHCI register set for the device. - interrupts: one XHCI interrupt should be described here. diff --git a/Documentation/devicetree/bindings/usb/usb3503.txt b/Documentation/devicetree/bindings/usb/usb3503.txt index 52493b148..c1a0a9191 100644 --- a/Documentation/devicetree/bindings/usb/usb3503.txt +++ b/Documentation/devicetree/bindings/usb/usb3503.txt @@ -18,7 +18,8 @@ Optional properties: - refclk: Clock used for driving REFCLK signal (optional, if not provided the driver assumes that clock signal is always available, its rate is specified by REF_SEL pins and a value from the primary - reference clock frequencies table is used) + reference clock frequencies table is used). Use clocks and + clock-names in order to assign it - refclk-frequency: Frequency of the REFCLK signal as defined by REF_SEL pins (optional, if not provided, driver will not set rate of the REFCLK signal and assume that a value from the primary reference @@ -33,4 +34,6 @@ Examples: intn-gpios = <&gpx3 4 1>; reset-gpios = <&gpx3 5 1>; initial-mode = <1>; + clocks = <&clks 80>; + clock-names = "refclk"; }; diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt index 55df1d444..72e2c5a2b 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.txt +++ b/Documentation/devicetree/bindings/vendor-prefixes.txt @@ -33,6 +33,7 @@ auo AU Optronics Corporation avago Avago Technologies avic Shanghai AVIC Optoelectronics Co., Ltd. axis Axis Communications AB +boe BOE Technology Group Co., Ltd. bosch Bosch Sensortec GmbH boundary Boundary Devices Inc. brcm Broadcom Corporation @@ -123,6 +124,8 @@ jedec JEDEC Solid State Technology Association karo Ka-Ro electronics GmbH keymile Keymile GmbH kinetic Kinetic Technologies +kosagi Sutajio Ko-Usagi PTE Ltd. +kyo Kyocera Corporation lacie LaCie lantiq Lantiq Semiconductor lenovo Lenovo Group Ltd. 
@@ -161,6 +164,7 @@ nuvoton Nuvoton Technology Corporation nvidia NVIDIA nxp NXP Semiconductors okaya Okaya Electric America, Inc. +olimex OLIMEX Ltd. onnn ON Semiconductor Corp. opencores OpenCores.org option Option NV @@ -180,6 +184,7 @@ qca Qualcomm Atheros, Inc. qcom Qualcomm Technologies, Inc qemu QEMU, a generic and open source machine emulator and virtualizer qi Qi Hardware +qiaodian QiaoDian XianShi Corporation qnap QNAP Systems, Inc. radxa Radxa raidsonic RaidSonic Technology GmbH @@ -218,11 +223,13 @@ sony Sony Corporation spansion Spansion Inc. sprd Spreadtrum Communications Inc. st STMicroelectronics +startek Startek ste ST-Ericsson stericsson ST-Ericsson synology Synology, Inc. tbs TBS Technologies tcl Toby Churchill Ltd. +technologic Technologic Systems thine THine Electronics, Inc. ti Texas Instruments tlm Trusted Logic Mobility @@ -238,6 +245,7 @@ v3 V3 Semiconductor variscite Variscite Ltd. via VIA Technologies, Inc. virtio Virtual I/O Device Specification, developed by the OASIS consortium +vivante Vivante Corporation voipac Voipac Technologies s.r.o. wexler Wexler winbond Winbond Electronics corp. diff --git a/Documentation/devicetree/bindings/watchdog/alphascale-asm9260.txt b/Documentation/devicetree/bindings/watchdog/alphascale-asm9260.txt new file mode 100644 index 000000000..75b265a04 --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/alphascale-asm9260.txt @@ -0,0 +1,35 @@ +Alphascale asm9260 Watchdog timer + +Required properties: + +- compatible : should be "alphascale,asm9260-wdt". +- reg : Specifies base physical address and size of the registers. +- clocks : the clocks feeding the watchdog timer. See clock-bindings.txt +- clock-names : should be set to + "mod" - source for tick counter. + "ahb" - ahb gate. +- resets : phandle pointing to the system reset controller with + line index for the watchdog. +- reset-names : should be set to "wdt_rst". 
+ +Optional properties: +- timeout-sec : shall contain the default watchdog timeout in seconds, + if unset, the default timeout is 30 seconds. +- alphascale,mode : three modes are supported + "hw" - hw reset (default). + "sw" - sw reset. + "debug" - no action is taken. + +Example: + +watchdog0: watchdog@80048000 { + compatible = "alphascale,asm9260-wdt"; + reg = <0x80048000 0x10>; + clocks = <&acc CLKID_SYS_WDT>, <&acc CLKID_AHB_WDT>; + clock-names = "mod", "ahb"; + interrupts = <55>; + resets = <&rst WDT_RESET>; + reset-names = "wdt_rst"; + timeout-sec = <30>; + alphascale,mode = "hw"; +}; diff --git a/Documentation/devicetree/bindings/watchdog/meson-wdt.txt b/Documentation/devicetree/bindings/watchdog/meson-wdt.txt new file mode 100644 index 000000000..ae70185d9 --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/meson-wdt.txt @@ -0,0 +1,13 @@ +Meson SoCs Watchdog timer + +Required properties: + +- compatible : should be "amlogic,meson6-wdt" or "amlogic,meson8b-wdt" +- reg : Specifies base physical address and size of the registers. + +Example: + +wdt: watchdog@c1109900 { + compatible = "amlogic,meson6-wdt"; + reg = <0xc1109900 0x8>; +}; diff --git a/Documentation/devicetree/bindings/watchdog/meson6-wdt.txt b/Documentation/devicetree/bindings/watchdog/meson6-wdt.txt deleted file mode 100644 index 9200fc2d5..000000000 --- a/Documentation/devicetree/bindings/watchdog/meson6-wdt.txt +++ /dev/null @@ -1,13 +0,0 @@ -Meson SoCs Watchdog timer - -Required properties: - -- compatible : should be "amlogic,meson6-wdt" -- reg : Specifies base physical address and size of the registers. 
- -Example: - -wdt: watchdog@c1109900 { - compatible = "amlogic,meson6-wdt"; - reg = <0xc1109900 0x8>; -}; diff --git a/Documentation/devicetree/bindings/watchdog/mt7621-wdt.txt b/Documentation/devicetree/bindings/watchdog/mt7621-wdt.txt new file mode 100644 index 000000000..c15ef0ef6 --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/mt7621-wdt.txt @@ -0,0 +1,12 @@ +Ralink Watchdog Timers + +Required properties: +- compatible: must be "mediatek,mt7621-wdt" +- reg: physical base address of the controller and length of the register range + +Example: + + watchdog@100 { + compatible = "mediatek,mt7621-wdt"; + reg = <0x100 0x10>; + }; diff --git a/Documentation/devicetree/bindings/watchdog/mtk-wdt.txt b/Documentation/devicetree/bindings/watchdog/mtk-wdt.txt index af9eb5b8a..6a00939a0 100644 --- a/Documentation/devicetree/bindings/watchdog/mtk-wdt.txt +++ b/Documentation/devicetree/bindings/watchdog/mtk-wdt.txt @@ -2,7 +2,11 @@ Mediatek SoCs Watchdog timer Required properties: -- compatible : should be "mediatek,mt6589-wdt" +- compatible should contain: + * "mediatek,mt2701-wdt" for MT2701 compatible watchdog timers + * "mediatek,mt6589-wdt" for all compatible watchdog timers (MT2701, + MT6589) + - reg : Specifies base physical address and size of the registers. 
Example: diff --git a/Documentation/devicetree/bindings/watchdog/sigma,smp8642-wdt.txt b/Documentation/devicetree/bindings/watchdog/sigma,smp8642-wdt.txt new file mode 100644 index 000000000..5b7ec2c70 --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/sigma,smp8642-wdt.txt @@ -0,0 +1,18 @@ +Sigma Designs SMP86xx/SMP87xx watchdog + +Required properties: +- compatible: Should be "sigma,smp8642-wdt" +- reg: Specifies the physical address region +- clocks: Should be a phandle to the clock + +Optional properties: +- timeout-sec: watchdog timeout in seconds + +Example: + +watchdog@1fd00 { + compatible = "sigma,smp8642-wdt"; + reg = <0x1fd00 8>; + clocks = <&xtal_in_clk>; + timeout-sec = <30>; +}; diff --git a/Documentation/devicetree/bindings/watchdog/sp805-wdt.txt b/Documentation/devicetree/bindings/watchdog/sp805-wdt.txt new file mode 100644 index 000000000..edc4f0ea5 --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/sp805-wdt.txt @@ -0,0 +1,31 @@ +* ARM SP805 Watchdog Timer (WDT) Controller + +SP805 WDT is an ARM Primecell Peripheral and has a standard-id register that +can be used to identify the peripheral type, vendor, and revision. +This value can be used for driver matching. + +As SP805 WDT is a primecell IP, it follows the base bindings specified in +'arm/primecell.txt' + +Required properties: +- compatible : Should be "arm,sp805-wdt", "arm,primecell" +- reg : Base address and size of the watchdog timer registers. +- clocks : From common clock binding. + First clock is PCLK and the second is WDOGCLK. + WDOGCLK can be equal to or be a sub-multiple of the PCLK frequency. +- clock-names : From common clock binding. + Shall be "apb_pclk" for first clock and "wdog_clk" for the + second one. + +Optional properties: +- interrupts : Should specify WDT interrupt number. 
+ +Examples: + + cluster1_core0_watchdog: wdt@c000000 { + compatible = "arm,sp805-wdt", "arm,primecell"; + reg = <0x0 0xc000000 0x0 0x1000>; + clocks = <&clockgen 4 3>, <&clockgen 4 3>; + clock-names = "apb_pclk", "wdog_clk"; + }; + diff --git a/Documentation/devicetree/bindings/watchdog/ts4800-wdt.txt b/Documentation/devicetree/bindings/watchdog/ts4800-wdt.txt new file mode 100644 index 000000000..8f6caad42 --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/ts4800-wdt.txt @@ -0,0 +1,25 @@ +Technologic Systems Watchdog + +Required properties: +- compatible: must be "technologic,ts4800-wdt" +- syscon: phandle / integer array that points to the syscon node which + describes the FPGA's syscon registers. + - phandle to FPGA's syscon + - offset to the watchdog register + +Optional property: +- timeout-sec: contains the watchdog timeout in seconds. + +Example: + +syscon: syscon@b0010000 { + compatible = "syscon", "simple-mfd"; + reg = <0xb0010000 0x3d>; + reg-io-width = <2>; + + wdt@e { + compatible = "technologic,ts4800-wdt"; + syscon = <&syscon 0xe>; + timeout-sec = <10>; + }; +}; diff --git a/Documentation/devicetree/bindings/watchdog/ziirave-wdt.txt b/Documentation/devicetree/bindings/watchdog/ziirave-wdt.txt new file mode 100644 index 000000000..3d878184e --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/ziirave-wdt.txt @@ -0,0 +1,19 @@ +Zodiac RAVE Watchdog Timer + +Required properties: +- compatible: must be "zii,rave-wdt" +- reg: i2c slave address of device, usually 0x38 + +Optional Properties: +- timeout-sec: Watchdog timeout value in seconds. +- reset-duration-ms: Duration of the pulse generated when the watchdog times + out. Value in milliseconds. 
+ +Example: + + watchdog@38 { + compatible = "zii,rave-wdt"; + reg = <0x38>; + timeout-sec = <30>; + reset-duration-ms = <30>; + }; diff --git a/Documentation/dmaengine/client.txt b/Documentation/dmaengine/client.txt index 11fb87ff6..9e3318974 100644 --- a/Documentation/dmaengine/client.txt +++ b/Documentation/dmaengine/client.txt @@ -22,25 +22,14 @@ The slave DMA usage consists of following steps: Channel allocation is slightly different in the slave DMA context, client drivers typically need a channel from a particular DMA controller only and even in some cases a specific channel is desired. - To request a channel dma_request_channel() API is used. + To request a channel dma_request_chan() API is used. Interface: - struct dma_chan *dma_request_channel(dma_cap_mask_t mask, - dma_filter_fn filter_fn, - void *filter_param); - where dma_filter_fn is defined as: - typedef bool (*dma_filter_fn)(struct dma_chan *chan, void *filter_param); + struct dma_chan *dma_request_chan(struct device *dev, const char *name); - The 'filter_fn' parameter is optional, but highly recommended for - slave and cyclic channels as they typically need to obtain a specific - DMA channel. - - When the optional 'filter_fn' parameter is NULL, dma_request_channel() - simply returns the first channel that satisfies the capability mask. - - Otherwise, the 'filter_fn' routine will be called once for each free - channel which has a capability in 'mask'. 'filter_fn' is expected to - return 'true' when the desired DMA channel is found. + Which will find and return the 'name' DMA channel associated with the 'dev' + device. The association is done via DT, ACPI or board file based + dma_slave_map matching table. A channel allocated via this interface is exclusive to the caller, until dma_release_channel() is called. @@ -128,7 +117,7 @@ The slave DMA usage consists of following steps: transaction. For cyclic DMA, a callback function may wish to terminate the - DMA via dmaengine_terminate_all(). 
+ DMA via dmaengine_terminate_async(). Therefore, it is important that DMA engine drivers drop any locks before calling the callback function which may cause a @@ -166,12 +155,29 @@ The slave DMA usage consists of following steps: Further APIs: -1. int dmaengine_terminate_all(struct dma_chan *chan) +1. int dmaengine_terminate_sync(struct dma_chan *chan) + int dmaengine_terminate_async(struct dma_chan *chan) + int dmaengine_terminate_all(struct dma_chan *chan) /* DEPRECATED */ This causes all activity for the DMA channel to be stopped, and may discard data in the DMA FIFO which hasn't been fully transferred. No callback functions will be called for any incomplete transfers. + Two variants of this function are available. + + dmaengine_terminate_async() might not wait until the DMA has been fully + stopped or until any running complete callbacks have finished. But it is + possible to call dmaengine_terminate_async() from atomic context or from + within a complete callback. dmaengine_synchronize() must be called before it + is safe to free the memory accessed by the DMA transfer or free resources + accessed from within the complete callback. + + dmaengine_terminate_sync() will wait for the transfer and any running + complete callbacks to finish before it returns. But the function must not be + called from atomic context or from within a complete callback. + + dmaengine_terminate_all() is deprecated and should not be used in new code. + 2. int dmaengine_pause(struct dma_chan *chan) This pauses activity on the DMA channel without data loss. @@ -197,3 +203,20 @@ Further APIs: a running DMA channel. It is recommended that DMA engine users pause or stop (via dmaengine_terminate_all()) the channel before using this API. + +5. void dmaengine_synchronize(struct dma_chan *chan) + + Synchronize the termination of the DMA channel to the current context. 
+ + This function should be used after dmaengine_terminate_async() to synchronize + the termination of the DMA channel to the current context. The function will + wait for the transfer and any running complete callbacks to finish before it + returns. + + If dmaengine_terminate_async() is used to stop the DMA channel this function + must be called before it is safe to free memory accessed by previously + submitted descriptors or to free any resources accessed within the complete + callback of previously submitted descriptors. + + The behavior of this function is undefined if dma_async_issue_pending() has + been called between dmaengine_terminate_async() and this function. diff --git a/Documentation/dmaengine/provider.txt b/Documentation/dmaengine/provider.txt index 67d4ce4df..122b7f487 100644 --- a/Documentation/dmaengine/provider.txt +++ b/Documentation/dmaengine/provider.txt @@ -327,8 +327,24 @@ supported. * device_terminate_all - Aborts all the pending and ongoing transfers on the channel - - This command should operate synchronously on the channel, - terminating right away all the channels + - For aborted transfers the complete callback should not be called + - Can be called from atomic context or from within a complete + callback of a descriptor. Must not sleep. Drivers must be able + to handle this correctly. + - Termination may be asynchronous. The driver does not have to + wait until the currently active transfer has completely stopped. + See device_synchronize. + + * device_synchronize + - Must synchronize the termination of a channel to the current + context. + - Must make sure that memory for previously submitted + descriptors is no longer accessed by the DMA controller. + - Must make sure that all complete callbacks for previously + submitted descriptors have finished running and none are + scheduled to run. + - May sleep. 
+ Misc notes (stuff that should be documented, but don't really know where to put them) diff --git a/Documentation/dvb/README.dvb-usb b/Documentation/dvb/README.dvb-usb index 8eb92264e..669dc6ce4 100644 --- a/Documentation/dvb/README.dvb-usb +++ b/Documentation/dvb/README.dvb-usb @@ -45,7 +45,7 @@ Supported devices See the LinuxTV DVB Wiki at www.linuxtv.org for a complete list of cards/drivers/firmwares: -http://www.linuxtv.org/wiki/index.php/DVB_USB +https://linuxtv.org/wiki/index.php/DVB_USB 0. History & News: 2005-06-30 - added support for WideView WT-220U (Thanks to Steve Chang) @@ -121,7 +121,7 @@ working. Have a look at the Wikipage for the DVB-USB-drivers to find out, which firmware you need for your device: -http://www.linuxtv.org/wiki/index.php/DVB_USB +https://linuxtv.org/wiki/index.php/DVB_USB 1.2. Compiling diff --git a/Documentation/dvb/faq.txt b/Documentation/dvb/faq.txt index 97b1373f2..a0be92012 100644 --- a/Documentation/dvb/faq.txt +++ b/Documentation/dvb/faq.txt @@ -76,7 +76,7 @@ Some very frequently asked questions about linuxtv-dvb the TuxBox CVS many interesting DVB applications and the dBox2 DVB source - http://www.linuxtv.org/downloads/ + https://linuxtv.org/downloads DVB Swiss Army Knife library and utilities http://www.nenie.org/misc/mpsys/ diff --git a/Documentation/dvb/readme.txt b/Documentation/dvb/readme.txt index 0b0380c91..89965041a 100644 --- a/Documentation/dvb/readme.txt +++ b/Documentation/dvb/readme.txt @@ -2,12 +2,12 @@ Linux Digital Video Broadcast (DVB) subsystem ============================================= The main development site and CVS repository for these -drivers is http://linuxtv.org/. +drivers is https://linuxtv.org. The developer mailing list linux-dvb is also hosted there, -see http://linuxtv.org/lists.php. Please check -the archive http://linuxtv.org/pipermail/linux-dvb/ -and the Wiki http://linuxtv.org/wiki/ +see https://linuxtv.org/lists.php. 
Please check +the archive https://linuxtv.org/pipermail/linux-dvb/ +and the Wiki https://linuxtv.org/wiki/ before asking newbie questions on the list. API documentation, utilities and test/example programs @@ -16,7 +16,7 @@ are available as part of the old driver package for Linux 2.4 We plan to split this into separate packages, but it's not been done yet. -http://linuxtv.org/downloads/ +https://linuxtv.org/downloads/ What's inside this directory: diff --git a/Documentation/edac.txt b/Documentation/edac.txt index 80841a2d6..f89cfd85a 100644 --- a/Documentation/edac.txt +++ b/Documentation/edac.txt @@ -1,9 +1,13 @@ EDAC - Error Detection And Correction ===================================== -"bluesmoke" was the name for this device driver when it was "out-of-tree" -and maintained at sourceforge.net. When it was pushed into 2.6.16 for the -first time, it was renamed to 'EDAC'. +"bluesmoke" was the name for this device driver when it +was "out-of-tree" and maintained at sourceforge.net - +bluesmoke.sourceforge.net. That site is mostly archaic now and can be +used only for historical purposes. + +When the subsystem was pushed into 2.6.16 for the first time, it was +renamed to 'EDAC'. PURPOSE ------- diff --git a/Documentation/fault-injection/notifier-error-inject.txt b/Documentation/fault-injection/notifier-error-inject.txt index 09adabef5..83d3f4e43 100644 --- a/Documentation/fault-injection/notifier-error-inject.txt +++ b/Documentation/fault-injection/notifier-error-inject.txt @@ -10,6 +10,7 @@ modules that can be used to test the following notifiers. 
* PM notifier * Memory hotplug notifier * powerpc pSeries reconfig notifier + * Netdevice notifier CPU notifier error injection module ----------------------------------- @@ -87,6 +88,30 @@ Possible pSeries reconfig notifier events to be failed are: * PSERIES_DRCONF_MEM_ADD * PSERIES_DRCONF_MEM_REMOVE +Netdevice notifier error injection module +---------------------------------------------- +This feature is controlled through debugfs interface +/sys/kernel/debug/notifier-error-inject/netdev/actions/<notifier event>/error + +Netdevice notifier events which can be failed are: + + * NETDEV_REGISTER + * NETDEV_CHANGEMTU + * NETDEV_CHANGENAME + * NETDEV_PRE_UP + * NETDEV_PRE_TYPE_CHANGE + * NETDEV_POST_INIT + * NETDEV_PRECHANGEMTU + * NETDEV_PRECHANGEUPPER + * NETDEV_CHANGEUPPER + +Example: Inject netdevice mtu change error (-22 == -EINVAL) + + # cd /sys/kernel/debug/notifier-error-inject/netdev + # echo -22 > actions/NETDEV_CHANGEMTU/error + # ip link set eth0 mtu 1024 + RTNETLINK answers: Invalid argument + For more usage examples ----------------------- There are tools/testing/selftests using the notifier error injection features diff --git a/Documentation/features/io/dma_map_attrs/arch-support.txt b/Documentation/features/io/dma_map_attrs/arch-support.txt deleted file mode 100644 index 51d0f1c02..000000000 --- a/Documentation/features/io/dma_map_attrs/arch-support.txt +++ /dev/null @@ -1,40 +0,0 @@ -# -# Feature name: dma_map_attrs -# Kconfig: HAVE_DMA_ATTRS -# description: arch provides dma_*map*_attrs() APIs -# - ----------------------- - | arch |status| - ----------------------- - | alpha: | ok | - | arc: | TODO | - | arm: | ok | - | arm64: | ok | - | avr32: | TODO | - | blackfin: | TODO | - | c6x: | TODO | - | cris: | TODO | - | frv: | TODO | - | h8300: | ok | - | hexagon: | ok | - | ia64: | ok | - | m32r: | TODO | - | m68k: | TODO | - | metag: | TODO | - | microblaze: | ok | - | mips: | ok | - | mn10300: | TODO | - | nios2: | TODO | - | openrisc: | ok | - | parisc: | TODO | - 
| powerpc: | ok | - | s390: | ok | - | score: | TODO | - | sh: | ok | - | sparc: | ok | - | tile: | ok | - | um: | TODO | - | unicore32: | ok | - | x86: | ok | - | xtensa: | TODO | - ----------------------- diff --git a/Documentation/features/seccomp/seccomp-filter/arch-support.txt b/Documentation/features/seccomp/seccomp-filter/arch-support.txt index 76d39d66a..4f66ec133 100644 --- a/Documentation/features/seccomp/seccomp-filter/arch-support.txt +++ b/Documentation/features/seccomp/seccomp-filter/arch-support.txt @@ -33,7 +33,7 @@ | sh: | TODO | | sparc: | TODO | | tile: | ok | - | um: | TODO | + | um: | ok | | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | diff --git a/Documentation/features/time/irq-time-acct/arch-support.txt b/Documentation/features/time/irq-time-acct/arch-support.txt index e63316239..4199ffecc 100644 --- a/Documentation/features/time/irq-time-acct/arch-support.txt +++ b/Documentation/features/time/irq-time-acct/arch-support.txt @@ -9,7 +9,7 @@ | alpha: | .. | | arc: | TODO | | arm: | ok | - | arm64: | .. 
| + | arm64: | ok | | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | diff --git a/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt b/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt deleted file mode 100644 index 26f74b457..000000000 --- a/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt +++ /dev/null @@ -1,40 +0,0 @@ -# -# Feature name: pmdp_splitting_flush -# Kconfig: __HAVE_ARCH_PMDP_SPLITTING_FLUSH -# description: arch supports the pmdp_splitting_flush() VM API -# - ----------------------- - | arch |status| - ----------------------- - | alpha: | TODO | - | arc: | TODO | - | arm: | ok | - | arm64: | ok | - | avr32: | TODO | - | blackfin: | TODO | - | c6x: | TODO | - | cris: | TODO | - | frv: | TODO | - | h8300: | TODO | - | hexagon: | TODO | - | ia64: | TODO | - | m32r: | TODO | - | m68k: | TODO | - | metag: | TODO | - | microblaze: | TODO | - | mips: | ok | - | mn10300: | TODO | - | nios2: | TODO | - | openrisc: | TODO | - | parisc: | TODO | - | powerpc: | ok | - | s390: | ok | - | score: | TODO | - | sh: | TODO | - | sparc: | TODO | - | tile: | TODO | - | um: | TODO | - | unicore32: | TODO | - | x86: | ok | - | xtensa: | TODO | - ----------------------- diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 06d443450..619af9bfd 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -50,8 +50,7 @@ prototypes: int (*rename2) (struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*readlink) (struct dentry *, char __user *,int); - const char *(*follow_link) (struct dentry *, void **); - void (*put_link) (struct inode *, void *); + const char *(*get_link) (struct dentry *, struct inode *, void **); void (*truncate) (struct inode *); int (*permission) (struct inode *, int, unsigned int); int (*get_acl)(struct inode *, int); @@ -83,8 +82,7 @@ rmdir: yes (both) (see below) rename: yes (all) (see below) rename2: yes 
(all) (see below) readlink: no -follow_link: no -put_link: no +get_link: no setattr: yes permission: no (may not block if called in rcu-walk mode) get_acl: no diff --git a/Documentation/filesystems/aufs/README b/Documentation/filesystems/aufs/README deleted file mode 100644 index ed1bafe08..000000000 --- a/Documentation/filesystems/aufs/README +++ /dev/null @@ -1,391 +0,0 @@ - -Aufs4 -- advanced multi layered unification filesystem version 4.x -http://aufs.sf.net -Junjiro R. Okajima - - -0. Introduction ----------------------------------------- -In the early days, aufs was entirely re-designed and re-implemented -Unionfs Version 1.x series. Adding many original ideas, approaches, -improvements and implementations, it becomes totally different from -Unionfs while keeping the basic features. -Recently, Unionfs Version 2.x series begin taking some of the same -approaches to aufs1's. -Unionfs is being developed by Professor Erez Zadok at Stony Brook -University and his team. - -Aufs4 supports linux-4.0 and later, and for linux-3.x series try aufs3. -If you want older kernel version support, try aufs2-2.6.git or -aufs2-standalone.git repository, aufs1 from CVS on SourceForge. - -Note: it becomes clear that "Aufs was rejected. Let's give it up." - According to Christoph Hellwig, linux rejects all union-type - filesystems but UnionMount. - - -PS. Al Viro seems have a plan to merge aufs as well as overlayfs and - UnionMount, and he pointed out an issue around a directory mutex - lock and aufs addressed it. But it is still unsure whether aufs will - be merged (or any other union solution). - - - -1. Features ----------------------------------------- -- unite several directories into a single virtual filesystem. The member - directory is called as a branch. -- you can specify the permission flags to the branch, which are 'readonly', - 'readwrite' and 'whiteout-able.' 
-- by upper writable branch, internal copyup and whiteout, files/dirs on - readonly branch are modifiable logically. -- dynamic branch manipulation, add, del. -- etc... - -Also there are many enhancements in aufs, such as: -- test only the highest one for the directory permission (dirperm1) -- copyup on open (coo=) -- 'move' policy for copy-up between two writable branches, after - checking free space. -- xattr, acl -- readdir(3) in userspace. -- keep inode number by external inode number table -- keep the timestamps of file/dir in internal copyup operation -- seekable directory, supporting NFS readdir. -- whiteout is hardlinked in order to reduce the consumption of inodes - on branch -- do not copyup, nor create a whiteout when it is unnecessary -- revert a single systemcall when an error occurs in aufs -- remount interface instead of ioctl -- maintain /etc/mtab by an external command, /sbin/mount.aufs. -- loopback mounted filesystem as a branch -- kernel thread for removing the dir who has a plenty of whiteouts -- support copyup sparse file (a file which has a 'hole' in it) -- default permission flags for branches -- selectable permission flags for ro branch, whether whiteout can - exist or not -- export via NFS. -- support /fs/aufs and /aufs. -- support multiple writable branches, some policies to select one - among multiple writable branches. -- a new semantics for link(2) and rename(2) to support multiple - writable branches. -- no glibc changes are required. -- pseudo hardlink (hardlink over branches) -- allow a direct access manually to a file on branch, e.g. bypassing aufs. - including NFS or remote filesystem branch. -- userspace wrapper for pathconf(3)/fpathconf(3) with _PC_LINK_MAX. -- and more... - -Currently these features are dropped temporary from aufs4. -See design/08plan.txt in detail. -- nested mount, i.e. 
aufs as readonly no-whiteout branch of another aufs - (robr) -- statistics of aufs thread (/sys/fs/aufs/stat) - -Features or just an idea in the future (see also design/*.txt), -- reorder the branch index without del/re-add. -- permanent xino files for NFSD -- an option for refreshing the opened files after add/del branches -- light version, without branch manipulation. (unnecessary?) -- copyup in userspace -- inotify in userspace -- readv/writev - - -2. Download ----------------------------------------- -There are three GIT trees for aufs4, aufs4-linux.git, -aufs4-standalone.git, and aufs-util.git. Note that there is no "4" in -"aufs-util.git." -While the aufs-util is always necessary, you need either of aufs4-linux -or aufs4-standalone. - -The aufs4-linux tree includes the whole linux mainline GIT tree, -git://git.kernel.org/.../torvalds/linux.git. -And you cannot select CONFIG_AUFS_FS=m for this version, eg. you cannot -build aufs4 as an external kernel module. -Several extra patches are not included in this tree. Only -aufs4-standalone tree contains them. They are described in the later -section "Configuration and Compilation." - -On the other hand, the aufs4-standalone tree has only aufs source files -and necessary patches, and you can select CONFIG_AUFS_FS=m. -But you need to apply all aufs patches manually. - -You will find GIT branches whose name is in form of "aufs4.x" where "x" -represents the linux kernel version, "linux-4.x". For instance, -"aufs4.0" is for linux-4.0. For latest "linux-4.x-rcN", use -"aufs4.x-rcN" branch. - -o aufs4-linux tree -$ git clone --reference /your/linux/git/tree \ - git://github.com/sfjro/aufs4-linux.git aufs4-linux.git -- if you don't have linux GIT tree, then remove "--reference ..." -$ cd aufs4-linux.git -$ git checkout origin/aufs4.0 - -Or You may want to directly git-pull aufs into your linux GIT tree, and -leave the patch-work to GIT. 
-$ cd /your/linux/git/tree -$ git remote add aufs4 git://github.com/sfjro/aufs4-linux.git -$ git fetch aufs4 -$ git checkout -b my4.0 v4.0 -$ (add your local change...) -$ git pull aufs4 aufs4.0 -- now you have v4.0 + your_changes + aufs4.0 in you my4.0 branch. -- you may need to solve some conflicts between your_changes and - aufs4.0. in this case, git-rerere is recommended so that you can - solve the similar conflicts automatically when you upgrade to 4.1 or - later in the future. - -o aufs4-standalone tree -$ git clone git://github.com/sfjro/aufs4-standalone.git aufs4-standalone.git -$ cd aufs4-standalone.git -$ git checkout origin/aufs4.0 - -o aufs-util tree -$ git clone git://git.code.sf.net/p/aufs/aufs-util aufs-util.git -- note that the public aufs-util.git is on SourceForge instead of - GitHUB. -$ cd aufs-util.git -$ git checkout origin/aufs4.0 - -Note: The 4.x-rcN branch is to be used with `rc' kernel versions ONLY. -The minor version number, 'x' in '4.x', of aufs may not always -follow the minor version number of the kernel. -Because changes in the kernel that cause the use of a new -minor version number do not always require changes to aufs-util. - -Since aufs-util has its own minor version number, you may not be -able to find a GIT branch in aufs-util for your kernel's -exact minor version number. -In this case, you should git-checkout the branch for the -nearest lower number. - -For (an unreleased) example: -If you are using "linux-4.10" and the "aufs4.10" branch -does not exist in aufs-util repository, then "aufs4.9", "aufs4.8" -or something numerically smaller is the branch for your kernel. - -Also you can view all branches by - $ git branch -a - - -3. Configuration and Compilation ----------------------------------------- -Make sure you have git-checkout'ed the correct branch. - -For aufs4-linux tree, -- enable CONFIG_AUFS_FS. -- set other aufs configurations if necessary. - -For aufs4-standalone tree, -There are several ways to build. - -1. 
-- apply ./aufs4-kbuild.patch to your kernel source files. -- apply ./aufs4-base.patch too. -- apply ./aufs4-mmap.patch too. -- apply ./aufs4-standalone.patch too, if you have a plan to set - CONFIG_AUFS_FS=m. otherwise you don't need ./aufs4-standalone.patch. -- copy ./{Documentation,fs,include/uapi/linux/aufs_type.h} files to your - kernel source tree. Never copy $PWD/include/uapi/linux/Kbuild. -- enable CONFIG_AUFS_FS, you can select either - =m or =y. -- and build your kernel as usual. -- install the built kernel. - Note: Since linux-3.9, every filesystem module requires an alias - "fs-". You should make sure that "fs-aufs" is listed in your - modules.aliases file if you set CONFIG_AUFS_FS=m. -- install the header files too by "make headers_install" to the - directory where you specify. By default, it is $PWD/usr. - "make help" shows a brief note for headers_install. -- and reboot your system. - -2. -- module only (CONFIG_AUFS_FS=m). -- apply ./aufs4-base.patch to your kernel source files. -- apply ./aufs4-mmap.patch too. -- apply ./aufs4-standalone.patch too. -- build your kernel, don't forget "make headers_install", and reboot. -- edit ./config.mk and set other aufs configurations if necessary. - Note: You should read $PWD/fs/aufs/Kconfig carefully which describes - every aufs configurations. -- build the module by simple "make". - Note: Since linux-3.9, every filesystem module requires an alias - "fs-". You should make sure that "fs-aufs" is listed in your - modules.aliases file. -- you can specify ${KDIR} make variable which points to your kernel - source tree. -- install the files - + run "make install" to install the aufs module, or copy the built - $PWD/aufs.ko to /lib/modules/... and run depmod -a (or reboot simply). 
- + run "make install_headers" (instead of headers_install) to install - the modified aufs header file (you can specify DESTDIR which is - available in aufs standalone version's Makefile only), or copy - $PWD/usr/include/linux/aufs_type.h to /usr/include/linux or wherever - you like manually. By default, the target directory is $PWD/usr. -- no need to apply aufs4-kbuild.patch, nor copying source files to your - kernel source tree. - -Note: The header file aufs_type.h is necessary to build aufs-util - as well as "make headers_install" in the kernel source tree. - headers_install is subject to be forgotten, but it is essentially - necessary, not only for building aufs-util. - You may not meet problems without headers_install in some older - version though. - -And then, -- read README in aufs-util, build and install it -- note that your distribution may contain an obsoleted version of - aufs_type.h in /usr/include/linux or something. When you build aufs - utilities, make sure that your compiler refers the correct aufs header - file which is built by "make headers_install." -- if you want to use readdir(3) in userspace or pathconf(3) wrapper, - then run "make install_ulib" too. And refer to the aufs manual in - detail. - -There several other patches in aufs4-standalone.git. They are all -optional. When you meet some problems, they will help you. -- aufs4-loopback.patch - Supports a nested loopback mount in a branch-fs. This patch is - unnecessary until aufs produces a message like "you may want to try - another patch for loopback file". -- vfs-ino.patch - Modifies a system global kernel internal function get_next_ino() in - order to stop assigning 0 for an inode-number. Not directly related to - aufs, but recommended generally. -- tmpfs-idr.patch - Keeps the tmpfs inode number as the lowest value. Effective to reduce - the size of aufs XINO files for tmpfs branch. 
Also it prevents the - duplication of inode number, which is important for backup tools and - other utilities. When you find aufs XINO files for tmpfs branch - growing too much, try this patch. -- lockdep-debug.patch - Because aufs is not only an ordinary filesystem (callee of VFS), but - also a caller of VFS functions for branch filesystems, subclassing of - the internal locks for LOCKDEP is necessary. LOCKDEP is a debugging - feature of linux kernel. If you enable CONFIG_LOCKDEP, then you will - need to apply this debug patch to expand several constant values. - If don't know what LOCKDEP, then you don't have apply this patch. - - -4. Usage ----------------------------------------- -At first, make sure aufs-util are installed, and please read the aufs -manual, aufs.5 in aufs-util.git tree. -$ man -l aufs.5 - -And then, -$ mkdir /tmp/rw /tmp/aufs -# mount -t aufs -o br=/tmp/rw:${HOME} none /tmp/aufs - -Here is another example. The result is equivalent. -# mount -t aufs -o br=/tmp/rw=rw:${HOME}=ro none /tmp/aufs - Or -# mount -t aufs -o br:/tmp/rw none /tmp/aufs -# mount -o remount,append:${HOME} /tmp/aufs - -Then, you can see whole tree of your home dir through /tmp/aufs. If -you modify a file under /tmp/aufs, the one on your home directory is -not affected, instead the same named file will be newly created under -/tmp/rw. And all of your modification to a file will be applied to -the one under /tmp/rw. This is called the file based Copy on Write -(COW) method. -Aufs mount options are described in aufs.5. -If you run chroot or something and make your aufs as a root directory, -then you need to customize the shutdown script. See the aufs manual in -detail. - -Additionally, there are some sample usages of aufs which are a -diskless system with network booting, and LiveCD over NFS. -See sample dir in CVS tree on SourceForge. - - -5. 
Contact ----------------------------------------- -When you have any problems or strange behaviour in aufs, please let me -know with: -- /proc/mounts (instead of the output of mount(8)) -- /sys/module/aufs/* -- /sys/fs/aufs/* (if you have them) -- /debug/aufs/* (if you have them) -- linux kernel version - if your kernel is not plain, for example modified by distributor, - the url where i can download its source is necessary too. -- aufs version which was printed at loading the module or booting the - system, instead of the date you downloaded. -- configuration (define/undefine CONFIG_AUFS_xxx) -- kernel configuration or /proc/config.gz (if you have it) -- behaviour which you think to be incorrect -- actual operation, reproducible one is better -- mailto: aufs-users at lists.sourceforge.net - -Usually, I don't watch the Public Areas(Bugs, Support Requests, Patches, -and Feature Requests) on SourceForge. Please join and write to -aufs-users ML. - - -6. Acknowledgements ----------------------------------------- -Thanks to everyone who have tried and are using aufs, whoever -have reported a bug or any feedback. - -Especially donators: -Tomas Matejicek(slax.org) made a donation (much more than once). - Since Apr 2010, Tomas M (the author of Slax and Linux Live - scripts) is making "doubling" donations. - Unfortunately I cannot list all of the donators, but I really - appreciate. - It ends Aug 2010, but the ordinary donation URL is still available. - -Dai Itasaka made a donation (2007/8). -Chuck Smith made a donation (2008/4, 10 and 12). -Henk Schoneveld made a donation (2008/9). -Chih-Wei Huang, ASUS, CTC donated Eee PC 4G (2008/10). -Francois Dupoux made a donation (2008/11). -Bruno Cesar Ribas and Luis Carlos Erpen de Bona, C3SL serves public - aufs2 GIT tree (2009/2). -William Grant made a donation (2009/3). -Patrick Lane made a donation (2009/4). -The Mail Archive (mail-archive.com) made donations (2009/5). -Nippy Networks (Ed Wildgoose) made a donation (2009/7). 
-New Dream Network, LLC (www.dreamhost.com) made a donation (2009/11). -Pavel Pronskiy made a donation (2011/2). -Iridium and Inmarsat satellite phone retailer (www.mailasail.com), Nippy - Networks (Ed Wildgoose) made a donation for hardware (2011/3). -Max Lekomcev (DOM-TV project) made a donation (2011/7, 12, 2012/3, 6 and -11). -Sam Liddicott made a donation (2011/9). -Era Scarecrow made a donation (2013/4). -Bor Ratajc made a donation (2013/4). -Alessandro Gorreta made a donation (2013/4). -POIRETTE Marc made a donation (2013/4). -Alessandro Gorreta made a donation (2013/4). -lauri kasvandik made a donation (2013/5). -"pemasu from Finland" made a donation (2013/7). -The Parted Magic Project made a donation (2013/9 and 11). -Pavel Barta made a donation (2013/10). -Nikolay Pertsev made a donation (2014/5). -James B made a donation (2014/7 and 2015/7). -Stefano Di Biase made a donation (2014/8). -Daniel Epellei made a donation (2015/1). -OmegaPhil made a donation (2016/1). - -Thank you very much. -Donations are always, including future donations, very important and -helpful for me to keep on developing aufs. - - -7. ----------------------------------------- -If you are an experienced user, no explanation is needed. Aufs is -just a linux filesystem. - - -Enjoy! - -# Local variables: ; -# mode: text; -# End: ; diff --git a/Documentation/filesystems/aufs/design/01intro.txt b/Documentation/filesystems/aufs/design/01intro.txt deleted file mode 100644 index 5d0121439..000000000 --- a/Documentation/filesystems/aufs/design/01intro.txt +++ /dev/null @@ -1,157 +0,0 @@ - -# Copyright (C) 2005-2016 Junjiro R. Okajima - -Introduction ----------------------------------------- - -aufs [ei ju: ef es] | [a u f s] -1. abbrev. for "advanced multi-layered unification filesystem". -2. abbrev. for "another unionfs". -3. abbrev. for "auf das" in German which means "on the" in English. - Ex. "Butter aufs Brot"(G) means "butter onto bread"(E). 
- But "Filesystem aufs Filesystem" is hard to understand. - -AUFS is a filesystem with features: -- multi layered stackable unification filesystem, the member directory - is called as a branch. -- branch permission and attribute, 'readonly', 'real-readonly', - 'readwrite', 'whiteout-able', 'link-able whiteout', etc. and their - combination. -- internal "file copy-on-write". -- logical deletion, whiteout. -- dynamic branch manipulation, adding, deleting and changing permission. -- allow bypassing aufs, user's direct branch access. -- external inode number translation table and bitmap which maintains the - persistent aufs inode number. -- seekable directory, including NFS readdir. -- file mapping, mmap and sharing pages. -- pseudo-link, hardlink over branches. -- loopback mounted filesystem as a branch. -- several policies to select one among multiple writable branches. -- revert a single systemcall when an error occurs in aufs. -- and more... - - -Multi Layered Stackable Unification Filesystem ----------------------------------------------------------------------- -Most people already knows what it is. -It is a filesystem which unifies several directories and provides a -merged single directory. When users access a file, the access will be -passed/re-directed/converted (sorry, I am not sure which English word is -correct) to the real file on the member filesystem. The member -filesystem is called 'lower filesystem' or 'branch' and has a mode -'readonly' and 'readwrite.' And the deletion for a file on the lower -readonly branch is handled by creating 'whiteout' on the upper writable -branch. - -On LKML, there have been discussions about UnionMount (Jan Blunck, -Bharata B Rao and Valerie Aurora) and Unionfs (Erez Zadok). They took -different approaches to implement the merged-view. -The former tries putting it into VFS, and the latter implements as a -separate filesystem. -(If I misunderstand about these implementations, please let me know and -I shall correct it. 
Because it is a long time ago when I read their -source files last time). - -UnionMount's approach will be able to small, but may be hard to share -branches between several UnionMount since the whiteout in it is -implemented in the inode on branch filesystem and always -shared. According to Bharata's post, readdir does not seems to be -finished yet. -There are several missing features known in this implementations such as -- for users, the inode number may change silently. eg. copy-up. -- link(2) may break by copy-up. -- read(2) may get an obsoleted filedata (fstat(2) too). -- fcntl(F_SETLK) may be broken by copy-up. -- unnecessary copy-up may happen, for example mmap(MAP_PRIVATE) after - open(O_RDWR). - -In linux-3.18, "overlay" filesystem (formerly known as "overlayfs") was -merged into mainline. This is another implementation of UnionMount as a -separated filesystem. All the limitations and known problems which -UnionMount are equally inherited to "overlay" filesystem. - -Unionfs has a longer history. When I started implementing a stackable -filesystem (Aug 2005), it already existed. It has virtual super_block, -inode, dentry and file objects and they have an array pointing lower -same kind objects. After contributing many patches for Unionfs, I -re-started my project AUFS (Jun 2006). - -In AUFS, the structure of filesystem resembles to Unionfs, but I -implemented my own ideas, approaches and enhancements and it became -totally different one. - -Comparing DM snapshot and fs based implementation -- the number of bytes to be copied between devices is much smaller. -- the type of filesystem must be one and only. -- the fs must be writable, no readonly fs, even for the lower original - device. so the compression fs will not be usable. but if we use - loopback mount, we may address this issue. - for instance, - mount /cdrom/squashfs.img /sq - losetup /sq/ext2.img - losetup /somewhere/cow - dmsetup "snapshot /dev/loop0 /dev/loop1 ..." 
-- it will be difficult (or needs more operations) to extract the - difference between the original device and COW. -- DM snapshot-merge may help a lot when users try merging. in the - fs-layer union, users will use rsync(1). - -You may want to read my old paper "Filesystems in LiveCD" -(http://aufs.sourceforge.net/aufs2/report/sq/sq.pdf). - - -Several characters/aspects/persona of aufs ----------------------------------------------------------------------- - -Aufs has several characters, aspects or persona. -1. a filesystem, callee of VFS helper -2. sub-VFS, caller of VFS helper for branches -3. a virtual filesystem which maintains persistent inode number -4. reader/writer of files on branches such like an application - -1. Callee of VFS Helper -As an ordinary linux filesystem, aufs is a callee of VFS. For instance, -unlink(2) from an application reaches sys_unlink() kernel function and -then vfs_unlink() is called. vfs_unlink() is one of VFS helper and it -calls filesystem specific unlink operation. Actually aufs implements the -unlink operation but it behaves like a redirector. - -2. Caller of VFS Helper for Branches -aufs_unlink() passes the unlink request to the branch filesystem as if -it were called from VFS. So the called unlink operation of the branch -filesystem acts as usual. As a caller of VFS helper, aufs should handle -every necessary pre/post operation for the branch filesystem. -- acquire the lock for the parent dir on a branch -- lookup in a branch -- revalidate dentry on a branch -- mnt_want_write() for a branch -- vfs_unlink() for a branch -- mnt_drop_write() for a branch -- release the lock on a branch - -3. Persistent Inode Number -One of the most important issue for a filesystem is to maintain inode -numbers. This is particularly important to support exporting a -filesystem via NFS. Aufs is a virtual filesystem which doesn't have a -backend block device for its own. But some storage is necessary to -keep and maintain the inode numbers. 
It may be a large space and may not -suit to keep in memory. Aufs rents some space from its first writable -branch filesystem (by default) and creates file(s) on it. These files -are created by aufs internally and removed soon (currently) keeping -opened. -Note: Because these files are removed, they are totally gone after - unmounting aufs. It means the inode numbers are not persistent - across unmount or reboot. I have a plan to make them really - persistent which will be important for aufs on NFS server. - -4. Read/Write Files Internally (copy-on-write) -Because a branch can be readonly, when you write a file on it, aufs will -"copy-up" it to the upper writable branch internally. And then write the -originally requested thing to the file. Generally kernel doesn't -open/read/write file actively. In aufs, even a single write may cause a -internal "file copy". This behaviour is very similar to cp(1) command. - -Some people may think it is better to pass such work to user space -helper, instead of doing in kernel space. Actually I am still thinking -about it. But currently I have implemented it in kernel space. diff --git a/Documentation/filesystems/aufs/design/02struct.txt b/Documentation/filesystems/aufs/design/02struct.txt deleted file mode 100644 index 783328a75..000000000 --- a/Documentation/filesystems/aufs/design/02struct.txt +++ /dev/null @@ -1,245 +0,0 @@ - -# Copyright (C) 2005-2016 Junjiro R. Okajima - -Basic Aufs Internal Structure - -Superblock/Inode/Dentry/File Objects ----------------------------------------------------------------------- -As like an ordinary filesystem, aufs has its own -superblock/inode/dentry/file objects. All these objects have a -dynamically allocated array and store the same kind of pointers to the -lower filesystem, branch. -For example, when you build a union with one readwrite branch and one -readonly, mounted /au, /rw and /ro respectively. 
-- /au = /rw + /ro -- /ro/fileA exists but /rw/fileA - -Aufs lookup operation finds /ro/fileA and gets dentry for that. These -pointers are stored in a aufs dentry. The array in aufs dentry will be, -- [0] = NULL (because /rw/fileA doesn't exist) -- [1] = /ro/fileA - -This style of an array is essentially same to the aufs -superblock/inode/dentry/file objects. - -Because aufs supports manipulating branches, ie. add/delete/change -branches dynamically, these objects has its own generation. When -branches are changed, the generation in aufs superblock is -incremented. And a generation in other object are compared when it is -accessed. When a generation in other objects are obsoleted, aufs -refreshes the internal array. - - -Superblock ----------------------------------------------------------------------- -Additionally aufs superblock has some data for policies to select one -among multiple writable branches, XIB files, pseudo-links and kobject. -See below in detail. -About the policies which supports copy-down a directory, see -wbr_policy.txt too. - - -Branch and XINO(External Inode Number Translation Table) ----------------------------------------------------------------------- -Every branch has its own xino (external inode number translation table) -file. The xino file is created and unlinked by aufs internally. When two -members of a union exist on the same filesystem, they share the single -xino file. -The struct of a xino file is simple, just a sequence of aufs inode -numbers which is indexed by the lower inode number. -In the above sample, assume the inode number of /ro/fileA is i111 and -aufs assigns the inode number i999 for fileA. Then aufs writes 999 as -4(8) bytes at 111 * 4(8) bytes offset in the xino file. - -When the inode numbers are not contiguous, the xino file will be sparse -which has a hole in it and doesn't consume as much disk space as it -might appear. 
If your branch filesystem consumes disk space for such -holes, then you should specify 'xino=' option at mounting aufs. - -Aufs has a mount option to free the disk blocks for such holes in XINO -files on tmpfs or ramdisk. But it is not so effective actually. If you -meet a problem of disk shortage due to XINO files, then you should try -"tmpfs-ino.patch" (and "vfs-ino.patch" too) in aufs4-standalone.git. -The patch localizes the assignment inumbers per tmpfs-mount and avoid -the holes in XINO files. - -Also a writable branch has three kinds of "whiteout bases". All these -are existed when the branch is joined to aufs, and their names are -whiteout-ed doubly, so that users will never see their names in aufs -hierarchy. -1. a regular file which will be hardlinked to all whiteouts. -2. a directory to store a pseudo-link. -3. a directory to store an "orphan"-ed file temporary. - -1. Whiteout Base - When you remove a file on a readonly branch, aufs handles it as a - logical deletion and creates a whiteout on the upper writable branch - as a hardlink of this file in order not to consume inode on the - writable branch. -2. Pseudo-link Dir - See below, Pseudo-link. -3. Step-Parent Dir - When "fileC" exists on the lower readonly branch only and it is - opened and removed with its parent dir, and then user writes - something into it, then aufs copies-up fileC to this - directory. Because there is no other dir to store fileC. After - creating a file under this dir, the file is unlinked. - -Because aufs supports manipulating branches, ie. add/delete/change -dynamically, a branch has its own id. When the branch order changes, -aufs finds the new index by searching the branch id. - - -Pseudo-link ----------------------------------------------------------------------- -Assume "fileA" exists on the lower readonly branch only and it is -hardlinked to "fileB" on the branch. When you write something to fileA, -aufs copies-up it to the upper writable branch. 
Additionally aufs -creates a hardlink under the Pseudo-link Directory of the writable -branch. The inode of a pseudo-link is kept in aufs super_block as a -simple list. If fileB is read after unlinking fileA, aufs returns -filedata from the pseudo-link instead of the lower readonly -branch. Because the pseudo-link is based upon the inode, to keep the -inode number by xino (see above) is essentially necessary. - -All the hardlinks under the Pseudo-link Directory of the writable branch -should be restored in a proper location later. Aufs provides a utility -to do this. The userspace helpers executed at remounting and unmounting -aufs by default. -During this utility is running, it puts aufs into the pseudo-link -maintenance mode. In this mode, only the process which began the -maintenance mode (and its child processes) is allowed to operate in -aufs. Some other processes which are not related to the pseudo-link will -be allowed to run too, but the rest have to return an error or wait -until the maintenance mode ends. If a process already acquires an inode -mutex (in VFS), it has to return an error. - - -XIB(external inode number bitmap) ----------------------------------------------------------------------- -Addition to the xino file per a branch, aufs has an external inode number -bitmap in a superblock object. It is also an internal file such like a -xino file. -It is a simple bitmap to mark whether the aufs inode number is in-use or -not. -To reduce the file I/O, aufs prepares a single memory page to cache xib. - -As well as XINO files, aufs has a feature to truncate/refresh XIB to -reduce the number of consumed disk blocks for these files. - - -Virtual or Vertical Dir, and Readdir in Userspace ----------------------------------------------------------------------- -In order to support multiple layers (branches), aufs readdir operation -constructs a virtual dir block on memory. 
For readdir, aufs calls -vfs_readdir() internally for each dir on branches, merges their entries -with eliminating the whiteout-ed ones, and sets it to file (dir) -object. So the file object has its entry list until it is closed. The -entry list will be updated when the file position is zero and becomes -obsoleted. This decision is made in aufs automatically. - -The dynamically allocated memory block for the name of entries has a -unit of 512 bytes (by default) and stores the names contiguously (no -padding). Another block for each entry is handled by kmem_cache too. -During building dir blocks, aufs creates hash list and judging whether -the entry is whiteouted by its upper branch or already listed. -The merged result is cached in the corresponding inode object and -maintained by a customizable life-time option. - -Some people may call it can be a security hole or invite DoS attack -since the opened and once readdir-ed dir (file object) holds its entry -list and becomes a pressure for system memory. But I'd say it is similar -to files under /proc or /sys. The virtual files in them also holds a -memory page (generally) while they are opened. When an idea to reduce -memory for them is introduced, it will be applied to aufs too. -For those who really hate this situation, I've developed readdir(3) -library which operates this merging in userspace. You just need to set -LD_PRELOAD environment variable, and aufs will not consume no memory in -kernel space for readdir(3). - - -Workqueue ----------------------------------------------------------------------- -Aufs sometimes requires privilege access to a branch. For instance, -in copy-up/down operation. When a user process is going to make changes -to a file which exists in the lower readonly branch only, and the mode -of one of ancestor directories may not be writable by a user -process. Here aufs copy-up the file with its ancestors and they may -require privilege to set its owner/group/mode/etc. 
-This is a typical case of a application character of aufs (see -Introduction). - -Aufs uses workqueue synchronously for this case. It creates its own -workqueue. The workqueue is a kernel thread and has privilege. Aufs -passes the request to call mkdir or write (for example), and wait for -its completion. This approach solves a problem of a signal handler -simply. -If aufs didn't adopt the workqueue and changed the privilege of the -process, then the process may receive the unexpected SIGXFSZ or other -signals. - -Also aufs uses the system global workqueue ("events" kernel thread) too -for asynchronous tasks, such like handling inotify/fsnotify, re-creating a -whiteout base and etc. This is unrelated to a privilege. -Most of aufs operation tries acquiring a rw_semaphore for aufs -superblock at the beginning, at the same time waits for the completion -of all queued asynchronous tasks. - - -Whiteout ----------------------------------------------------------------------- -The whiteout in aufs is very similar to Unionfs's. That is represented -by its filename. UnionMount takes an approach of a file mode, but I am -afraid several utilities (find(1) or something) will have to support it. - -Basically the whiteout represents "logical deletion" which stops aufs to -lookup further, but also it represents "dir is opaque" which also stop -further lookup. - -In aufs, rmdir(2) and rename(2) for dir uses whiteout alternatively. -In order to make several functions in a single systemcall to be -revertible, aufs adopts an approach to rename a directory to a temporary -unique whiteouted name. -For example, in rename(2) dir where the target dir already existed, aufs -renames the target dir to a temporary unique whiteouted name before the -actual rename on a branch, and then handles other actions (make it opaque, -update the attributes, etc). If an error happens in these actions, aufs -simply renames the whiteouted name back and returns an error. 
If all are -succeeded, aufs registers a function to remove the whiteouted unique -temporary name completely and asynchronously to the system global -workqueue. - - -Copy-up ----------------------------------------------------------------------- -It is a well-known feature or concept. -When user modifies a file on a readonly branch, aufs operate "copy-up" -internally and makes change to the new file on the upper writable branch. -When the trigger systemcall does not update the timestamps of the parent -dir, aufs reverts it after copy-up. - - -Move-down (aufs3.9 and later) ----------------------------------------------------------------------- -"Copy-up" is one of the essential feature in aufs. It copies a file from -the lower readonly branch to the upper writable branch when a user -changes something about the file. -"Move-down" is an opposite action of copy-up. Basically this action is -ran manually instead of automatically and internally. -For desgin and implementation, aufs has to consider these issues. -- whiteout for the file may exist on the lower branch. -- ancestor directories may not exist on the lower branch. -- diropq for the ancestor directories may exist on the upper branch. -- free space on the lower branch will reduce. -- another access to the file may happen during moving-down, including - UDBA (see "Revalidate Dentry and UDBA"). -- the file should not be hard-linked nor pseudo-linked. they should be - handled by auplink utility later. - -Sometimes users want to move-down a file from the upper writable branch -to the lower readonly or writable branch. For instance, -- the free space of the upper writable branch is going to run out. -- create a new intermediate branch between the upper and lower branch. -- etc. - -For this purpose, use "aumvdown" command in aufs-util.git. 
diff --git a/Documentation/filesystems/aufs/design/03atomic_open.txt b/Documentation/filesystems/aufs/design/03atomic_open.txt deleted file mode 100644 index 741ad6d66..000000000 --- a/Documentation/filesystems/aufs/design/03atomic_open.txt +++ /dev/null @@ -1,72 +0,0 @@ - -# Copyright (C) 2015-2016 Junjiro R. Okajima - -Support for a branch who has its ->atomic_open() ----------------------------------------------------------------------- -The filesystems who implement its ->atomic_open() are not majority. For -example NFSv4 does, and aufs should call NFSv4 ->atomic_open, -particularly for open(O_CREAT|O_EXCL, 0400) case. Other than -->atomic_open(), NFSv4 returns an error for this open(2). While I am not -sure whether all filesystems who have ->atomic_open() behave like this, -but NFSv4 surely returns the error. - -In order to support ->atomic_open() for aufs, there are a few -approaches. - -A. Introduce aufs_atomic_open() - - calls one of VFS:do_last(), lookup_open() or atomic_open() for - branch fs. -B. Introduce aufs_atomic_open() calling create, open and chmod. this is - an aufs user Pip Cet's approach - - calls aufs_create(), VFS finish_open() and notify_change(). - - pass fake-mode to finish_open(), and then correct the mode by - notify_change(). -C. Extend aufs_open() to call branch fs's ->atomic_open() - - no aufs_atomic_open(). - - aufs_lookup() registers the TID to an aufs internal object. - - aufs_create() does nothing when the matching TID is registered, but - registers the mode. - - aufs_open() calls branch fs's ->atomic_open() when the matching - TID is registered. -D. Extend aufs_open() to re-try branch fs's ->open() with superuser's - credential - - no aufs_atomic_open(). - - aufs_create() registers the TID to an internal object. this info - represents "this process created this file just now." - - when aufs gets EACCES from branch fs's ->open(), then confirm the - registered TID and re-try open() with superuser's credential. 
- -Pros and cons for each approach. - -A. - - straightforward but highly depends upon VFS internal. - - the atomic behavaiour is kept. - - some of parameters such as nameidata are hard to reproduce for - branch fs. - - large overhead. -B. - - easy to implement. - - the atomic behavaiour is lost. -C. - - the atomic behavaiour is kept. - - dirty and tricky. - - VFS checks whether the file is created correctly after calling - ->create(), which means this approach doesn't work. -D. - - easy to implement. - - the atomic behavaiour is lost. - - to open a file with superuser's credential and give it to a user - process is a bad idea, since the file object keeps the credential - in it. It may affect LSM or something. This approach doesn't work - either. - -The approach A is ideal, but it hard to implement. So here is a -variation of A, which is to be implemented. - -A-1. Introduce aufs_atomic_open() - - calls branch fs ->atomic_open() if exists. otherwise calls - vfs_create() and finish_open(). - - the demerit is that the several checks after branch fs - ->atomic_open() are lost. in the ordinary case, the checks are - done by VFS:do_last(), lookup_open() and atomic_open(). some can - be implemented in aufs, but not all I am afraid. diff --git a/Documentation/filesystems/aufs/design/03lookup.txt b/Documentation/filesystems/aufs/design/03lookup.txt deleted file mode 100644 index 5b6b000b5..000000000 --- a/Documentation/filesystems/aufs/design/03lookup.txt +++ /dev/null @@ -1,100 +0,0 @@ - -# Copyright (C) 2005-2016 Junjiro R. Okajima - -Lookup in a Branch ----------------------------------------------------------------------- -Since aufs has a character of sub-VFS (see Introduction), it operates -lookup for branches as VFS does. It may be a heavy work. But almost all -lookup operation in aufs is the simplest case, ie. lookup only an entry -directly connected to its parent. Digging down the directory hierarchy -is unnecessary. 
VFS has a function lookup_one_len() for that use, and -aufs calls it. - -When a branch is a remote filesystem, aufs basically relies upon its -->d_revalidate(), also aufs forces the hardest revalidate tests for -them. -For d_revalidate, aufs implements three levels of revalidate tests. See -"Revalidate Dentry and UDBA" in detail. - - -Test Only the Highest One for the Directory Permission (dirperm1 option) ----------------------------------------------------------------------- -Let's try case study. -- aufs has two branches, upper readwrite and lower readonly. - /au = /rw + /ro -- "dirA" exists under /ro, but /rw. and its mode is 0700. -- user invoked "chmod a+rx /au/dirA" -- the internal copy-up is activated and "/rw/dirA" is created and its - permission bits are set to world readable. -- then "/au/dirA" becomes world readable? - -In this case, /ro/dirA is still 0700 since it exists in readonly branch, -or it may be a natively readonly filesystem. If aufs respects the lower -branch, it should not respond readdir request from other users. But user -allowed it by chmod. Should really aufs rejects showing the entries -under /ro/dirA? - -To be honest, I don't have a good solution for this case. So aufs -implements 'dirperm1' and 'nodirperm1' mount options, and leave it to -users. -When dirperm1 is specified, aufs checks only the highest one for the -directory permission, and shows the entries. Otherwise, as usual, checks -every dir existing on all branches and rejects the request. - -As a side effect, dirperm1 option improves the performance of aufs -because the number of permission check is reduced when the number of -branch is many. - - -Revalidate Dentry and UDBA (User's Direct Branch Access) ----------------------------------------------------------------------- -Generally VFS helpers re-validate a dentry as a part of lookup. -0. digging down the directory hierarchy. -1. lock the parent dir by its i_mutex. -2. lookup the final (child) entry. -3. revalidate it. -4. 
call the actual operation (create, unlink, etc.) -5. unlock the parent dir - -If the filesystem implements its ->d_revalidate() (step 3), then it is -called. Actually aufs implements it and checks the dentry on a branch is -still valid. -But it is not enough. Because aufs has to release the lock for the -parent dir on a branch at the end of ->lookup() (step 2) and -->d_revalidate() (step 3) while the i_mutex of the aufs dir is still -held by VFS. -If the file on a branch is changed directly, eg. bypassing aufs, after -aufs released the lock, then the subsequent operation may cause -something unpleasant result. - -This situation is a result of VFS architecture, ->lookup() and -->d_revalidate() is separated. But I never say it is wrong. It is a good -design from VFS's point of view. It is just not suitable for sub-VFS -character in aufs. - -Aufs supports such case by three level of revalidation which is -selectable by user. -1. Simple Revalidate - Addition to the native flow in VFS's, confirm the child-parent - relationship on the branch just after locking the parent dir on the - branch in the "actual operation" (step 4). When this validation - fails, aufs returns EBUSY. ->d_revalidate() (step 3) in aufs still - checks the validation of the dentry on branches. -2. Monitor Changes Internally by Inotify/Fsnotify - Addition to above, in the "actual operation" (step 4) aufs re-lookup - the dentry on the branch, and returns EBUSY if it finds different - dentry. - Additionally, aufs sets the inotify/fsnotify watch for every dir on branches - during it is in cache. When the event is notified, aufs registers a - function to kernel 'events' thread by schedule_work(). And the - function sets some special status to the cached aufs dentry and inode - private data. If they are not cached, then aufs has nothing to - do. When the same file is accessed through aufs (step 0-3) later, - aufs will detect the status and refresh all necessary data. 
- In this mode, aufs has to ignore the event which is fired by aufs - itself. -3. No Extra Validation - This is the simplest test and doesn't add any additional revalidation - test, and skip the revalidation in step 4. It is useful and improves - aufs performance when system surely hide the aufs branches from user, - by over-mounting something (or another method). diff --git a/Documentation/filesystems/aufs/design/04branch.txt b/Documentation/filesystems/aufs/design/04branch.txt deleted file mode 100644 index e68f4d3df..000000000 --- a/Documentation/filesystems/aufs/design/04branch.txt +++ /dev/null @@ -1,61 +0,0 @@ - -# Copyright (C) 2005-2016 Junjiro R. Okajima - -Branch Manipulation - -Since aufs supports dynamic branch manipulation, ie. add/remove a branch -and changing its permission/attribute, there are a lot of works to do. - - -Add a Branch ----------------------------------------------------------------------- -o Confirm the adding dir exists outside of aufs, including loopback - mount, and its various attributes. -o Initialize the xino file and whiteout bases if necessary. - See struct.txt. - -o Check the owner/group/mode of the directory - When the owner/group/mode of the adding directory differs from the - existing branch, aufs issues a warning because it may impose a - security risk. - For example, when a upper writable branch has a world writable empty - top directory, a malicious user can create any files on the writable - branch directly, like copy-up and modify manually. If something like - /etc/{passwd,shadow} exists on the lower readonly branch but the upper - writable branch, and the writable branch is world-writable, then a - malicious guy may create /etc/passwd on the writable branch directly - and the infected file will be valid in aufs. - I am afraid it can be a security issue, but aufs can do nothing except - producing a warning. 
- - -Delete a Branch ----------------------------------------------------------------------- -o Confirm the deleting branch is not busy - To be general, there is one merit to adopt "remount" interface to - manipulate branches. It is to discard caches. At deleting a branch, - aufs checks the still cached (and connected) dentries and inodes. If - there are any, then they are all in-use. An inode without its - corresponding dentry can be alive alone (for example, inotify/fsnotify case). - - For the cached one, aufs checks whether the same named entry exists on - other branches. - If the cached one is a directory, because aufs provides a merged view - to users, as long as one dir is left on any branch aufs can show the - dir to users. In this case, the branch can be removed from aufs. - Otherwise aufs rejects deleting the branch. - - If any file on the deleting branch is opened by aufs, then aufs - rejects deleting. - - -Modify the Permission of a Branch ----------------------------------------------------------------------- -o Re-initialize or remove the xino file and whiteout bases if necessary. - See struct.txt. - -o rw --> ro: Confirm the modifying branch is not busy - Aufs rejects the request if any of these conditions are true. - - a file on the branch is mmap-ed. - - a regular file on the branch is opened for write and there is no - same named entry on the upper branch. diff --git a/Documentation/filesystems/aufs/design/05wbr_policy.txt b/Documentation/filesystems/aufs/design/05wbr_policy.txt deleted file mode 100644 index 1726d5d06..000000000 --- a/Documentation/filesystems/aufs/design/05wbr_policy.txt +++ /dev/null @@ -1,51 +0,0 @@ - -# Copyright (C) 2005-2016 Junjiro R. Okajima - -Policies to Select One among Multiple Writable Branches ----------------------------------------------------------------------- -When the number of writable branch is more than one, aufs has to decide -the target branch for file creation or copy-up. 
By default, the highest -writable branch which has the parent (or ancestor) dir of the target -file is chosen (top-down-parent policy). -By user's request, aufs implements some other policies to select the -writable branch, for file creation several policies, round-robin, -most-free-space, and other policies. For copy-up, top-down-parent, -bottom-up-parent, bottom-up and others. - -As expected, the round-robin policy selects the branch in circular. When -you have two writable branches and creates 10 new files, 5 files will be -created for each branch. mkdir(2) systemcall is an exception. When you -create 10 new directories, all will be created on the same branch. -And the most-free-space policy selects the one which has most free -space among the writable branches. The amount of free space will be -checked by aufs internally, and users can specify its time interval. - -The policies for copy-up is more simple, -top-down-parent is equivalent to the same named on in create policy, -bottom-up-parent selects the writable branch where the parent dir -exists and the nearest upper one from the copyup-source, -bottom-up selects the nearest upper writable branch from the -copyup-source, regardless the existence of the parent dir. - -There are some rules or exceptions to apply these policies. -- If there is a readonly branch above the policy-selected branch and - the parent dir is marked as opaque (a variation of whiteout), or the - target (creating) file is whiteout-ed on the upper readonly branch, - then the result of the policy is ignored and the target file will be - created on the nearest upper writable branch than the readonly branch. -- If there is a writable branch above the policy-selected branch and - the parent dir is marked as opaque or the target file is whiteouted - on the branch, then the result of the policy is ignored and the target - file will be created on the highest one among the upper writable - branches who has diropq or whiteout. 
In case of whiteout, aufs removes - it as usual. -- link(2) and rename(2) systemcalls are exceptions in every policy. - They try selecting the branch where the source exists as possible - since copyup a large file will take long time. If it can't be, - ie. the branch where the source exists is readonly, then they will - follow the copyup policy. -- There is an exception for rename(2) when the target exists. - If the rename target exists, aufs compares the index of the branches - where the source and the target exists and selects the higher - one. If the selected branch is readonly, then aufs follows the - copyup policy. diff --git a/Documentation/filesystems/aufs/design/06fhsm.txt b/Documentation/filesystems/aufs/design/06fhsm.txt deleted file mode 100644 index 84b46dc5b..000000000 --- a/Documentation/filesystems/aufs/design/06fhsm.txt +++ /dev/null @@ -1,105 +0,0 @@ - -# Copyright (C) 2011-2016 Junjiro R. Okajima - -File-based Hierarchical Storage Management (FHSM) ----------------------------------------------------------------------- -Hierarchical Storage Management (or HSM) is a well-known feature in the -storage world. Aufs provides this feature as file-based with multiple -writable branches, based upon the principle of "Colder, the Lower". -Here the word "colder" means that the less used files, and "lower" means -that the position in the order of the stacked branches vertically. -These multiple writable branches are prioritized, ie. the topmost one -should be the fastest drive and be used heavily. - -o Characters in aufs FHSM story -- aufs itself and a new branch attribute. -- a new ioctl interface to move-down and to establish a connection with - the daemon ("move-down" is a converse of "copy-up"). -- userspace tool and daemon. - -The userspace daemon establishes a connection with aufs and waits for -the notification. The notified information is very similar to struct -statfs containing the number of consumed blocks and inodes. 
-When the consumed blocks/inodes of a branch exceeds the user-specified -upper watermark, the daemon activates its move-down process until the -consumed blocks/inodes reaches the user-specified lower watermark. - -The actual move-down is done by aufs based upon the request from -user-space since we need to maintain the inode number and the internal -pointer arrays in aufs. - -Currently aufs FHSM handles the regular files only. Additionally they -must not be hard-linked nor pseudo-linked. - - -o Cowork of aufs and the user-space daemon - During the userspace daemon established the connection, aufs sends a - small notification to it whenever aufs writes something into the - writable branch. But it may cost high since aufs issues statfs(2) - internally. So user can specify a new option to cache the - info. Actually the notification is controlled by these factors. - + the specified cache time. - + classified as "force" by aufs internally. - Until the specified time expires, aufs doesn't send the info - except the forced cases. When aufs decide forcing, the info is always - notified to userspace. - For example, the number of free inodes is generally large enough and - the shortage of it happens rarely. So aufs doesn't force the - notification when creating a new file, directory and others. This is - the typical case which aufs doesn't force. - When aufs writes the actual filedata and the files consumes any of new - blocks, the aufs forces notifying. - - -o Interfaces in aufs -- New branch attribute. - + fhsm - Specifies that the branch is managed by FHSM feature. In other word, - participant in the FHSM. - When nofhsm is set to the branch, it will not be the source/target - branch of the move-down operation. This attribute is set - independently from coo and moo attributes, and if you want full - FHSM, you should specify them as well. -- New mount option. - + fhsm_sec - Specifies a second to suppress many less important info to be - notified. -- New ioctl. 
- + AUFS_CTL_FHSM_FD - create a new file descriptor which userspace can read the notification - (a subset of struct statfs) from aufs. -- Module parameter 'brs' - It has to be set to 1. Otherwise the new mount option 'fhsm' will not - be set. -- mount helpers /sbin/mount.aufs and /sbin/umount.aufs - When there are two or more branches with fhsm attributes, - /sbin/mount.aufs invokes the user-space daemon and /sbin/umount.aufs - terminates it. As a result of remounting and branch-manipulation, the - number of branches with fhsm attribute can be one. In this case, - /sbin/mount.aufs will terminate the user-space daemon. - - -Finally the operation is done as these steps in kernel-space. -- make sure that, - + no one else is using the file. - + the file is not hard-linked. - + the file is not pseudo-linked. - + the file is a regular file. - + the parent dir is not opaqued. -- find the target writable branch. -- make sure the file is not whiteout-ed by the upper (than the target) - branch. -- make the parent dir on the target branch. -- mutex lock the inode on the branch. -- unlink the whiteout on the target branch (if exists). -- lookup and create the whiteout-ed temporary name on the target branch. -- copy the file as the whiteout-ed temporary name on the target branch. -- rename the whiteout-ed temporary name to the original name. -- unlink the file on the source branch. -- maintain the internal pointer array and the external inode number - table (XINO). -- maintain the timestamps and other attributes of the parent dir and the - file. - -And of course, in every step, an error may happen. So the operation -should restore the original file state after an error happens. diff --git a/Documentation/filesystems/aufs/design/06mmap.txt b/Documentation/filesystems/aufs/design/06mmap.txt deleted file mode 100644 index 991c0b1fa..000000000 --- a/Documentation/filesystems/aufs/design/06mmap.txt +++ /dev/null @@ -1,59 +0,0 @@ - -# Copyright (C) 2005-2016 Junjiro R. 
Okajima - -mmap(2) -- File Memory Mapping ----------------------------------------------------------------------- -In aufs, the file-mapped pages are handled by a branch fs directly, no -interaction with aufs. It means aufs_mmap() calls the branch fs's -->mmap(). -This approach is simple and good, but there is one problem. -Under /proc, several entries show the mmapped files by its path (with -device and inode number), and the printed path will be the path on the -branch fs's instead of virtual aufs's. -This is not a problem in most cases, but some utilities lsof(1) (and its -user) may expect the path on aufs. - -To address this issue, aufs adds a new member called vm_prfile in struct -vm_area_struct (and struct vm_region). The original vm_file points to -the file on the branch fs in order to handle everything correctly as -usual. The new vm_prfile points to a virtual file in aufs, and the -show-functions in procfs refers to vm_prfile if it is set. -Also we need to maintain several other places where touching vm_file -such like -- fork()/clone() copies vma and the reference count of vm_file is - incremented. -- merging vma maintains the ref count too. - -This is not a good approach. It just fakes the printed path. But it -leaves all behaviour around f_mapping unchanged. This is surely an -advantage. -Actually aufs had adopted another complicated approach which calls -generic_file_mmap() and handles struct vm_operations_struct. In this -approach, aufs met a hard problem and I could not solve it without -switching the approach. - -There may be one more another approach which is -- bind-mount the branch-root onto the aufs-root internally -- grab the new vfsmount (ie. 
struct mount) -- lazy-umount the branch-root internally -- in open(2) the aufs-file, open the branch-file with the hidden - vfsmount (instead of the original branch's vfsmount) -- ideally this "bind-mount and lazy-umount" should be done atomically, - but it may be possible from userspace by the mount helper. - -Adding the internal hidden vfsmount and using it in opening a file, the -file path under /proc will be printed correctly. This approach looks -smarter, but is not possible I am afraid. -- aufs-root may be bind-mount later. when it happens, another hidden - vfsmount will be required. -- it is hard to get the chance to bind-mount and lazy-umount - + in kernel-space, FS can have vfsmount in open(2) via - file->f_path, and aufs can know its vfsmount. But several locks are - already acquired, and if aufs tries to bind-mount and lazy-umount - here, then it may cause a deadlock. - + in user-space, bind-mount doesn't invoke the mount helper. -- since /proc shows dev and ino, aufs has to give vma these info. it - means a new member vm_prinode will be necessary. this is essentially - equivalent to vm_prfile described above. - -I have to give up this "looks-smater" approach. diff --git a/Documentation/filesystems/aufs/design/06xattr.txt b/Documentation/filesystems/aufs/design/06xattr.txt deleted file mode 100644 index 7bfa94f7b..000000000 --- a/Documentation/filesystems/aufs/design/06xattr.txt +++ /dev/null @@ -1,81 +0,0 @@ - -# Copyright (C) 2014-2016 Junjiro R. Okajima - -Listing XATTR/EA and getting the value ----------------------------------------------------------------------- -For the inode standard attributes (owner, group, timestamps, etc.), aufs -shows the values from the topmost existing file. This behaviour is good -for the non-dir entries since the bahaviour exactly matches the shown -information. But for the directories, aufs considers all the same named -entries on the lower branches. 
Which means, if one of the lower entry -rejects readdir call, then aufs returns an error even if the topmost -entry allows it. This behaviour is necessary to respect the branch fs's -security, but can make users confused since the user-visible standard -attributes don't match the behaviour. -To address this issue, aufs has a mount option called dirperm1 which -checks the permission for the topmost entry only, and ignores the lower -entry's permission. - -A similar issue can happen around XATTR. -getxattr(2) and listxattr(2) families behave as if dirperm1 option is -always set. Otherwise these very unpleasant situation would happen. -- listxattr(2) may return the duplicated entries. -- users may not be able to remove or reset the XATTR forever, - - -XATTR/EA support in the internal (copy,move)-(up,down) ----------------------------------------------------------------------- -Generally the extended attributes of inode are categorized as these. -- "security" for LSM and capability. -- "system" for posix ACL, 'acl' mount option is required for the branch - fs generally. -- "trusted" for userspace, CAP_SYS_ADMIN is required. -- "user" for userspace, 'user_xattr' mount option is required for the - branch fs generally. - -Moreover there are some other categories. Aufs handles these rather -unpopular categories as the ordinary ones, ie. there is no special -condition nor exception. - -In copy-up, the support for XATTR on the dst branch may differ from the -src branch. In this case, the copy-up operation will get an error and -the original user operation which triggered the copy-up will fail. It -can happen that even all copy-up will fail. -When both of src and dst branches support XATTR and if an error occurs -during copying XATTR, then the copy-up should fail obviously. That is a -good reason and aufs should return an error to userspace. But when only -the src branch support that XATTR, aufs should not return an error. 
-For example, the src branch supports ACL but the dst branch doesn't -because the dst branch may natively un-support it or temporary -un-support it due to "noacl" mount option. Of course, the dst branch fs -may NOT return an error even if the XATTR is not supported. It is -totally up to the branch fs. - -Anyway when the aufs internal copy-up gets an error from the dst branch -fs, then aufs tries removing the just copied entry and returns the error -to the userspace. The worst case of this situation will be all copy-up -will fail. - -For the copy-up operation, there two basic approaches. -- copy the specified XATTR only (by category above), and return the - error unconditionally if it happens. -- copy all XATTR, and ignore the error on the specified category only. - -In order to support XATTR and to implement the correct behaviour, aufs -chooses the latter approach and introduces some new branch attributes, -"icexsec", "icexsys", "icextr", "icexusr", and "icexoth". -They correspond to the XATTR namespaces (see above). Additionally, to be -convenient, "icex" is also provided which means all "icex*" attributes -are set (here the word "icex" stands for "ignore copy-error on XATTR"). - -The meaning of these attributes is to ignore the error from setting -XATTR on that branch. -Note that aufs tries copying all XATTR unconditionally, and ignores the -error from the dst branch according to the specified attributes. - -Some XATTR may have its default value. The default value may come from -the parent dir or the environment. If the default value is set at the -file creating-time, it will be overwritten by copy-up. -Some contradiction may happen I am afraid. -Do we need another attribute to stop copying XATTR? I am unsure. For -now, aufs implements the branch attributes to ignore the error. 
diff --git a/Documentation/filesystems/aufs/design/07export.txt b/Documentation/filesystems/aufs/design/07export.txt deleted file mode 100644 index c23930b49..000000000 --- a/Documentation/filesystems/aufs/design/07export.txt +++ /dev/null @@ -1,45 +0,0 @@ - -# Copyright (C) 2005-2016 Junjiro R. Okajima - -Export Aufs via NFS ----------------------------------------------------------------------- -Here is an approach. -- like xino/xib, add a new file 'xigen' which stores aufs inode - generation. -- iget_locked(): initialize aufs inode generation for a new inode, and - store it in xigen file. -- destroy_inode(): increment aufs inode generation and store it in xigen - file. it is necessary even if it is not unlinked, because any data of - inode may be changed by UDBA. -- encode_fh(): for a root dir, simply return FILEID_ROOT. otherwise - build file handle by - + branch id (4 bytes) - + superblock generation (4 bytes) - + inode number (4 or 8 bytes) - + parent dir inode number (4 or 8 bytes) - + inode generation (4 bytes)) - + return value of exportfs_encode_fh() for the parent on a branch (4 - bytes) - + file handle for a branch (by exportfs_encode_fh()) -- fh_to_dentry(): - + find the index of a branch from its id in handle, and check it is - still exist in aufs. - + 1st level: get the inode number from handle and search it in cache. - + 2nd level: if not found in cache, get the parent inode number from - the handle and search it in cache. and then open the found parent - dir, find the matching inode number by vfs_readdir() and get its - name, and call lookup_one_len() for the target dentry. - + 3rd level: if the parent dir is not cached, call - exportfs_decode_fh() for a branch and get the parent on a branch, - build a pathname of it, convert it a pathname in aufs, call - path_lookup(). now aufs gets a parent dir dentry, then handle it as - the 2nd level. - + to open the dir, aufs needs struct vfsmount. aufs keeps vfsmount - for every branch, but not itself. 
to get this, (currently) aufs - searches in current->nsproxy->mnt_ns list. it may not be a good - idea, but I didn't get other approach. - + test the generation of the gotten inode. -- every inode operation: they may get EBUSY due to UDBA. in this case, - convert it into ESTALE for NFSD. -- readdir(): call lockdep_on/off() because filldir in NFSD calls - lookup_one_len(), vfs_getattr(), encode_fh() and others. diff --git a/Documentation/filesystems/aufs/design/08shwh.txt b/Documentation/filesystems/aufs/design/08shwh.txt deleted file mode 100644 index ad58ebe15..000000000 --- a/Documentation/filesystems/aufs/design/08shwh.txt +++ /dev/null @@ -1,39 +0,0 @@ - -# Copyright (C) 2005-2016 Junjiro R. Okajima - -Show Whiteout Mode (shwh) ----------------------------------------------------------------------- -Generally aufs hides the name of whiteouts. But in some cases, to show -them is very useful for users. For instance, creating a new middle layer -(branch) by merging existing layers. - -(borrowing aufs1 HOW-TO from a user, Michael Towers) -When you have three branches, -- Bottom: 'system', squashfs (underlying base system), read-only -- Middle: 'mods', squashfs, read-only -- Top: 'overlay', ram (tmpfs), read-write - -The top layer is loaded at boot time and saved at shutdown, to preserve -the changes made to the system during the session. -When larger changes have been made, or smaller changes have accumulated, -the size of the saved top layer data grows. At this point, it would be -nice to be able to merge the two overlay branches ('mods' and 'overlay') -and rewrite the 'mods' squashfs, clearing the top layer and thus -restoring save and load speed. - -This merging is simplified by the use of another aufs mount, of just the -two overlay branches using the 'shwh' option. 
-# mount -t aufs -o ro,shwh,br:/livesys/overlay=ro+wh:/livesys/mods=rr+wh \ - aufs /livesys/merge_union - -A merged view of these two branches is then available at -/livesys/merge_union, and the new feature is that the whiteouts are -visible! -Note that in 'shwh' mode the aufs mount must be 'ro', which will disable -writing to all branches. Also the default mode for all branches is 'ro'. -It is now possible to save the combined contents of the two overlay -branches to a new squashfs, e.g.: -# mksquashfs /livesys/merge_union /path/to/newmods.squash - -This new squashfs archive can be stored on the boot device and the -initramfs will use it to replace the old one at the next boot. diff --git a/Documentation/filesystems/aufs/design/10dynop.txt b/Documentation/filesystems/aufs/design/10dynop.txt deleted file mode 100644 index 49afc5899..000000000 --- a/Documentation/filesystems/aufs/design/10dynop.txt +++ /dev/null @@ -1,34 +0,0 @@ - -# Copyright (C) 2010-2016 Junjiro R. Okajima - -Dynamically customizable FS operations ----------------------------------------------------------------------- -Generally FS operations (struct inode_operations, struct -address_space_operations, struct file_operations, etc.) are defined as -"static const", but it never means that FS have only one set of -operation. Some FS have multiple sets of them. For instance, ext2 has -three sets, one for XIP, for NOBH, and for normal. -Since aufs overrides and redirects these operations, sometimes aufs has -to change its behaviour according to the branch FS type. More importantly -VFS acts differently if a function (member in the struct) is set or -not. It means aufs should have several sets of operations and select one -among them according to the branch FS definition. - -In order to solve this problem and not to affect the behaviour of VFS, -aufs defines these operations dynamically. 
For instance, aufs defines -dummy direct_IO function for struct address_space_operations, but it may -not be set to the address_space_operations actually. When the branch FS -doesn't have it, aufs doesn't set it to its address_space_operations -while the function definition itself is still alive. So the behaviour -itself will not change, and it will return an error when direct_IO is -not set. - -The lifetime of these dynamically generated operation object is -maintained by aufs branch object. When the branch is removed from aufs, -the reference counter of the object is decremented. When it reaches -zero, the dynamically generated operation object will be freed. - -This approach is designed to support AIO (io_submit), Direct I/O and -XIP (DAX) mainly. -Currently this approach is applied to address_space_operations for -regular files only. diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt index af68efdbb..e5fe521ee 100644 --- a/Documentation/filesystems/configfs/configfs.txt +++ b/Documentation/filesystems/configfs/configfs.txt @@ -51,15 +51,27 @@ configfs tree is always there, whether mounted on /config or not. An item is created via mkdir(2). The item's attributes will also appear at this time. readdir(3) can determine what the attributes are, read(2) can query their default values, and write(2) can store new -values. Like sysfs, attributes should be ASCII text files, preferably -with only one value per file. The same efficiency caveats from sysfs -apply. Don't mix more than one attribute in one attribute file. - -Like sysfs, configfs expects write(2) to store the entire buffer at -once. When writing to configfs attributes, userspace processes should -first read the entire file, modify the portions they wish to change, and -then write the entire buffer back. Attribute files have a maximum size -of one page (PAGE_SIZE, 4096 on i386). +values. Don't mix more than one attribute in one attribute file. 
+ +There are two types of configfs attributes: + +* Normal attributes, which, similar to sysfs attributes, are small ASCII text +files, with a maximum size of one page (PAGE_SIZE, 4096 on i386). Preferably +only one value per file should be used, and the same caveats from sysfs apply. +Configfs expects write(2) to store the entire buffer at once. When writing to +normal configfs attributes, userspace processes should first read the entire +file, modify the portions they wish to change, and then write the entire +buffer back. + +* Binary attributes, which are somewhat similar to sysfs binary attributes, +but with a few slight changes to semantics. The PAGE_SIZE limitation does not +apply, but the whole binary item must fit in a single kernel vmalloc'ed buffer. +The write(2) calls from user space are buffered, and the attributes' +write_bin_attribute method will be invoked on the final close, therefore it is +imperative for user-space to check the return code of close(2) in order to +verify that the operation finished successfully. +To avoid a malicious user OOMing the kernel, there's a per-binary attribute +maximum buffer value. When an item needs to be destroyed, remove it with rmdir(2). An item cannot be destroyed if any other item has a link to it (via @@ -171,6 +183,7 @@ among other things. For that, it needs a type. struct configfs_item_operations *ct_item_ops; struct configfs_group_operations *ct_group_ops; struct configfs_attribute **ct_attrs; + struct configfs_bin_attribute **ct_bin_attrs; }; The most basic function of a config_item_type is to define what @@ -201,6 +214,32 @@ be called whenever userspace asks for a read(2) on the attribute. If an attribute is writable and provides a ->store method, that method will be be called whenever userspace asks for a write(2) on the attribute.
+[struct configfs_bin_attribute] + + struct configfs_bin_attribute { + struct configfs_attribute cb_attr; + void *cb_private; + size_t cb_max_size; + }; + +The binary attribute is used when one needs a binary blob to +appear as the contents of a file in the item's configfs directory. +To do so add the binary attribute to the NULL-terminated array +config_item_type->ct_bin_attrs, and when the item appears in configfs, the +attribute file will appear with the configfs_bin_attribute->cb_attr.ca_name +filename. configfs_bin_attribute->cb_attr.ca_mode specifies the file +permissions. +The cb_private member is provided for use by the driver, while the +cb_max_size member specifies the maximum amount of vmalloc buffer +to be used. + +If a binary attribute is readable and the config_item provides a +ct_item_ops->read_bin_attribute() method, that method will be called +whenever userspace asks for a read(2) on the attribute. The converse +will happen for write(2). The reads/writes are buffered so only a +single read/write will occur; the attribute's methods need not concern themselves +with it. + [struct config_group] A config_item cannot live in a vacuum. The only way one can be created diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index b102b4365..e1c9f0849 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -102,7 +102,7 @@ background_gc=%s Turn on/off cleaning operations, namely garbage collection, triggered in background when I/O subsystem is idle. If background_gc=on, it will turn on the garbage collection and if background_gc=off, garbage collection - will be truned off. If background_gc=sync, it will turn + will be turned off. If background_gc=sync, it will turn on synchronous garbage collection running in background. Default value for this option is on. So garbage collection is on by default.
@@ -145,10 +145,12 @@ extent_cache Enable an extent cache based on rb-tree, it can cache as many as extent which map between contiguous logical address and physical address per inode, resulting in increasing the cache hit ratio. Set by default. -noextent_cache Diable an extent cache based on rb-tree explicitly, see +noextent_cache Disable an extent cache based on rb-tree explicitly, see the above extent_cache mount option. noinline_data Disable the inline data feature, inline data feature is enabled by default. +data_flush Enable data flushing before checkpoint in order to + persist data of regular and symlink. ================================================================================ DEBUGFS ENTRIES @@ -192,7 +194,7 @@ Files in /sys/fs/f2fs/ policy for garbage collection. Setting gc_idle = 0 (default) will disable this option. Setting gc_idle = 1 will select the Cost Benefit approach - & setting gc_idle = 2 will select the greedy aproach. + & setting gc_idle = 2 will select the greedy approach. reclaim_segments This parameter controls the number of prefree segments to be reclaimed. If the number of prefree @@ -298,7 +300,7 @@ The dump.f2fs shows the information of specific inode and dumps SSA and SIT to file. Each file is dump_ssa and dump_sit. The dump.f2fs is used to debug on-disk data structures of the f2fs filesystem. -It shows on-disk inode information reconized by a given inode number, and is +It shows on-disk inode information recognized by a given inode number, and is able to dump all the SSA and SIT entries into predefined files, ./dump_ssa and ./dump_sit respectively. diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index f24d1b833..f1b87d8aa 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -504,3 +504,24 @@ in your dentry operations instead. [mandatory] __fd_install() & fd_install() can now sleep. 
Callers should not hold a spinlock or other resources that do not allow a schedule. +-- +[mandatory] + any symlink that might use page_follow_link_light/page_put_link() must + have inode_nohighmem(inode) called before anything might start playing with + its pagecache. No highmem pages should end up in the pagecache of such + symlinks. That includes any preseeding that might be done during symlink + creation. __page_symlink() will honour the mapping gfp flags, so once + you've done inode_nohighmem() it's safe to use, but if you allocate and + insert the page manually, make sure to use the right gfp flags. +-- +[mandatory] + ->follow_link() is replaced with ->get_link(); same API, except that + * ->get_link() gets inode as a separate argument + * ->get_link() may be called in RCU mode - in that case NULL + dentry is passed +-- +[mandatory] + ->get_link() gets struct delayed_call *done now, and should do + set_delayed_call() where it used to set *cookie. + ->put_link() is gone - just give the destructor to set_delayed_call() + in ->get_link(). diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 402ab99e4..843b045b4 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -169,6 +169,9 @@ read the file /proc/PID/status: VmLck: 0 kB VmHWM: 476 kB VmRSS: 476 kB + RssAnon: 352 kB + RssFile: 120 kB + RssShmem: 4 kB VmData: 156 kB VmStk: 88 kB VmExe: 68 kB @@ -231,14 +234,20 @@ Table 1-2: Contents of the status files (as of 4.1) VmSize total program size VmLck locked memory size VmHWM peak resident set size ("high water mark") - VmRSS size of memory portions - VmData size of data, stack, and text segments - VmStk size of data, stack, and text segments + VmRSS size of memory portions. 
It contains the three + following parts (VmRSS = RssAnon + RssFile + RssShmem) + RssAnon size of resident anonymous memory + RssFile size of resident file mappings + RssShmem size of resident shmem memory (includes SysV shm, + mapping of tmpfs and shared anonymous mappings) + VmData size of private data segments + VmStk size of stack segments VmExe size of text segment VmLib size of shared library code VmPTE size of page table entries VmPMD size of second level page tables - VmSwap size of swap usage (the number of referred swapents) + VmSwap amount of swap used by anonymous private data + (shmem swap usage is not included) HugetlbPages size of hugetlb memory portions Threads number of threads SigQ number of signals queued/max. number for queue @@ -265,7 +274,8 @@ Table 1-3: Contents of the statm files (as of 2.6.8-rc3) Field Content size total program size (pages) (same as VmSize in status) resident size of memory portions (pages) (same as VmRSS in status) - shared number of pages that are shared (i.e. backed by a file) + shared number of pages that are shared (i.e. 
backed by a file, same + as RssFile+RssShmem in status) trs number of pages that are 'code' (not including libs; broken, includes data segment) lrs number of pages of library (always 0 on 2.6) @@ -346,7 +356,7 @@ address perms offset dev inode pathname a7cb1000-a7cb2000 ---p 00000000 00:00 0 a7cb2000-a7eb2000 rw-p 00000000 00:00 0 a7eb2000-a7eb3000 ---p 00000000 00:00 0 -a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack:1001] +a7eb3000-a7ed5000 rw-p 00000000 00:00 0 a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6 a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6 @@ -378,7 +388,6 @@ is not associated with a file: [heap] = the heap of the program [stack] = the stack of the main process - [stack:1001] = the stack of the thread with tid 1001 [vdso] = the "virtual dynamic shared object", the kernel system call handler @@ -386,10 +395,8 @@ is not associated with a file: The /proc/PID/task/TID/maps is a view of the virtual memory from the viewpoint of the individual tasks of a process. In this file you will see a mapping marked -as [stack] if that task sees it as a stack. This is a key difference from the -content of /proc/PID/maps, where you will see all mappings that are being used -as stack by all of those tasks. Hence, for the example above, the task-level -map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this: +as [stack] if that task sees it as a stack. Hence, for the example above, the +task-level map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this: 08048000-08049000 r-xp 00000000 03:00 8312 /opt/test 08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test @@ -459,7 +466,10 @@ and a page is modified, the file page is replaced by a private anonymous copy. hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field. 
"Swap" shows how much would-be-anonymous memory is also used, but out on swap. -"SwapPss" shows proportional swap share of this mapping. +For shmem mappings, "Swap" includes also the size of the mapped (and not +replaced by copy-on-write) part of the underlying shmem object out on swap. +"SwapPss" shows proportional swap share of this mapping. Unlike "Swap", this +does not take into account swapped out page of underlying shmem objects. "Locked" indicates whether the mapping is locked in memory or not. "VmFlags" field deserves a separate description. This member represents the kernel @@ -807,7 +817,7 @@ by migrate-type and finishes with details on how many page blocks of each type exist. If min_free_kbytes has been tuned correctly (recommendations made by hugeadm -from libhugetlbfs http://sourceforge.net/projects/libhugetlbfs/), one can +from libhugetlbfs https://github.com/libhugetlbfs/libhugetlbfs/), one can make an estimate of the likely number of huge pages that can be allocated at a given point in time. All the "Movable" blocks should be allocatable unless memory has been mlock()'d. 
Some of the Reclaimable blocks should @@ -842,6 +852,7 @@ Dirty: 968 kB Writeback: 0 kB AnonPages: 861800 kB Mapped: 280372 kB +Shmem: 644 kB Slab: 284364 kB SReclaimable: 159856 kB SUnreclaim: 124508 kB @@ -898,6 +909,7 @@ MemAvailable: An estimate of how much memory is available for starting new AnonPages: Non-file backed pages mapped into userspace page tables AnonHugePages: Non-file backed huge pages mapped into userspace page tables Mapped: files which have been mmaped, such as libraries + Shmem: Total memory used by shared memory (shmem) and tmpfs Slab: in-kernel data structures cache SReclaimable: Part of Slab, that might be reclaimed, such as caches SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt index 32a173dd3..e3f4c778e 100644 --- a/Documentation/filesystems/sharedsubtree.txt +++ b/Documentation/filesystems/sharedsubtree.txt @@ -664,7 +664,7 @@ replicas continue to be exactly same. if one rbind mounts a tree within the same subtree 'n' times the number of mounts created is an exponential function of 'n'. Having unbindable mount can help prune the unneeded bind - mounts. Here is a example. + mounts. Here is an example. step 1: let's say the root tree has just two directories with diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt index 98ef55124..d392e1505 100644 --- a/Documentation/filesystems/tmpfs.txt +++ b/Documentation/filesystems/tmpfs.txt @@ -17,10 +17,10 @@ RAM, where you have to create an ordinary filesystem on top. Ramdisks cannot swap and you do not have the possibility to resize them. Since tmpfs lives completely in the page cache and on swap, all tmpfs -pages currently in memory will show up as cached. It will not show up -as shared or something like that. Further on you can check the actual -RAM+swap use of a tmpfs instance with df(1) and du(1). 
- +pages will be shown as "Shmem" in /proc/meminfo and "Shared" in +free(1). Notice that these counters also include shared memory +(shmem, see ipcs(1)). The most reliable way to get the count is +using df(1) and du(1). tmpfs has the following uses: diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index ce1126ace..223c32171 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt @@ -180,6 +180,16 @@ dos1xfloppy -- If set, use a fallback default BIOS Parameter Block : 0,1,yes,no,true,false +LIMITATION +--------------------------------------------------------------------- +* The fallocated region of file is discarded at umount/evict time + when using fallocate with FALLOC_FL_KEEP_SIZE. + So, User should assume that fallocated region can be discarded at + last close if there is memory pressure resulting in eviction of + the inode from the memory. As a result, for any dependency on + the fallocated region, user should make sure to recheck fallocate + after reopening the file. + TODO ---------------------------------------------------------------------- * Need to get rid of the raw scanning stuff. Instead, always use diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 8c6f07ad3..b02a7d598 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -350,8 +350,8 @@ struct inode_operations { int (*rename2) (struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*readlink) (struct dentry *, char __user *,int); - const char *(*follow_link) (struct dentry *, void **); - void (*put_link) (struct inode *, void *); + const char *(*get_link) (struct dentry *, struct inode *, + struct delayed_call *); int (*permission) (struct inode *, int); int (*get_acl)(struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); @@ -434,20 +434,19 @@ otherwise noted. readlink: called by the readlink(2) system call. 
Only required if you want to support reading symbolic links - follow_link: called by the VFS to follow a symbolic link to the + get_link: called by the VFS to follow a symbolic link to the inode it points to. Only required if you want to support symbolic links. This method returns the symlink body to traverse (and possibly resets the current position with nd_jump_link()). If the body won't go away until the inode is gone, nothing else is needed; if it needs to be otherwise - pinned, the data needed to release whatever we'd grabbed - is to be stored in void * variable passed by address to - follow_link() instance. - - put_link: called by the VFS to release resources allocated by - follow_link(). The cookie stored by follow_link() is passed - to this method as the last parameter; only called when - cookie isn't NULL. + pinned, arrange for its release by having get_link(..., ..., done) + do set_delayed_call(done, destructor, argument). + In that case destructor(argument) will be called once VFS is + done with the body you've returned. + May be called in RCU mode; that is indicated by NULL dentry + argument. If request can't be handled without leaving RCU mode, + have it return ERR_PTR(-ECHILD). permission: called by the VFS to check for access rights on a POSIX-like filesystem. diff --git a/Documentation/gpio/consumer.txt b/Documentation/gpio/consumer.txt index e000502fd..05676fdac 100644 --- a/Documentation/gpio/consumer.txt +++ b/Documentation/gpio/consumer.txt @@ -260,7 +260,7 @@ will be driven low. 
To summarize: -Function (example) active-low proporty physical line +Function (example) active-low property physical line gpiod_set_raw_value(desc, 0); don't care low gpiod_set_raw_value(desc, 1); don't care high gpiod_set_value(desc, 0); default (active-high) low diff --git a/Documentation/gpio/driver.txt b/Documentation/gpio/driver.txt index 12a61948e..bbeec415f 100644 --- a/Documentation/gpio/driver.txt +++ b/Documentation/gpio/driver.txt @@ -113,8 +113,8 @@ GPIO irqchips usually fall in one of two categories: it will be threaded IRQ handler on -RT and hard IRQ handler on non-RT (for example, see [3]). Know W/A: The generic_handle_irq() is expected to be called with IRQ disabled, - so IRQ core will complain if it will be called from IRQ handler wich is forced - thread. The "fake?" raw lock can be used to W/A this problem: + so IRQ core will complain if it will be called from IRQ handler which is + forced thread. The "fake?" raw lock can be used to W/A this problem: raw_spinlock_t wa_lock; static irqreturn_t omap_gpio_irq_handler(int irq, void *gpiobank) @@ -224,7 +224,7 @@ Real-Time compliance for GPIO IRQ chips --------------------------------------- Any provider of irqchips needs to be carefully tailored to support Real Time -preemption. It is desireable that all irqchips in the GPIO subsystem keep this +preemption. It is desirable that all irqchips in the GPIO subsystem keep this in mind and does the proper testing to assure they are real time-enabled. So, pay attention on above " RT_FULL:" notes, please. The following is a checklist to follow when preparing a driver for real diff --git a/Documentation/gpio/drivers-on-gpio.txt b/Documentation/gpio/drivers-on-gpio.txt index f61213286..14bf95a13 100644 --- a/Documentation/gpio/drivers-on-gpio.txt +++ b/Documentation/gpio/drivers-on-gpio.txt @@ -54,7 +54,7 @@ hardware descriptions such as device tree or ACPI: drivers for the I2C devices on the bus like any other I2C bus driver. 
- spi_gpio: drivers/spi/spi-gpio.c is used to drive an SPI bus (variable number - of wires, atleast SCK and optionally MISO, MOSI and chip select lines) using + of wires, at least SCK and optionally MISO, MOSI and chip select lines) using GPIO hammering (bitbang). It will appear as any other SPI bus on the system and makes it possible to connect drivers for SPI devices on the bus like any other SPI bus driver. For example any MMC/SD card can then be connected @@ -75,7 +75,7 @@ hardware descriptions such as device tree or ACPI: - gpio-wdt: drivers/watchdog/gpio_wdt.c is used to provide a watchdog timer that will periodically "ping" a hardware connected to a GPIO line by toggling - it from 1-to-0-to-1. If that hardware does not recieve its "ping" + it from 1-to-0-to-1. If that hardware does not receive its "ping" periodically, it will reset the system. - gpio-nand: drivers/mtd/nand/gpio.c is used to connect a NAND flash chip to @@ -91,5 +91,5 @@ usually connected directly to the flash. Use those instead of talking directly to the GPIOs using sysfs; they integrate with kernel frameworks better than your userspace code could. Needless to say, -just using the apropriate kernel drivers will simplify and speed up your +just using the appropriate kernel drivers will simplify and speed up your embedded hacking in particular by providing ready-made components. diff --git a/Documentation/hwmon/htu21 b/Documentation/hwmon/htu21 deleted file mode 100644 index f39a215fb..000000000 --- a/Documentation/hwmon/htu21 +++ /dev/null @@ -1,46 +0,0 @@ -Kernel driver htu21 -=================== - -Supported chips: - * Measurement Specialties HTU21D - Prefix: 'htu21' - Addresses scanned: none - Datasheet: Publicly available at the Measurement Specialties website - http://www.meas-spec.com/downloads/HTU21D.pdf - - -Author: - William Markezana - -Description ------------ - -The HTU21D is a humidity and temperature sensor in a DFN package of -only 3 x 3 mm footprint and 0.9 mm height. 
- -The devices communicate with the I2C protocol. All sensors are set to the -same I2C address 0x40, so an entry with I2C_BOARD_INFO("htu21", 0x40) can -be used in the board setup code. - -This driver does not auto-detect devices. You will have to instantiate the -devices explicitly. Please see Documentation/i2c/instantiating-devices -for details. - -sysfs-Interface ---------------- - -temp1_input - temperature input -humidity1_input - humidity input - -Notes ------ - -The driver uses the default resolution settings of 12 bit for humidity and 14 -bit for temperature, which results in typical measurement times of 11 ms for -humidity and 44 ms for temperature. To keep self heating below 0.1 degree -Celsius, the device should not be active for more than 10% of the time. For -this reason, the driver performs no more than two measurements per second and -reports cached information if polled more frequently. - -Different resolutions, the on-chip heater, using the CRC checksum and reading -the serial number are not supported yet. diff --git a/Documentation/hwmon/ltc3815 b/Documentation/hwmon/ltc3815 new file mode 100644 index 000000000..eb7db2d13 --- /dev/null +++ b/Documentation/hwmon/ltc3815 @@ -0,0 +1,61 @@ +Kernel driver ltc3815 +===================== + +Supported chips: + * Linear Technology LTC3815 + Prefix: 'ltc3815' + Addresses scanned: - + Datasheet: http://www.linear.com/product/ltc3815 + +Author: Guenter Roeck + + +Description +----------- + +LTC3815 is a Monolithic Synchronous DC/DC Step-Down Converter. + + +Usage Notes +----------- + +This driver does not probe for PMBus devices. You will have to instantiate +devices explicitly. + +Example: the following commands will load the driver for an LTC3815 +at address 0x20 on I2C bus #1: + +# modprobe ltc3815 +# echo ltc3815 0x20 > /sys/bus/i2c/devices/i2c-1/new_device + + +Sysfs attributes +---------------- + +in1_label "vin" +in1_input Measured input voltage. +in1_alarm Input voltage alarm. 
+in1_highest Highest input voltage. +in1_reset_history Reset input voltage history. + +in2_label "vout1". +in2_input Measured output voltage. +in2_alarm Output voltage alarm. +in2_highest Highest output voltage. +in2_reset_history Reset output voltage history. + +temp1_input Measured chip temperature. +temp1_alarm Temperature alarm. +temp1_highest Highest measured temperature. +temp1_reset_history Reset temperature history. + +curr1_label "iin". +curr1_input Measured input current. +curr1_highest Highest input current. +curr1_reset_history Reset input current history. + +curr2_label "iout1". +curr2_input Measured output current. +curr2_alarm Output current alarm. +curr2_highest Highest output current. +curr2_reset_history Reset output current history. diff --git a/Documentation/iio/iio_configfs.txt b/Documentation/iio/iio_configfs.txt new file mode 100644 index 000000000..f0add35cd --- /dev/null +++ b/Documentation/iio/iio_configfs.txt @@ -0,0 +1,93 @@ +Industrial IIO configfs support + +1. Overview + +Configfs is a filesystem-based manager of kernel objects. IIO uses some +objects that could be easily configured using configfs (e.g.: devices, +triggers). + +See Documentation/filesystems/configfs/configfs.txt for more information +about how configfs works. + +2. Usage + +In order to use configfs support in IIO we need to select it at compile +time via CONFIG_IIO_CONFIGFS config option. + +Then, mount the configfs filesystem (usually under /config directory): + +$ mkdir /config +$ mount -t configfs none /config + +At this point, all default IIO groups will be created and can be accessed +under /config/iio. Next chapters will describe available IIO configuration +objects. + +3. Software triggers + +One of the IIO default configfs groups is the "triggers" group. It is +automagically accessible when the configfs is mounted and can be found +under /config/iio/triggers. + +IIO software triggers implementation offers support for creating multiple +trigger types. 
A new trigger type is usually implemented as a separate +kernel module following the interface in include/linux/iio/sw_trigger.h: + +/* + * drivers/iio/trigger/iio-trig-sample.c + * sample kernel module implementing a new trigger type + */ +#include <linux/iio/sw_trigger.h> + + +static struct iio_sw_trigger *iio_trig_sample_probe(const char *name) +{ + /* + * This allocates and registers an IIO trigger plus other + * trigger type specific initialization. + */ +} + +static int iio_trig_sample_remove(struct iio_sw_trigger *swt) +{ + /* + * This undoes the actions in iio_trig_sample_probe + */ +} + +static const struct iio_sw_trigger_ops iio_trig_sample_ops = { + .probe = iio_trig_sample_probe, + .remove = iio_trig_sample_remove, +}; + +static struct iio_sw_trigger_type iio_trig_sample = { + .name = "trig-sample", + .owner = THIS_MODULE, + .ops = &iio_trig_sample_ops, +}; + +module_iio_sw_trigger_driver(iio_trig_sample); + +Each trigger type has its own directory under /config/iio/triggers. Loading +iio-trig-sample module will create 'trig-sample' trigger type directory +/config/iio/triggers/trig-sample. + +We support the following interrupt sources (trigger types): + * hrtimer, uses high resolution timers as interrupt source + +3.1 Hrtimer triggers creation and destruction + +Loading iio-trig-hrtimer module will register hrtimer trigger types allowing +users to create hrtimer triggers under /config/iio/triggers/hrtimer. + +e.g.: + +$ mkdir /config/iio/triggers/hrtimer/instance1 +$ rmdir /config/iio/triggers/hrtimer/instance1 + +Each trigger can have one or more attributes specific to the trigger type. + +3.2 "hrtimer" trigger types attributes + +"hrtimer" trigger type doesn't have any configurable attribute from /config dir. +It does introduce the sampling_frequency attribute to the trigger directory. 
diff --git a/Documentation/infiniband/core_locking.txt b/Documentation/infiniband/core_locking.txt index e16785422..4b1f36b6a 100644 --- a/Documentation/infiniband/core_locking.txt +++ b/Documentation/infiniband/core_locking.txt @@ -15,7 +15,6 @@ Sleeping and interrupt context modify_ah query_ah destroy_ah - bind_mw post_send post_recv poll_cq @@ -31,7 +30,6 @@ Sleeping and interrupt context ib_modify_ah ib_query_ah ib_destroy_ah - ib_bind_mw ib_post_send ib_post_recv ib_req_notify_cq diff --git a/Documentation/ioctl/botching-up-ioctls.txt b/Documentation/ioctl/botching-up-ioctls.txt index 45fe78c58..cc30b1479 100644 --- a/Documentation/ioctl/botching-up-ioctls.txt +++ b/Documentation/ioctl/botching-up-ioctls.txt @@ -122,7 +122,7 @@ Time, Waiting and Missing it ---------------------------- GPUs do most everything asynchronously, so we have a need to time operations and -wait for oustanding ones. This is really tricky business; at the moment none of +wait for outstanding ones. This is really tricky business; at the moment none of the ioctls supported by the drm/i915 get this fully right, which means there's still tons more lessons to learn here. @@ -146,7 +146,7 @@ still tons more lessons to learn here. ioctl restartable relative timeouts tend to be too coarse and can indefinitely extend your wait time due to rounding on each restart. Especially if your reference clock is something really slow like the display - frame counter. With a spec laywer hat on this isn't a bug since timeouts can + frame counter. With a spec lawyer hat on this isn't a bug since timeouts can always be extended - but users will surely hate you if their neat animations starts to stutter due to this. @@ -176,7 +176,7 @@ entails its own little set of pitfalls: * Ensure that you have sufficient insulation between different clients. By default pick a private per-fd namespace which forces any sharing to be done - explictly. Only go with a more global per-device namespace if the objects + explicitly. 
Only go with a more global per-device namespace if the objects are truly device-unique. One counterexample in the drm modeset interfaces is that the per-device modeset objects like connectors share a namespace with framebuffer objects, which mostly are not shared at all. A separate diff --git a/Documentation/ja_JP/HOWTO b/Documentation/ja_JP/HOWTO index 5a0f2bdc2..8d5465d3f 100644 --- a/Documentation/ja_JP/HOWTO +++ b/Documentation/ja_JP/HOWTO @@ -245,7 +245,7 @@ Linux カーネルソースツリーの中に含まれる、きれいにし、 自己参照方式で、索引がついた web 形式で、ソースコードを参照することが できます。この最新の素晴しいカーネルコードのリポジトリは以下で見つかり ます- - http://lxr.linux.no/+trees + http://lxr.free-electrons.com/ 開発プロセス ----------------------- @@ -366,7 +366,6 @@ http://patchwork.kernel.org/ でリストされています。 に全サブシステムツリーからほぼ毎日プルされてできる特別なテスト用のリ ポジトリが存在します- http://git.kernel.org/?p=linux/kernel/git/next/linux-next.git - http://linux.f-seidel.de/linux-next/pmwiki/ このやり方によって、-next カーネルは次のマージ機会でどんなものがメイン ラインカーネルにマージされるか、おおまかなの展望を提供します。-next diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt index 08913361e..fe217c1c2 100644 --- a/Documentation/kernel-docs.txt +++ b/Documentation/kernel-docs.txt @@ -631,7 +631,7 @@ between two versions of a file". * Name: "Cross-Referencing Linux" - URL: http://lxr.linux.no/source/ + URL: http://lxr.free-electrons.com/ Keywords: Browsing source code. Description: Another web-based Linux kernel source code browser. Lots of cross references to variables and functions. You can see diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index ffad13fdf..76d68c5df 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -472,6 +472,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Change the amount of debugging information output when initialising the APIC and IO-APIC components. 
+ apic_extnmi= [APIC,X86] External NMI delivery setting + Format: { bsp (default) | all | none } + bsp: External NMI is delivered only to CPU 0 + all: External NMIs are broadcast to all CPUs as a + backup of CPU 0 + none: External NMI is masked for all CPUs. This is + useful so that a dump capture kernel won't be + shot down by NMI + autoconf= [IPV6] See Documentation/networking/ipv6.txt. @@ -599,6 +608,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. cut the overhead, others just disable the usage. So only cgroup_disable=memory is actually worthy} + cgroup.memory= [KNL] Pass options to the cgroup memory controller. + Format: + nosocket -- Disable socket memory accounting. + nokmem -- Disable kernel memory accounting. + checkreqprot [SELINUX] Set initial checkreqprot flag value. Format: { "0" | "1" } See security/selinux/Kconfig help text. @@ -721,16 +735,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted. uart[8250],io,[,options] uart[8250],mmio,[,options] + uart[8250],mmio16,[,options] uart[8250],mmio32,[,options] uart[8250],0x[,options] Start an early, polled-mode console on the 8250/16550 UART at the specified I/O port or MMIO address, switching to the matching ttyS device later. MMIO inter-register address stride is either 8-bit - (mmio) or 32-bit (mmio32). - If none of [io|mmio|mmio32], is assumed to be - equivalent to 'mmio'. 'options' are specified in the - same format described for ttyS above; if unspecified, + (mmio), 16-bit (mmio16), or 32-bit (mmio32). + If none of [io|mmio|mmio16|mmio32], is assumed + to be equivalent to 'mmio'. 'options' are specified in + the same format described for ttyS above; if unspecified, the h/w is not re-initialized. hvc Use the hypervisor console device . This is for @@ -1002,10 +1017,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. unspecified, the h/w is not initialized. 
pl011, + pl011,mmio32, Start an early, polled-mode console on a pl011 serial port at the specified address. The pl011 serial port must already be setup and configured. Options are not - yet supported. + yet supported. If 'mmio32' is specified, then only + the driver will use only 32-bit accessors to read/write + the device registers. msm_serial, Start an early, polled-mode console on an msm serial @@ -1436,6 +1454,41 @@ bytes respectively. Such letter suffixes can also be entirely omitted. In such case C2/C3 won't be used again. idle=nomwait: Disable mwait for CPU C-states + ieee754= [MIPS] Select IEEE Std 754 conformance mode + Format: { strict | legacy | 2008 | relaxed } + Default: strict + + Choose which programs will be accepted for execution + based on the IEEE 754 NaN encoding(s) supported by + the FPU and the NaN encoding requested with the value + of an ELF file header flag individually set by each + binary. Hardware implementations are permitted to + support either or both of the legacy and the 2008 NaN + encoding mode. + + Available settings are as follows: + strict accept binaries that request a NaN encoding + supported by the FPU + legacy only accept legacy-NaN binaries, if supported + by the FPU + 2008 only accept 2008-NaN binaries, if supported + by the FPU + relaxed accept any binaries regardless of whether + supported by the FPU + + The FPU emulator is always able to support both NaN + encodings, so if no FPU hardware is present or it has + been disabled with 'nofpu', then the settings of + 'legacy' and '2008' strap the emulator accordingly, + 'relaxed' straps the emulator for both legacy-NaN and + 2008-NaN, whereas 'strict' enables legacy-NaN only on + legacy processors and both NaN encodings on MIPS32 or + MIPS64 CPUs. + + The setting for ABS.fmt/NEG.fmt instruction execution + mode generally follows that for the NaN encoding, + except where unsupported by hardware. 
+ ignore_loglevel [KNL] Ignore loglevel setting - this will print /all/ kernel messages to the console. Useful for debugging. @@ -1443,6 +1496,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. could change it dynamically, usually by /sys/module/printk/parameters/ignore_loglevel. + ignore_rlimit_data + Ignore RLIMIT_DATA setting for data mappings, + print warning at first misuse. Can be changed via + /sys/module/kernel/parameters/ignore_rlimit_data. + ihash_entries= [KNL] Set number of hash buckets for inode cache. @@ -2575,8 +2633,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. notsc [BUGS=X86-32] Disable Time Stamp Counter - nousb [USB] Disable the USB subsystem - nowatchdog [KNL] Disable both lockup detectors, i.e. soft-lockup and NMI watchdog (hard-lockup). @@ -2733,10 +2789,16 @@ bytes respectively. Such letter suffixes can also be entirely omitted. hardware access methods are allowed. Use this if you experience crashes upon bootup and you suspect they are caused by the BIOS. - conf1 [X86] Force use of PCI Configuration - Mechanism 1. - conf2 [X86] Force use of PCI Configuration - Mechanism 2. + conf1 [X86] Force use of PCI Configuration Access + Mechanism 1 (config address in IO port 0xCF8, + data in IO port 0xCFC, both 32-bit). + conf2 [X86] Force use of PCI Configuration Access + Mechanism 2 (IO port 0xCF8 is an 8-bit port for + the function, IO port 0xCFA, also 8-bit, sets + bus number. The config space is then accessed + through ports 0xC000-0xCFFF). + See http://wiki.osdev.org/PCI for more info + on the configuration access mechanisms. noaer [PCIE] If the PCIEAER kernel config parameter is enabled, this kernel boot option can be used to disable the use of PCIE advanced error reporting. @@ -2978,6 +3040,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. may be specified. Format: ,.... 
+ ppc_strict_facility_enable + [PPC] This option catches any kernel floating point, + Altivec, VSX and SPE outside of regions specifically + allowed (eg kernel_enable_fpu()/kernel_disable_fpu()). + There is some performance impact when enabling this. + print-fatal-signals= [KNL] debug: print fatal signals @@ -3050,9 +3118,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. raid= [HW,RAID] See Documentation/md.txt. - ramdisk_blocksize= [RAM] - See Documentation/blockdev/ramdisk.txt. - ramdisk_size= [RAM] Sizes of RAM disks in kilobytes See Documentation/blockdev/ramdisk.txt. @@ -3296,18 +3361,35 @@ bytes respectively. Such letter suffixes can also be entirely omitted. rcutorture.verbose= [KNL] Enable additional printk() statements. + rcupdate.rcu_cpu_stall_suppress= [KNL] + Suppress RCU CPU stall warning messages. + + rcupdate.rcu_cpu_stall_timeout= [KNL] + Set timeout for RCU CPU stall warning messages. + rcupdate.rcu_expedited= [KNL] Use expedited grace-period primitives, for example, synchronize_rcu_expedited() instead of synchronize_rcu(). This reduces latency, but can increase CPU utilization, degrade real-time latency, and degrade energy efficiency. - - rcupdate.rcu_cpu_stall_suppress= [KNL] - Suppress RCU CPU stall warning messages. - - rcupdate.rcu_cpu_stall_timeout= [KNL] - Set timeout for RCU CPU stall warning messages. + No effect on CONFIG_TINY_RCU kernels. + + rcupdate.rcu_normal= [KNL] + Use only normal grace-period primitives, + for example, synchronize_rcu() instead of + synchronize_rcu_expedited(). This improves + real-time latency, CPU utilization, and + energy efficiency, but can expose users to + increased grace-period latency. This parameter + overrides rcupdate.rcu_expedited. No effect on + CONFIG_TINY_RCU kernels. + + rcupdate.rcu_normal_after_boot= [KNL] + Once boot has completed (that is, after + rcu_end_inkernel_boot() has been invoked), use + only normal grace-period primitives. 
No effect + on CONFIG_TINY_RCU kernels. rcupdate.rcu_task_stall_timeout= [KNL] Set timeout in jiffies for RCU task stall warning @@ -3446,6 +3528,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. sched_debug [KNL] Enables verbose scheduler debug messages. + schedstats= [KNL,X86] Enable or disable scheduler statistics. + Allowed values are enable and disable. This feature + incurs a small amount of overhead in the scheduler + but is useful for debugging and performance tuning. + skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate xtime_lock contention on larger systems, and/or RCU lock contention on all systems with CONFIG_MAXSMP set. @@ -3874,6 +3961,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. usbcore.usbfs_snoop= [USB] Set to log all usbfs traffic (default 0 = off). + usbcore.usbfs_snoop_max= + [USB] Maximum number of bytes to snoop in each URB + (default = 65536). + usbcore.blinkenlights= [USB] Set to cycle leds on hubs (default 0 = off). @@ -3894,6 +3985,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. USB_REQ_GET_DESCRIPTOR request in milliseconds (default 5000 = 5.0 seconds). + usbcore.nousb [USB] Disable the USB subsystem + usbhid.mousepoll= [USBHID] The interval which mice are to be polled at. @@ -3966,9 +4059,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. HIGHMEM regardless of setting of CONFIG_HIGHPTE. - uuid_debug= (Boolean) whether to enable debugging of TuxOnIce's - uuid support. - vdso= [X86,SH] On X86_32, this is an alias for vdso32=. Otherwise: @@ -4117,6 +4207,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. or other driver-specific files in the Documentation/watchdog/ directory. + workqueue.watchdog_thresh= + If CONFIG_WQ_WATCHDOG is configured, workqueue can + warn about stall conditions and dump internal state to + help debugging.
0 disables workqueue stall + detection; otherwise, it's the stall threshold + duration in seconds. The default value is 30 and + it can be updated at runtime by writing to the + corresponding sysfs file. + workqueue.disable_numa By default, all work items queued to unbound workqueues are affine to the NUMA nodes they're @@ -4141,6 +4240,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted. The default value of this parameter is determined by the config option CONFIG_WQ_POWER_EFFICIENT_DEFAULT. + workqueue.debug_force_rr_cpu + Workqueue used to implicitly guarantee that work + items queued without explicit CPU specified are put + on the local CPU. This guarantee is no longer true + and while local CPU is still preferred work items + may be put on foreign CPUs. This debug option + forces round-robin CPU selection to flush out + usages which depend on the now broken guarantee. + When enabled, memory and cache locality will be + impacted. + x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of default x2apic cluster mode on platforms supporting x2apic. diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt index f4cbfe0ba..edec3a3e6 100644 --- a/Documentation/kernel-per-CPU-kthreads.txt +++ b/Documentation/kernel-per-CPU-kthreads.txt @@ -90,7 +90,7 @@ BLOCK_SOFTIRQ: Do all of the following: from being initiated from tasks that might run on the CPU to be de-jittered. (It is OK to force this CPU offline and then bring it back online before you start your application.) -BLOCK_IOPOLL_SOFTIRQ: Do all of the following: +IRQ_POLL_SOFTIRQ: Do all of the following: 1. Force block-device interrupts onto some other CPU. 2. Initiate any block I/O and block-I/O polling on other CPUs. 3. 
Once your application has started, prevent CPU-hotplug operations diff --git a/Documentation/ko_KR/HOWTO b/Documentation/ko_KR/HOWTO index dc2ff8f61..1aef53e6c 100644 --- a/Documentation/ko_KR/HOWTO +++ b/Documentation/ko_KR/HOWTO @@ -213,7 +213,7 @@ Documentation/DocBook/ 디렉토리 내에서 만들어지며 PDF, Postscript, H 것은 Linux Cross-Reference project이며 그것은 자기 참조 방식이며 소스코드를 인덱스된 웹 페이지들의 형태로 보여준다. 최신의 멋진 커널 코드 저장소는 다음을 통하여 참조할 수 있다. - http://lxr.linux.no/+trees + http://lxr.free-electrons.com/ 개발 프로세스 @@ -222,16 +222,16 @@ Documentation/DocBook/ 디렉토리 내에서 만들어지며 PDF, Postscript, H 리눅스 커널 개발 프로세스는 현재 몇몇 다른 메인 커널 "브랜치들"과 서브시스템에 특화된 커널 브랜치들로 구성된다. 몇몇 다른 메인 브랜치들은 다음과 같다. - - main 3.x 커널 트리 - - 3.x.y - 안정된 커널 트리 - - 3.x -git 커널 패치들 + - main 4.x 커널 트리 + - 4.x.y - 안정된 커널 트리 + - 4.x -git 커널 패치들 - 서브시스템을 위한 커널 트리들과 패치들 - - 3.x - 통합 테스트를 위한 next 커널 트리 + - 4.x - 통합 테스트를 위한 next 커널 트리 -3.x 커널 트리 +4.x 커널 트리 --------------- -3.x 커널들은 Linux Torvalds가 관리하며 kernel.org의 pub/linux/kernel/v3.x/ +4.x 커널들은 Linux Torvalds가 관리하며 kernel.org의 pub/linux/kernel/v4.x/ 디렉토리에서 참조될 수 있다.개발 프로세스는 다음과 같다. - 새로운 커널이 배포되자마자 2주의 시간이 주어진다. 이 기간동은 메인테이너들은 큰 diff들을 Linus에게 제출할 수 있다. 대개 이 패치들은 @@ -262,20 +262,20 @@ Andrew Morton의 글이 있다. 버그의 상황에 따라 배포되는 것이지 미리정해 놓은 시간에 따라 배포되는 것은 아니기 때문이다." -3.x.y - 안정 커널 트리 +4.x.y - 안정 커널 트리 ------------------------ -3 자리 숫자로 이루어진 버젼의 커널들은 -stable 커널들이다. 그것들은 3.x +3 자리 숫자로 이루어진 버젼의 커널들은 -stable 커널들이다. 그것들은 4.x 커널에서 발견된 큰 회귀들이나 보안 문제들 중 비교적 작고 중요한 수정들을 포함한다. 이것은 가장 최근의 안정적인 커널을 원하는 사용자에게 추천되는 브랜치이며, 개발/실험적 버젼을 테스트하는 것을 돕고자 하는 사용자들과는 별로 관련이 없다. -어떤 3.x.y 커널도 사용할 수 없다면 그때는 가장 높은 숫자의 3.x +어떤 4.x.y 커널도 사용할 수 없다면 그때는 가장 높은 숫자의 4.x 커널이 현재의 안정 커널이다. -3.x.y는 "stable" 팀에 의해 관리되며 거의 매번 격주로 +4.x.y는 "stable" 팀에 의해 관리되며 거의 매번 격주로 배포된다. 커널 트리 문서들 내에 Documentation/stable_kernel_rules.txt 파일은 어떤 @@ -283,7 +283,7 @@ Andrew Morton의 글이 있다. 진행되는지를 설명한다. -3.x -git 패치들 +4.x -git 패치들 ------------------ git 저장소(그러므로 -git이라는 이름이 붙음)에는 날마다 관리되는 Linus의 커널 트리의 snapshot 들이 있다. 
이 패치들은 일반적으로 날마다 배포되며 @@ -312,13 +312,12 @@ Linus의 트리의 현재 상태를 나타낸다. 이 패치들은 정상적인 대부분의 이러한 patchwork 사이트는 http://patchwork.kernel.org/ 또는 http://patchwork.ozlabs.org/ 에 나열되어 있다. -3.x - 통합 테스트를 위한 next 커널 트리 +4.x - 통합 테스트를 위한 next 커널 트리 ----------------------------------------- -서브시스템 트리들의 변경사항들은 mainline 3.x 트리로 들어오기 전에 통합 +서브시스템 트리들의 변경사항들은 mainline 4.x 트리로 들어오기 전에 통합 테스트를 거쳐야 한다. 이런 목적으로, 모든 서브시스템 트리의 변경사항을 거의 매일 받아가는 특수한 테스트 저장소가 존재한다: http://git.kernel.org/?p=linux/kernel/git/sfr/linux-next.git - http://linux.f-seidel.de/linux-next/pmwiki/ 이런 식으로, -next 커널을 통해 다음 머지 기간에 메인라인 커널에 어떤 변경이 가해질 것인지 간략히 알 수 있다. 모험심 강한 테스터라면 -next 커널에서 테스트를 diff --git a/Documentation/leds/leds-class.txt b/Documentation/leds/leds-class.txt index 62261c040..d406d9833 100644 --- a/Documentation/leds/leds-class.txt +++ b/Documentation/leds/leds-class.txt @@ -52,6 +52,19 @@ above leaves scope for further attributes should they be needed. If sections of the name don't apply, just leave that section blank. +Brightness setting API +====================== + +LED subsystem core exposes following API for setting brightness: + + - led_set_brightness : it is guaranteed not to sleep, passing LED_OFF stops + blinking, + - led_set_brightness_sync : for use cases when immediate effect is desired - + it can block the caller for the time required for accessing + device registers and can sleep, passing LED_OFF stops hardware + blinking, returns -EBUSY if software blink fallback is enabled. + + Hardware accelerated blink of LEDs ================================== diff --git a/Documentation/md-cluster.txt b/Documentation/md-cluster.txt index 1b794369e..c100c7163 100644 --- a/Documentation/md-cluster.txt +++ b/Documentation/md-cluster.txt @@ -3,7 +3,7 @@ The cluster MD is a shared-device RAID for a cluster. 1. On-disk format -Separate write-intent-bitmap are used for each cluster node. +Separate write-intent-bitmaps are used for each cluster node. 
The bitmaps record all writes that may have been started on that node, and may not yet have finished. The on-disk layout is: @@ -14,117 +14,161 @@ and may not yet have finished. The on-disk layout is: | bm super[2] + bits | bm bits [2, contd] | bm super[3] + bits | | bm bits [3, contd] | | | -During "normal" functioning we assume the filesystem ensures that only one -node writes to any given block at a time, so a write -request will +During "normal" functioning we assume the filesystem ensures that only +one node writes to any given block at a time, so a write request will + - set the appropriate bit (if not already set) - commit the write to all mirrors - schedule the bit to be cleared after a timeout. -Reads are just handled normally. It is up to the filesystem to -ensure one node doesn't read from a location where another node (or the same +Reads are just handled normally. It is up to the filesystem to ensure +one node doesn't read from a location where another node (or the same node) is writing. 2. DLM Locks for management -There are two locks for managing the device: +There are three groups of locks for managing the device: 2.1 Bitmap lock resource (bm_lockres) - The bm_lockres protects individual node bitmaps. They are named in the - form bitmap001 for node 1, bitmap002 for node and so on. When a node - joins the cluster, it acquires the lock in PW mode and it stays so - during the lifetime the node is part of the cluster. The lock resource - number is based on the slot number returned by the DLM subsystem. Since - DLM starts node count from one and bitmap slots start from zero, one is - subtracted from the DLM slot number to arrive at the bitmap slot number. + The bm_lockres protects individual node bitmaps. They are named in + the form bitmap000 for node 1, bitmap001 for node 2 and so on. When a + node joins the cluster, it acquires the lock in PW mode and it stays + so during the lifetime the node is part of the cluster. 
The lock + resource number is based on the slot number returned by the DLM + subsystem. Since DLM starts node count from one and bitmap slots + start from zero, one is subtracted from the DLM slot number to arrive + at the bitmap slot number. + + The LVB of the bitmap lock for a particular node records the range + of sectors that are being re-synced by that node. No other + node may write to those sectors. This is used when a new node + joins the cluster. + +2.2 Message passing locks + + Each node has to communicate with other nodes when starting or ending + resync, and for metadata superblock updates. This communication is + managed through three locks: "token", "message", and "ack", together + with the Lock Value Block (LVB) of the "message" lock. + +2.3 new-device management + + A single lock: "no-new-dev" is used to co-ordinate the addition of + new devices - this must be synchronized across the array. + Normally all nodes hold a concurrent-read lock on this device. 3. Communication -Each node has to communicate with other nodes when starting or ending -resync, and metadata superblock updates. + Messages can be broadcast to all nodes, and the sender waits for all + other nodes to acknowledge the message before proceeding. Only one + message can be processed at a time. 3.1 Message Types - There are 3 types, of messages which are passed + There are six types of messages which are passed: - 3.1.1 METADATA_UPDATED: informs other nodes that the metadata has been - updated, and the node must re-read the md superblock. This is performed - synchronously. + 3.1.1 METADATA_UPDATED: informs other nodes that the metadata has + been updated, and the node must re-read the md superblock. This is + performed synchronously. It is primarily used to signal device + failure. - 3.1.2 RESYNC: informs other nodes that a resync is initiated or ended - so that each node may suspend or resume the region.
+ 3.1.2 RESYNCING: informs other nodes that a resync is initiated or + ended so that each node may suspend or resume the region. Each + RESYNCING message identifies a range of the devices that the + sending node is about to resync. This overrides any previous + notification from that node: only one range can be resynced at a + time per-node. + + 3.1.3 NEWDISK: informs other nodes that a device is being added to + the array. Message contains an identifier for that device. See + below for further details. + + 3.1.4 REMOVE: A failed or spare device is being removed from the + array. The slot-number of the device is included in the message. + + 3.1.5 RE_ADD: A failed device is being re-activated - the assumption + is that it has been determined to be working again. + + 3.1.6 BITMAP_NEEDS_SYNC: if a node is stopped locally but the bitmap + isn't clean, then another node is informed to take the ownership of + resync. 3.2 Communication mechanism The DLM LVB is used to communicate within nodes of the cluster. There are three resources used for the purpose: - 3.2.1 Token: The resource which protects the entire communication + 3.2.1 token: The resource which protects the entire communication system. The node having the token resource is allowed to communicate. - 3.2.2 Message: The lock resource which carries the data to + 3.2.2 message: The lock resource which carries the data to communicate. - 3.2.3 Ack: The resource, acquiring which means the message has been + 3.2.3 ack: The resource, acquiring which means the message has been acknowledged by all nodes in the cluster. The BAST of the resource - is used to inform the receive node that a node wants to communicate. + is used to inform the receiving node that a node wants to + communicate. The algorithm is: - 1. receive status + 1. receive status - all nodes have concurrent-reader lock on "ack". - sender receiver receiver - ACK:CR ACK:CR ACK:CR + sender receiver receiver + "ack":CR "ack":CR "ack":CR - 2.
sender get EX of TOKEN - sender get EX of MESSAGE + 2. sender get EX on "token" + sender get EX on "message" sender receiver receiver - TOKEN:EX ACK:CR ACK:CR - MESSAGE:EX - ACK:CR + "token":EX "ack":CR "ack":CR + "message":EX + "ack":CR - Sender checks that it still needs to send a message. Messages received - or other events that happened while waiting for the TOKEN may have made - this message inappropriate or redundant. + Sender checks that it still needs to send a message. Messages + received or other events that happened while waiting for the + "token" may have made this message inappropriate or redundant. - 3. sender write LVB. - sender down-convert MESSAGE from EX to CW - sender try to get EX of ACK - [ wait until all receiver has *processed* the MESSAGE ] + 3. sender writes LVB. + sender down-convert "message" from EX to CW + sender try to get EX of "ack" + [ wait until all receivers have *processed* the "message" ] - [ triggered by bast of ACK ] - receiver get CR of MESSAGE + [ triggered by bast of "ack" ] + receiver get CR on "message" receiver read LVB receiver processes the message [ wait finish ] - receiver release ACK - - sender receiver receiver - TOKEN:EX MESSAGE:CR MESSAGE:CR - MESSAGE:CR - ACK:EX - - 4. triggered by grant of EX on ACK (indicating all receivers have processed - message) - sender down-convert ACK from EX to CR - sender release MESSAGE - sender release TOKEN - receiver upconvert to PR of MESSAGE - receiver get CR of ACK - receiver release MESSAGE + receiver releases "ack" + receiver tries to get PR on "message" + + sender receiver receiver + "token":EX "message":CR "message":CR + "message":CW + "ack":EX + + 4. 
triggered by grant of EX on "ack" (indicating all receivers + have processed message) + sender down-converts "ack" from EX to CR + sender releases "message" + sender releases "token" + receiver upconvert to PR on "message" + receiver get CR of "ack" + receiver release "message" sender receiver receiver - ACK:CR ACK:CR ACK:CR + "ack":CR "ack":CR "ack":CR 4. Handling Failures 4.1 Node Failure - When a node fails, the DLM informs the cluster with the slot. The node - starts a cluster recovery thread. The cluster recovery thread: + + When a node fails, the DLM informs the cluster with the slot + number. The node starts a cluster recovery thread. The cluster + recovery thread: + - acquires the bitmap lock of the failed node - opens the bitmap - reads the bitmap of the failed node @@ -132,45 +176,143 @@ The algorithm is: - cleans the bitmap of the failed node - releases bitmap lock of the failed node - initiates resync of the bitmap on the current node + md_check_recovery is invoked within recover_bitmaps, + then md_check_recovery -> metadata_update_start/finish, + it will lock the communication by lock_comm. + Which means when one node is resyncing it blocks all + other nodes from writing anywhere on the array. - The resync process, is the regular md resync. However, in a clustered + The resync process is the regular md resync. However, in a clustered environment when a resync is performed, it needs to tell other nodes of the areas which are suspended. Before a resync starts, the node - send out RESYNC_START with the (lo,hi) range of the area which needs - to be suspended. Each node maintains a suspend_list, which contains - the list of ranges which are currently suspended. On receiving - RESYNC_START, the node adds the range to the suspend_list. Similarly, - when the node performing resync finishes, it send RESYNC_FINISHED - to other nodes and other nodes remove the corresponding entry from - the suspend_list. 
+ send out RESYNCING with the (lo,hi) range of the area which needs to + be suspended. Each node maintains a suspend_list, which contains the + list of ranges which are currently suspended. On receiving RESYNCING, + the node adds the range to the suspend_list. Similarly, when the node + performing resync finishes, it sends RESYNCING with an empty range to + other nodes and other nodes remove the corresponding entry from the + suspend_list. - A helper function, should_suspend() can be used to check if a particular - I/O range should be suspended or not. + A helper function, ->area_resyncing() can be used to check if a + particular I/O range should be suspended or not. 4.2 Device Failure + Device failures are handled and communicated with the metadata update - routine. + routine. When a node detects a device failure it does not allow + any further writes to that device until the failure has been + acknowledged by all other nodes. 5. Adding a new Device -For adding a new device, it is necessary that all nodes "see" the new device -to be added. For this, the following algorithm is used: + + For adding a new device, it is necessary that all nodes "see" the new + device to be added. For this, the following algorithm is used: 1. Node 1 issues mdadm --manage /dev/mdX --add /dev/sdYY which issues - ioctl(ADD_NEW_DISC with disc.state set to MD_DISK_CLUSTER_ADD) - 2. Node 1 sends NEWDISK with uuid and slot number + ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CLUSTER_ADD) + 2. Node 1 sends a NEWDISK message with uuid and slot number 3. Other nodes issue kobject_uevent_env with uuid and slot number (Steps 4,5 could be a udev rule) 4. In userspace, the node searches for the disk, perhaps using blkid -t SUB_UUID="" - 5. Other nodes issue either of the following depending on whether the disk - was found: + 5. 
Other nodes issue either of the following depending on whether + the disk was found: ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CANDIDATE and - disc.number set to slot number) + disc.number set to slot number) ioctl(CLUSTERED_DISK_NACK) - 6. Other nodes drop lock on no-new-devs (CR) if device is found - 7. Node 1 attempts EX lock on no-new-devs - 8. If node 1 gets the lock, it sends METADATA_UPDATED after unmarking the disk - as SpareLocal - 9. If not (get no-new-dev lock), it fails the operation and sends METADATA_UPDATED - 10. Other nodes get the information whether a disk is added or not - by the following METADATA_UPDATED. + 6. Other nodes drop lock on "no-new-devs" (CR) if device is found + 7. Node 1 attempts EX lock on "no-new-dev" + 8. If node 1 gets the lock, it sends METADATA_UPDATED after + unmarking the disk as SpareLocal + 9. If not (get "no-new-dev" lock), it fails the operation and sends + METADATA_UPDATED. + 10. Other nodes get the information whether a disk is added or not + by the following METADATA_UPDATED. + +6. Module interface. + + There are 17 call-backs which the md core can make to the cluster + module. Understanding these can give a good overview of the whole + process. + +6.1 join(nodes) and leave() + + These are called when an array is started with a clustered bitmap, + and when the array is stopped. join() ensures the cluster is + available and initializes the various resources. + Only the first 'nodes' nodes in the cluster can use the array. + +6.2 slot_number() + + Reports the slot number advised by the cluster infrastructure. + Range is from 0 to nodes-1. + +6.3 resync_info_update() + + This updates the resync range that is stored in the bitmap lock. + The starting point is updated as the resync progresses. The + end point is always the end of the array. + It does *not* send a RESYNCING message. + +6.4 resync_start(), resync_finish() + + These are called when resync/recovery/reshape starts or stops. 
+ They update the resyncing range in the bitmap lock and also + send a RESYNCING message. resync_start reports the whole + array as resyncing, resync_finish reports none of it. + + resync_finish() also sends a BITMAP_NEEDS_SYNC message which + allows some other node to take over. + +6.5 metadata_update_start(), metadata_update_finish(), + metadata_update_cancel(). + + metadata_update_start is used to get exclusive access to + the metadata. If a change is still needed once that access is + gained, metadata_update_finish() will send a METADATA_UPDATED + message to all other nodes, otherwise metadata_update_cancel() + can be used to release the lock. + +6.6 area_resyncing() + + This combines two elements of functionality. + + Firstly, it will check if any node is currently resyncing + anything in a given range of sectors. If any resync is found, + then the caller will avoid writing or read-balancing in that + range. + + Secondly, while node recovery is happening it reports that + all areas are resyncing for READ requests. This avoids races + between the cluster-filesystem and the cluster-RAID handling + a node failure. + +6.7 add_new_disk_start(), add_new_disk_finish(), new_disk_ack() + + These are used to manage the new-disk protocol described above. + When a new device is added, add_new_disk_start() is called before + it is bound to the array and, if that succeeds, add_new_disk_finish() + is called once the device is fully added. + + When a device is added in acknowledgement to a previous + request, or when the device is declared "unavailable", + new_disk_ack() is called. + +6.8 remove_disk() + + This is called when a spare or failed device is removed from + the array. It causes a REMOVE message to be sent to other nodes. + +6.9 gather_bitmaps() + + This sends a RE_ADD message to all other nodes and then + gathers bitmap information from all bitmaps. This combined + bitmap is then used to recover the re-added device.
+ +6.10 lock_all_bitmaps() and unlock_all_bitmaps() + + These are called when change bitmap to none. If a node plans + to clear the cluster raid's bitmap, it need to make sure no other + nodes are using the raid which is achieved by lock all bitmap + locks within the cluster, and also those locks are unlocked + accordingly. diff --git a/Documentation/media-framework.txt b/Documentation/media-framework.txt deleted file mode 100644 index f552a75c0..000000000 --- a/Documentation/media-framework.txt +++ /dev/null @@ -1,372 +0,0 @@ -Linux kernel media framework -============================ - -This document describes the Linux kernel media framework, its data structures, -functions and their usage. - - -Introduction ------------- - -The media controller API is documented in DocBook format in -Documentation/DocBook/media/v4l/media-controller.xml. This document will focus -on the kernel-side implementation of the media framework. - - -Abstract media device model ---------------------------- - -Discovering a device internal topology, and configuring it at runtime, is one -of the goals of the media framework. To achieve this, hardware devices are -modelled as an oriented graph of building blocks called entities connected -through pads. - -An entity is a basic media hardware building block. It can correspond to -a large variety of logical blocks such as physical hardware devices -(CMOS sensor for instance), logical hardware devices (a building block -in a System-on-Chip image processing pipeline), DMA channels or physical -connectors. - -A pad is a connection endpoint through which an entity can interact with -other entities. Data (not restricted to video) produced by an entity -flows from the entity's output to one or more entity inputs. Pads should -not be confused with physical pins at chip boundaries. - -A link is a point-to-point oriented connection between two pads, either -on the same entity or on different entities. Data flows from a source -pad to a sink pad. 
- - -Media device ------------- - -A media device is represented by a struct media_device instance, defined in -include/media/media-device.h. Allocation of the structure is handled by the -media device driver, usually by embedding the media_device instance in a -larger driver-specific structure. - -Drivers register media device instances by calling - - media_device_register(struct media_device *mdev); - -The caller is responsible for initializing the media_device structure before -registration. The following fields must be set: - - - dev must point to the parent device (usually a pci_dev, usb_interface or - platform_device instance). - - - model must be filled with the device model name as a NUL-terminated UTF-8 - string. The device/model revision must not be stored in this field. - -The following fields are optional: - - - serial is a unique serial number stored as a NUL-terminated ASCII string. - The field is big enough to store a GUID in text form. If the hardware - doesn't provide a unique serial number this field must be left empty. - - - bus_info represents the location of the device in the system as a - NUL-terminated ASCII string. For PCI/PCIe devices bus_info must be set to - "PCI:" (or "PCIe:") followed by the value of pci_name(). For USB devices, - the usb_make_path() function must be used. This field is used by - applications to distinguish between otherwise identical devices that don't - provide a serial number. - - - hw_revision is the hardware device revision in a driver-specific format. - When possible the revision should be formatted with the KERNEL_VERSION - macro. - - - driver_version is formatted with the KERNEL_VERSION macro. The version - minor must be incremented when new features are added to the userspace API - without breaking binary compatibility. The version major must be - incremented when binary compatibility is broken. - -Upon successful registration a character device named media[0-9]+ is created. 
-The device major and minor numbers are dynamic. The model name is exported as -a sysfs attribute. - -Drivers unregister media device instances by calling - - media_device_unregister(struct media_device *mdev); - -Unregistering a media device that hasn't been registered is *NOT* safe. - - -Entities, pads and links ------------------------- - -- Entities - -Entities are represented by a struct media_entity instance, defined in -include/media/media-entity.h. The structure is usually embedded into a -higher-level structure, such as a v4l2_subdev or video_device instance, -although drivers can allocate entities directly. - -Drivers initialize entities by calling - - media_entity_init(struct media_entity *entity, u16 num_pads, - struct media_pad *pads, u16 extra_links); - -The media_entity name, type, flags, revision and group_id fields can be -initialized before or after calling media_entity_init. Entities embedded in -higher-level standard structures can have some of those fields set by the -higher-level framework. - -As the number of pads is known in advance, the pads array is not allocated -dynamically but is managed by the entity driver. Most drivers will embed the -pads array in a driver-specific structure, avoiding dynamic allocation. - -Drivers must set the direction of every pad in the pads array before calling -media_entity_init. The function will initialize the other pads fields. - -Unlike the number of pads, the total number of links isn't always known in -advance by the entity driver. As an initial estimate, media_entity_init -pre-allocates a number of links equal to the number of pads plus an optional -number of extra links. The links array will be reallocated if it grows beyond -the initial estimate. - -Drivers register entities with a media device by calling - - media_device_register_entity(struct media_device *mdev, - struct media_entity *entity); - -Entities are identified by a unique positive integer ID. 
Drivers can provide an -ID by filling the media_entity id field prior to registration, or request the -media controller framework to assign an ID automatically. Drivers that provide -IDs manually must ensure that all IDs are unique. IDs are not guaranteed to be -contiguous even when they are all assigned automatically by the framework. - -Drivers unregister entities by calling - - media_device_unregister_entity(struct media_entity *entity); - -Unregistering an entity will not change the IDs of the other entities, and the -ID will never be reused for a newly registered entity. - -When a media device is unregistered, all its entities are unregistered -automatically. No manual entities unregistration is then required. - -Drivers free resources associated with an entity by calling - - media_entity_cleanup(struct media_entity *entity); - -This function must be called during the cleanup phase after unregistering the -entity. Note that the media_entity instance itself must be freed explicitly by -the driver if required. - -Entities have flags that describe the entity capabilities and state. - - MEDIA_ENT_FL_DEFAULT indicates the default entity for a given type. - This can be used to report the default audio and video devices or the - default camera sensor. - -Logical entity groups can be defined by setting the group ID of all member -entities to the same non-zero value. An entity group serves no purpose in the -kernel, but is reported to userspace during entities enumeration. The group_id -field belongs to the media device driver and must not by touched by entity -drivers. - -Media device drivers should define groups if several entities are logically -bound together. Example usages include reporting - - - ALSA, VBI and video nodes that carry the same media stream - - lens and flash controllers associated with a sensor - -- Pads - -Pads are represented by a struct media_pad instance, defined in -include/media/media-entity.h. 
Each entity stores its pads in a pads array -managed by the entity driver. Drivers usually embed the array in a -driver-specific structure. - -Pads are identified by their entity and their 0-based index in the pads array. -Both information are stored in the media_pad structure, making the media_pad -pointer the canonical way to store and pass link references. - -Pads have flags that describe the pad capabilities and state. - - MEDIA_PAD_FL_SINK indicates that the pad supports sinking data. - MEDIA_PAD_FL_SOURCE indicates that the pad supports sourcing data. - -One and only one of MEDIA_PAD_FL_SINK and MEDIA_PAD_FL_SOURCE must be set for -each pad. - -- Links - -Links are represented by a struct media_link instance, defined in -include/media/media-entity.h. Each entity stores all links originating at or -targeting any of its pads in a links array. A given link is thus stored -twice, once in the source entity and once in the target entity. The array is -pre-allocated and grows dynamically as needed. - -Drivers create links by calling - - media_entity_create_link(struct media_entity *source, u16 source_pad, - struct media_entity *sink, u16 sink_pad, - u32 flags); - -An entry in the link array of each entity is allocated and stores pointers -to source and sink pads. - -Links have flags that describe the link capabilities and state. - - MEDIA_LNK_FL_ENABLED indicates that the link is enabled and can be used - to transfer media data. When two or more links target a sink pad, only - one of them can be enabled at a time. - MEDIA_LNK_FL_IMMUTABLE indicates that the link enabled state can't be - modified at runtime. If MEDIA_LNK_FL_IMMUTABLE is set, then - MEDIA_LNK_FL_ENABLED must also be set since an immutable link is always - enabled. - - -Graph traversal ---------------- - -The media framework provides APIs to iterate over entities in a graph. 
- -To iterate over all entities belonging to a media device, drivers can use the -media_device_for_each_entity macro, defined in include/media/media-device.h. - - struct media_entity *entity; - - media_device_for_each_entity(entity, mdev) { - /* entity will point to each entity in turn */ - ... - } - -Drivers might also need to iterate over all entities in a graph that can be -reached only through enabled links starting at a given entity. The media -framework provides a depth-first graph traversal API for that purpose. - -Note that graphs with cycles (whether directed or undirected) are *NOT* -supported by the graph traversal API. To prevent infinite loops, the graph -traversal code limits the maximum depth to MEDIA_ENTITY_ENUM_MAX_DEPTH, -currently defined as 16. - -Drivers initiate a graph traversal by calling - - media_entity_graph_walk_start(struct media_entity_graph *graph, - struct media_entity *entity); - -The graph structure, provided by the caller, is initialized to start graph -traversal at the given entity. - -Drivers can then retrieve the next entity by calling - - media_entity_graph_walk_next(struct media_entity_graph *graph); - -When the graph traversal is complete the function will return NULL. - -Graph traversal can be interrupted at any moment. No cleanup function call is -required and the graph structure can be freed normally. - -Helper functions can be used to find a link between two given pads, or a pad -connected to another pad through an enabled link - - media_entity_find_link(struct media_pad *source, - struct media_pad *sink); - - media_entity_remote_pad(struct media_pad *pad); - -Refer to the kerneldoc documentation for more information. - - -Use count and power handling ----------------------------- - -Due to the wide differences between drivers regarding power management needs, -the media controller does not implement power management. 
However, the -media_entity structure includes a use_count field that media drivers can use to -track the number of users of every entity for power management needs. - -The use_count field is owned by media drivers and must not be touched by entity -drivers. Access to the field must be protected by the media device graph_mutex -lock. - - -Links setup ------------ - -Link properties can be modified at runtime by calling - - media_entity_setup_link(struct media_link *link, u32 flags); - -The flags argument contains the requested new link flags. - -The only configurable property is the ENABLED link flag to enable/disable a -link. Links marked with the IMMUTABLE link flag can not be enabled or disabled. - -When a link is enabled or disabled, the media framework calls the -link_setup operation for the two entities at the source and sink of the link, -in that order. If the second link_setup call fails, another link_setup call is -made on the first entity to restore the original link flags. - -Media device drivers can be notified of link setup operations by setting the -media_device::link_notify pointer to a callback function. If provided, the -notification callback will be called before enabling and after disabling -links. - -Entity drivers must implement the link_setup operation if any of their links -is non-immutable. The operation must either configure the hardware or store -the configuration information to be applied later. - -Link configuration must not have any side effect on other links. If an enabled -link at a sink pad prevents another link at the same pad from being enabled, -the link_setup operation must return -EBUSY and can't implicitly disable the -first enabled link. 
- - -Pipelines and media streams ---------------------------- - -When starting streaming, drivers must notify all entities in the pipeline to -prevent link states from being modified during streaming by calling - - media_entity_pipeline_start(struct media_entity *entity, - struct media_pipeline *pipe); - -The function will mark all entities connected to the given entity through -enabled links, either directly or indirectly, as streaming. - -The media_pipeline instance pointed to by the pipe argument will be stored in -every entity in the pipeline. Drivers should embed the media_pipeline structure -in higher-level pipeline structures and can then access the pipeline through -the media_entity pipe field. - -Calls to media_entity_pipeline_start() can be nested. The pipeline pointer must -be identical for all nested calls to the function. - -media_entity_pipeline_start() may return an error. In that case, it will -clean up any of the changes it did by itself. - -When stopping the stream, drivers must notify the entities with - - media_entity_pipeline_stop(struct media_entity *entity); - -If multiple calls to media_entity_pipeline_start() have been made the same -number of media_entity_pipeline_stop() calls are required to stop streaming. The -media_entity pipe field is reset to NULL on the last nested stop call. - -Link configuration will fail with -EBUSY by default if either end of the link is -a streaming entity. Links that can be modified while streaming must be marked -with the MEDIA_LNK_FL_DYNAMIC flag. - -If other operations need to be disallowed on streaming entities (such as -changing entities configuration parameters) drivers can explicitly check the -media_entity stream_count field to find out if an entity is streaming. This -operation must be done with the media_device graph_mutex held. - - -Link validation ---------------- - -Link validation is performed by media_entity_pipeline_start() for any -entity which has sink pads in the pipeline. 
The -media_entity::link_validate() callback is used for that purpose. In -link_validate() callback, entity driver should check that the properties of -the source pad of the connected entity and its own sink pad match. It is up -to the type of the entity (and in the end, the properties of the hardware) -what matching actually means. - -Subsystems should facilitate link validation by providing subsystem specific -helper functions to provide easy access for commonly needed information, and -in the end provide a way to use driver-specific callbacks. diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index aef948730..904ee42d0 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -194,7 +194,7 @@ There are some minimal guarantees that may be expected of a CPU: (*) On any given CPU, dependent memory accesses will be issued in order, with respect to itself. This means that for: - WRITE_ONCE(Q, P); smp_read_barrier_depends(); D = READ_ONCE(*Q); + Q = READ_ONCE(P); smp_read_barrier_depends(); D = READ_ONCE(*Q); the CPU will issue the following memory operations: @@ -202,9 +202,9 @@ There are some minimal guarantees that may be expected of a CPU: and always in that order. On most systems, smp_read_barrier_depends() does nothing, but it is required for DEC Alpha. The READ_ONCE() - and WRITE_ONCE() are required to prevent compiler mischief. Please - note that you should normally use something like rcu_dereference() - instead of open-coding smp_read_barrier_depends(). + is required to prevent compiler mischief. Please note that you + should normally use something like rcu_dereference() instead of + open-coding smp_read_barrier_depends(). (*) Overlapping loads and stores within a particular CPU will appear to be ordered within that CPU. This means that for: @@ -1655,17 +1655,18 @@ macro is a good place to start looking. 
SMP memory barriers are reduced to compiler barriers on uniprocessor compiled systems because it is assumed that a CPU will appear to be self-consistent, and will order overlapping accesses correctly with respect to itself. +However, see the subsection on "Virtual Machine Guests" below. [!] Note that SMP memory barriers _must_ be used to control the ordering of references to shared memory on SMP systems, though the use of locking instead is sufficient. Mandatory barriers should not be used to control SMP effects, since mandatory -barriers unnecessarily impose overhead on UP systems. They may, however, be -used to control MMIO effects on accesses through relaxed memory I/O windows. -These are required even on non-SMP systems as they affect the order in which -memory operations appear to a device by prohibiting both the compiler and the -CPU from reordering them. +barriers impose unnecessary overhead on both SMP and UP systems. They may, +however, be used to control MMIO effects on accesses through relaxed memory I/O +windows. These barriers are required even on non-SMP systems as they affect +the order in which memory operations appear to a device by prohibiting both the +compiler and the CPU from reordering them. There are some more advanced barrier functions: @@ -1673,8 +1674,8 @@ There are some more advanced barrier functions: (*) smp_store_mb(var, value) This assigns the value to the variable and then inserts a full memory - barrier after it, depending on the function. It isn't guaranteed to - insert anything more than a compiler barrier in a UP compilation. + barrier after it. It isn't guaranteed to insert anything more than a + compiler barrier in a UP compilation. (*) smp_mb__before_atomic(); @@ -2948,6 +2949,23 @@ The Alpha defines the Linux kernel's memory barrier model. See the subsection on "Cache Coherency" above. 
+VIRTUAL MACHINE GUESTS +---------------------- + +Guests running within virtual machines might be affected by SMP effects even if +the guest itself is compiled without SMP support. This is an artifact of +interfacing with an SMP host while running a UP kernel. Using mandatory +barriers for this use-case would be possible but is often suboptimal. + +To handle this case optimally, low-level virt_mb() etc. macros are available. +These have the same effect as smp_mb() etc. when SMP is enabled, but generate +identical code for SMP and non-SMP systems. For example, virtual machine guests +should use virt_mb() rather than smp_mb() when synchronizing against a +(possibly SMP) host. + +These are equivalent to their smp_mb() etc. counterparts in all other respects; +in particular, they do not control MMIO effects: to control +MMIO effects, use mandatory barriers. ============ EXAMPLE USES diff --git a/Documentation/mtd/nand_ecc.txt b/Documentation/mtd/nand_ecc.txt index e129b2479..f8c3284bf 100644 --- a/Documentation/mtd/nand_ecc.txt +++ b/Documentation/mtd/nand_ecc.txt @@ -107,7 +107,7 @@ for (i = 0; i < 256; i++) if (i & 0x01) rp1 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp1; else - rp0 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp1; + rp0 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp0; if (i & 0x02) rp3 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp3; else @@ -127,7 +127,7 @@ for (i = 0; i < 256; i++) if (i & 0x20) rp11 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp11; else - rp10 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp10; + rp10 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp10; if (i & 0x40) rp13 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp13; else @@ -158,7 +158,7 @@ the values in any order. So instead of calculating all the bits individually, let us try to rearrange things. For the column parity this is easy. 
We can just xor the bytes and in the end filter out the relevant bits. This is pretty nice as it will bring -all cp calculation out of the if loop. +all cp calculation out of the for loop. Similarly we can first xor the bytes for the various rows. This leads to: @@ -271,11 +271,11 @@ to write our code in such a way that we process data in 32 bit chunks. Of course this means some modification as the row parity is byte by byte. A quick analysis: for the column parity we use the par variable. When extending to 32 bits -we can in the end easily calculate p0 and p1 from it. +we can in the end easily calculate rp0 and rp1 from it. (because par now consists of 4 bytes, contributing to rp1, rp0, rp1, rp0 -respectively) +respectively, from MSB to LSB) also rp2 and rp3 can be easily retrieved from par as rp3 covers the -first two bytes and rp2 the last two bytes. +first two MSBs and rp2 covers the last two LSBs. Note that of course now the loop is executed only 64 times (256/4). And note that care must taken wrt byte ordering. The way bytes are @@ -387,11 +387,11 @@ Analysis 2 The code (of course) works, and hurray: we are a little bit faster than the linux driver code (about 15%). But wait, don't cheer too quickly. -THere is more to be gained. +There is more to be gained. If we look at e.g. rp14 and rp15 we see that we either xor our data with rp14 or with rp15. However we also have par which goes over all data. This means there is no need to calculate rp14 as it can be calculated from -rp15 through rp14 = par ^ rp15; +rp15 through rp14 = par ^ rp15, because par = rp14 ^ rp15; (or if desired we can avoid calculating rp15 and calculate it from rp14). That is why some places refer to inverse parity. Of course the same thing holds for rp4/5, rp6/7, rp8/9, rp10/11 and rp12/13. 
@@ -419,12 +419,12 @@ with if (i & 0x20) rp15 ^= cur; and outside the loop added: - rp4 = par ^ rp5; - rp6 = par ^ rp7; - rp8 = par ^ rp9; - rp10 = par ^ rp11; - rp12 = par ^ rp13; - rp14 = par ^ rp15; + rp4 = par ^ rp5; + rp6 = par ^ rp7; + rp8 = par ^ rp9; + rp10 = par ^ rp11; + rp12 = par ^ rp13; + rp14 = par ^ rp15; And after that the code takes about 30% more time, although the number of statements is reduced. This is also reflected in the assembly code. @@ -524,12 +524,12 @@ THe code within the for loop was changed to: cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; cur = *bp++; tmppar ^= cur; rp6 ^= cur; - cur = *bp++; tmppar ^= cur; rp4 ^= cur; - cur = *bp++; tmppar ^= cur; rp10 ^= tmppar; + cur = *bp++; tmppar ^= cur; rp4 ^= cur; + cur = *bp++; tmppar ^= cur; rp10 ^= tmppar; - cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; rp8 ^= cur; + cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; rp8 ^= cur; cur = *bp++; tmppar ^= cur; rp6 ^= cur; rp8 ^= cur; - cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp8 ^= cur; + cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp8 ^= cur; cur = *bp++; tmppar ^= cur; rp8 ^= cur; cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; @@ -537,7 +537,7 @@ THe code within the for loop was changed to: cur = *bp++; tmppar ^= cur; rp4 ^= cur; cur = *bp++; tmppar ^= cur; - par ^= tmppar; + par ^= tmppar; if ((i & 0x1) == 0) rp12 ^= tmppar; if ((i & 0x2) == 0) rp14 ^= tmppar; } @@ -548,8 +548,8 @@ to rp12 and rp14. While making the changes I also found that I could exploit that tmppar contains the running parity for this iteration. So instead of having: -rp4 ^= cur; rp6 = cur; -I removed the rp6 = cur; statement and did rp6 ^= tmppar; on next +rp4 ^= cur; rp6 ^= cur; +I removed the rp6 ^= cur; statement and did rp6 ^= tmppar; on next statement. 
A similar change was done for rp8 and rp10 @@ -593,22 +593,22 @@ The new code now looks like: cur = *bp++; tmppar ^= cur; rp4_6 ^= cur; cur = *bp++; tmppar ^= cur; rp6 ^= cur; - cur = *bp++; tmppar ^= cur; rp4 ^= cur; - cur = *bp++; tmppar ^= cur; rp10 ^= tmppar; + cur = *bp++; tmppar ^= cur; rp4 ^= cur; + cur = *bp++; tmppar ^= cur; rp10 ^= tmppar; - notrp8 = tmppar; - cur = *bp++; tmppar ^= cur; rp4_6 ^= cur; + notrp8 = tmppar; + cur = *bp++; tmppar ^= cur; rp4_6 ^= cur; cur = *bp++; tmppar ^= cur; rp6 ^= cur; - cur = *bp++; tmppar ^= cur; rp4 ^= cur; + cur = *bp++; tmppar ^= cur; rp4 ^= cur; cur = *bp++; tmppar ^= cur; - rp8 = rp8 ^ tmppar ^ notrp8; + rp8 = rp8 ^ tmppar ^ notrp8; cur = *bp++; tmppar ^= cur; rp4_6 ^= cur; cur = *bp++; tmppar ^= cur; rp6 ^= cur; cur = *bp++; tmppar ^= cur; rp4 ^= cur; cur = *bp++; tmppar ^= cur; - par ^= tmppar; + par ^= tmppar; if ((i & 0x1) == 0) rp12 ^= tmppar; if ((i & 0x2) == 0) rp14 ^= tmppar; } @@ -700,7 +700,7 @@ Conclusion The gain when calculating the ecc is tremendous. Om my development hardware a speedup of a factor of 18 for ecc calculation was achieved. On a test on an embedded system with a MIPS core a factor 7 was obtained. -On a test with a Linksys NSLU2 (ARMv5TE processor) the speedup was a factor +On a test with a Linksys NSLU2 (ARMv5TE processor) the speedup was a factor 5 (big endian mode, gcc 4.1.2, -O3) For correction not much gain could be obtained (as bitflips are rare). Then again there are also much less cycles spent there. diff --git a/Documentation/networking/batman-adv.txt b/Documentation/networking/batman-adv.txt index 58e49042f..ff23b755f 100644 --- a/Documentation/networking/batman-adv.txt +++ b/Documentation/networking/batman-adv.txt @@ -115,14 +115,17 @@ The "bat0" interface can be used like any other regular inter- face. 
It needs an IP address which can be either statically con- figured or dynamically (by using DHCP or similar services): -# NodeA: ifconfig bat0 192.168.0.1 -# NodeB: ifconfig bat0 192.168.0.2 +# NodeA: ip link set up dev bat0 +# NodeA: ip addr add 192.168.0.1/24 dev bat0 + +# NodeB: ip link set up dev bat0 +# NodeB: ip addr add 192.168.0.2/24 dev bat0 # NodeB: ping 192.168.0.1 Note: In order to avoid problems remove all IP addresses previ- ously assigned to interfaces now used by batman advanced, e.g. -# ifconfig eth0 0.0.0.0 +# ip addr flush dev eth0 LOGGING/DEBUGGING diff --git a/Documentation/networking/can.txt b/Documentation/networking/can.txt index 05fd83bb3..6ab619fcc 100644 --- a/Documentation/networking/can.txt +++ b/Documentation/networking/can.txt @@ -372,6 +372,15 @@ solution for a couple of reasons: nbytes = sendto(s, &frame, sizeof(struct can_frame), 0, (struct sockaddr*)&addr, sizeof(addr)); + An accurate timestamp can be obtained with an ioctl(2) call after reading + a message from the socket: + + struct timeval tv; + ioctl(s, SIOCGSTAMP, &tv); + + The timestamp has a resolution of one microsecond and is set automatically + at the reception of a CAN frame. + Remark about CAN FD (flexible data rate) support: Generally the handling of CAN FD is very similar to the formerly described diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 2ea4c45cf..73b36d7c7 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -335,6 +335,14 @@ tcp_keepalive_intvl - INTEGER after probes started. Default value: 75sec i.e. connection will be aborted after ~11 minutes of retries. +tcp_l3mdev_accept - BOOLEAN + Enables child sockets to inherit the L3 master device index. 
+ Enabling this option allows a "global" listen socket to work + across L3 master domains (e.g., VRFs) with connected sockets + derived from the listen socket to be bound to the L3 domain in + which the packets originated. Only valid when the kernel was + compiled with CONFIG_NET_L3_MASTER_DEV. + tcp_low_latency - BOOLEAN If set, the TCP stack makes decisions that prefer lower latency as opposed to higher throughput. By default, this @@ -586,7 +594,7 @@ tcp_fastopen - INTEGER tcp_syn_retries - INTEGER Number of times initial SYNs for an active TCP connection attempt - will be retransmitted. Should not be higher than 255. Default value + will be retransmitted. Should not be higher than 127. Default value is 6, which corresponds to 63seconds till the last retransmission with the current initial RTO of 1second. With this the final timeout for an active TCP connection attempt will happen after 127seconds. @@ -1723,6 +1731,25 @@ addip_enable - BOOLEAN Default: 0 +pf_enable - INTEGER + Enable or disable pf (pf is short for potentially failed) state. A value + of pf_retrans > path_max_retrans also disables pf state. That is, either + pf_enable = 0 or pf_retrans > path_max_retrans disables pf state. + Since pf_retrans and path_max_retrans can be changed by a userspace + application, a user may expect pf state to be disabled because + pf_retrans > path_max_retrans, yet a later change to pf_retrans or + path_max_retrans by an application can silently re-enable pf + state. pf_enable therefore provides an explicit way to enable + and disable pf state. See: + https://datatracker.ietf.org/doc/draft-ietf-tsvwg-sctp-failover for + details. + + 1: Enable pf. + + 0: Disable pf. + + Default: 1 + addip_noauth_enable - BOOLEAN Dynamic Address Reconfiguration (ADD-IP) requires the use of authentication to protect the operations of adding or removing new @@ -1799,7 +1826,9 @@ pf_retrans - INTEGER having to reduce path_max_retrans to a very low value.
See: http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt for details. Note also that a value of pf_retrans > path_max_retrans - disables this feature + disables this feature. Since both pf_retrans and path_max_retrans can + be changed by a userspace application, a variable pf_enable is used to + disable pf state. Default: 0 diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt index 91994134e..fad63136e 100644 --- a/Documentation/networking/switchdev.txt +++ b/Documentation/networking/switchdev.txt @@ -304,8 +304,12 @@ certain netdevs from flooding unicast traffic for which there is no FDB entry. IGMP Snooping ^^^^^^^^^^^^^ -XXX: complete this section - +In order to support IGMP snooping, the port netdevs should trap to the bridge +driver all IGMP join and leave messages. +The bridge multicast module will notify port netdevs on every multicast group +change, whether it is statically configured or dynamically joined/left. +The hardware implementation should forward all registered multicast +traffic groups only to the configured ports. L3 Routing Offload ------------------ diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt index b0e911e0e..44558882a 100644 --- a/Documentation/power/pci.txt +++ b/Documentation/power/pci.txt @@ -999,7 +999,7 @@ from its probe routine to make runtime PM work for the device. It is important to remember that the driver's runtime_suspend() callback may be executed right after the usage counter has been decremented, because -user space may already have cuased the pm_runtime_allow() helper function +user space may already have caused the pm_runtime_allow() helper function unblocking the runtime PM of the device to run via sysfs, so the driver must be prepared to cope with that.
diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt index 0784bc3a2..7328cf852 100644 --- a/Documentation/power/runtime_pm.txt +++ b/Documentation/power/runtime_pm.txt @@ -371,6 +371,12 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: - increment the device's usage counter, run pm_runtime_resume(dev) and return its result + int pm_runtime_get_if_in_use(struct device *dev); + - return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the + runtime PM status is RPM_ACTIVE and the runtime PM usage counter is + nonzero, increment the counter and return 1; otherwise return 0 without + changing the counter + void pm_runtime_put_noidle(struct device *dev); - decrement the device's usage counter diff --git a/Documentation/power/tuxonice-internals.txt b/Documentation/power/tuxonice-internals.txt deleted file mode 100644 index 0c6a2163a..000000000 --- a/Documentation/power/tuxonice-internals.txt +++ /dev/null @@ -1,532 +0,0 @@ - TuxOnIce 4.0 Internal Documentation. - Updated to 23 March 2015 - -(Please note that incremental image support mentioned in this document is work -in progress. This document may need updating prior to the actual release of -4.0!) - -1. Introduction. - - TuxOnIce 4.0 is an addition to the Linux Kernel, designed to - allow the user to quickly shutdown and quickly boot a computer, without - needing to close documents or programs. It is equivalent to the - hibernate facility in some laptops. This implementation, however, - requires no special BIOS or hardware support. - - The code in these files is based upon the original implementation - prepared by Gabor Kuti and additional work by Pavel Machek and a - host of others. This code has been substantially reworked by Nigel - Cunningham, again with the help and testing of many others, not the - least of whom are Bernard Blackham and Michael Frank. At its heart, - however, the operation is essentially the same as Gabor's version. - -2. 
Overview of operation. - - The basic sequence of operations is as follows: - - a. Quiesce all other activity. - b. Ensure enough memory and storage space are available, and attempt - to free memory/storage if necessary. - c. Allocate the required memory and storage space. - d. Write the image. - e. Power down. - - There are a number of complicating factors which mean that things are - not as simple as the above would imply, however... - - o The activity of each process must be stopped at a point where it will - not be holding locks necessary for saving the image, or unexpectedly - restart operations due to something like a timeout and thereby make - our image inconsistent. - - o It is desirous that we sync outstanding I/O to disk before calculating - image statistics. This reduces corruption if one should suspend but - then not resume, and also makes later parts of the operation safer (see - below). - - o We need to get as close as we can to an atomic copy of the data. - Inconsistencies in the image will result in inconsistent memory contents at - resume time, and thus in instability of the system and/or file system - corruption. This would appear to imply a maximum image size of one half of - the amount of RAM, but we have a solution... (again, below). - - o In 2.6 and later, we choose to play nicely with the other suspend-to-disk - implementations. - -3. Detailed description of internals. - - a. Quiescing activity. - - Safely quiescing the system is achieved using three separate but related - aspects. - - First, we use the vanilla kerne's support for freezing processes. This code - is based on the observation that the vast majority of processes don't need - to run during suspend. They can be 'frozen'. The kernel therefore - implements a refrigerator routine, which processes enter and in which they - remain until the cycle is complete. Processes enter the refrigerator via - try_to_freeze() invocations at appropriate places. 
A process cannot be - frozen in any old place. It must not be holding locks that will be needed - for writing the image or freezing other processes. For this reason, - userspace processes generally enter the refrigerator via the signal - handling code, and kernel threads at the place in their event loops where - they drop locks and yield to other processes or sleep. The task of freezing - processes is complicated by the fact that there can be interdependencies - between processes. Freezing process A before process B may mean that - process B cannot be frozen, because it stops at waiting for process A - rather than in the refrigerator. This issue is seen where userspace waits - on freezeable kernel threads or fuse filesystem threads. To address this - issue, we implement the following algorithm for quiescing activity: - - - Freeze filesystems (including fuse - userspace programs starting - new requests are immediately frozen; programs already running - requests complete their work before being frozen in the next - step) - - Freeze userspace - - Thaw filesystems (this is safe now that userspace is frozen and no - fuse requests are outstanding). - - Invoke sys_sync (noop on fuse). - - Freeze filesystems - - Freeze kernel threads - - If we need to free memory, we thaw kernel threads and filesystems, but not - userspace. We can then free caches without worrying about deadlocks due to - swap files being on frozen filesystems or such like. - - b. Ensure enough memory & storage are available. - - We have a number of constraints to meet in order to be able to successfully - suspend and resume. - - First, the image will be written in two parts, described below. One of - these parts needs to have an atomic copy made, which of course implies a - maximum size of one half of the amount of system memory. The other part - ('pageset') is not atomically copied, and can therefore be as large or - small as desired. - - Second, we have constraints on the amount of storage available. 
In these - calculations, we may also consider any compression that will be done. The - cryptoapi module allows the user to configure an expected compression ratio. - - Third, the user can specify an arbitrary limit on the image size, in - megabytes. This limit is treated as a soft limit, so that we don't fail the - attempt to suspend if we cannot meet this constraint. - - c. Allocate the required memory and storage space. - - Having done the initial freeze, we determine whether the above constraints - are met, and seek to allocate the metadata for the image. If the constraints - are not met, or we fail to allocate the required space for the metadata, we - seek to free the amount of memory that we calculate is needed and try again. - We allow up to four iterations of this loop before aborting the cycle. If - we do fail, it should only be because of a bug in TuxOnIce's calculations - or the vanilla kernel code for freeing memory. - - These steps are merged together in the prepare_image function, found in - prepare_image.c. The functions are merged because of the cyclical nature - of the problem of calculating how much memory and storage is needed. Since - the data structures containing the information about the image must - themselves take memory and use storage, the amount of memory and storage - required changes as we prepare the image. Since the changes are not large, - only one or two iterations will be required to achieve a solution. - - The recursive nature of the algorithm is miminised by keeping user space - frozen while preparing the image, and by the fact that our records of which - pages are to be saved and which pageset they are saved in use bitmaps (so - that changes in number or fragmentation of the pages to be saved don't - feedback via changes in the amount of memory needed for metadata). The - recursiveness is thus limited to any extra slab pages allocated to store the - extents that record storage used, and the effects of seeking to free memory. 
- - d. Write the image. - - We previously mentioned the need to create an atomic copy of the data, and - the half-of-memory limitation that is implied in this. This limitation is - circumvented by dividing the memory to be saved into two parts, called - pagesets. - - Pageset2 contains most of the page cache - the pages on the active and - inactive LRU lists that aren't needed or modified while TuxOnIce is - running, so they can be safely written without an atomic copy. They are - therefore saved first and reloaded last. While saving these pages, - TuxOnIce carefully ensures that the work of writing the pages doesn't make - the image inconsistent. With the support for Kernel (Video) Mode Setting - going into the kernel at the time of writing, we need to check for pages - on the LRU that are used by KMS, and exclude them from pageset2. They are - atomically copied as part of pageset 1. - - Once pageset2 has been saved, we prepare to do the atomic copy of remaining - memory. As part of the preparation, we power down drivers, thereby providing - them with the opportunity to have their state recorded in the image. The - amount of memory allocated by drivers for this is usually negligible, but if - DRI is in use, video drivers may require significants amounts. Ideally we - would be able to query drivers while preparing the image as to the amount of - memory they will need. Unfortunately no such mechanism exists at the time of - writing. For this reason, TuxOnIce allows the user to set an - 'extra_pages_allowance', which is used to seek to ensure sufficient memory - is available for drivers at this point. TuxOnIce also lets the user set this - value to 0. In this case, a test driver suspend is done while preparing the - image, and the difference (plus a margin) used instead. TuxOnIce will also - automatically restart the hibernation process (twice at most) if it finds - that the extra pages allowance is not sufficient. 
It will then use what was - actually needed (plus a margin, again). Failure to hibernate should thus - be an extremely rare occurence. - - Having suspended the drivers, we save the CPU context before making an - atomic copy of pageset1, resuming the drivers and saving the atomic copy. - After saving the two pagesets, we just need to save our metadata before - powering down. - - As we mentioned earlier, the contents of pageset2 pages aren't needed once - they've been saved. We therefore use them as the destination of our atomic - copy. In the unlikely event that pageset1 is larger, extra pages are - allocated while the image is being prepared. This is normally only a real - possibility when the system has just been booted and the page cache is - small. - - This is where we need to be careful about syncing, however. Pageset2 will - probably contain filesystem meta data. If this is overwritten with pageset1 - and then a sync occurs, the filesystem will be corrupted - at least until - resume time and another sync of the restored data. Since there is a - possibility that the user might not resume or (may it never be!) that - TuxOnIce might oops, we do our utmost to avoid syncing filesystems after - copying pageset1. - - e. Incremental images - - TuxOnIce 4.0 introduces a new incremental image mode which changes things a - little. When incremental images are enabled, we save a 'normal' image the - first time we hibernate. One resume however, we do not free the image or - the associated storage. Instead, it is retained until the next attempt at - hibernating and a mechanism is enabled which is used to track which pages - of memory are modified between the two cycles. The modified pages can then - be added to the existing image, rather than unmodified pages being saved - again unnecessarily. - - Incremental image support is available in 64 bit Linux only, due to the - requirement for extra page flags. 
- - This support is accomplished in the following way: - - 1) Tracking of pages. - - The tracking of changed pages is accomplished using the page fault - mechanism. When we reach a point at which we want to start tracking - changes, most pages are marked read-only and also flagged as being - read-only because of this support. Since this cannot happen for every page - of RAM, some are marked as untracked and always treated as modified whn - preparing an incremental iamge. When a process attempts to modify a page - that is marked read-only in this way, a page fault occurs, with TuxOnIce - code marking the page writable and dirty before allowing the write to - continue. In this way, the effect of incremental images on performance is - minimised - a page only causes a fault once. Small modifications to the - page allocator further reduce the number of faults that occur - free pages - are not tracked; they are made writable and marked as dirty as part of - being allocated. - - 2) Saving the incremental image / atomicity. - - The page fault mechanism is also used to improve the means by which - atomicity of the image is acheived. When it is time to do an atomic copy, - the flags for pages are reset, with the result being that it is no longer - necessary for us to do an atomic of pageset1. Instead, we normally write - the uncopied pages to disk. When an attempt is made to modify a page that - has not yet been saved, the page-fault mechanism makes a copy of the page - prior to allowing the write. This copy is then written to disk. Likewise, - on resume, if a process attempts to write to a page that has been read - while the rest of the image is still being loaded, a copy of that page is - made prior to the write being allowed. At the end of loading the image, - modified pages can thus be restored to their 'atomic copy' contents prior - to restarting normal operation. 
We also mark pages that are yet to be read - as invalid PFNs, so that we can capture as a bug any attempt by a - half-restored kernel to access a page that hasn't yet been reloaded. - - f. Power down. - - Powering down uses standard kernel routines. TuxOnIce supports powering down - using the ACPI S3, S4 and S5 methods or the kernel's non-ACPI power-off. - Supporting suspend to ram (S3) as a power off option might sound strange, - but it allows the user to quickly get their system up and running again if - the battery doesn't run out (we just need to re-read the overwritten pages) - and if the battery does run out (or the user removes power), they can still - resume. - -4. Data Structures. - - TuxOnIce uses three main structures to store its metadata and configuration - information: - - a) Pageflags bitmaps. - - TuxOnIce records which pages will be in pageset1, pageset2, the destination - of the atomic copy and the source of the atomically restored image using - bitmaps. The code used is that written for swsusp, with small improvements - to match TuxOnIce's requirements. - - The pageset1 bitmap is thus easily stored in the image header for use at - resume time. - - As mentioned above, using bitmaps also means that the amount of memory and - storage required for recording the above information is constant. This - greatly simplifies the work of preparing the image. In earlier versions of - TuxOnIce, extents were used to record which pages would be stored. In that - case, however, eating memory could result in greater fragmentation of the - lists of pages, which in turn required more memory to store the extents and - more storage in the image header. These could in turn require further - freeing of memory, and another iteration. All of this complexity is removed - by having bitmaps. - - Bitmaps also make a lot of sense because TuxOnIce only ever iterates - through the lists. There is therefore no cost to not being able to find the - nth page in order 0 time. 
We only need to worry about the cost of finding - the n+1th page, given the location of the nth page. Bitwise optimisations - help here. - - b) Extents for block data. - - TuxOnIce supports writing the image to multiple block devices. In the case - of swap, multiple partitions and/or files may be in use, and we happily use - them all (with the exception of compcache pages, which we allocate but do - not use). This use of multiple block devices is accomplished as follows: - - Whatever the actual source of the allocated storage, the destination of the - image can be viewed in terms of one or more block devices, and on each - device, a list of sectors. To simplify matters, we only use contiguous, - PAGE_SIZE aligned sectors, like the swap code does. - - Since sector numbers on each bdev may well not start at 0, it makes much - more sense to use extents here. Contiguous ranges of pages can thus be - represented in the extents by contiguous values. - - Variations in block size are taken account of in transforming this data - into the parameters for bio submission. - - We can thus implement a layer of abstraction wherein the core of TuxOnIce - doesn't have to worry about which device we're currently writing to or - where in the device we are. It simply requests that the next page in the - pageset or header be written, leaving the details to this lower layer. - The lower layer remembers where in the sequence of devices and blocks each - pageset starts. The header always starts at the beginning of the allocated - storage. 
- - So extents are: - - struct extent { - unsigned long minimum, maximum; - struct extent *next; - } - - These are combined into chains of extents for a device: - - struct extent_chain { - int size; /* size of the extent ie sum (max-min+1) */ - int allocs, frees; - char *name; - struct extent *first, *last_touched; - }; - - For each bdev, we need to store a little more info (simplified definition): - - struct toi_bdev_info { - struct block_device *bdev; - - char uuid[17]; - dev_t dev_t; - int bmap_shift; - int blocks_per_page; - }; - - The uuid is the main means used to identify the device in the storage - image. This means we can cope with the dev_t representation of a device - changing between saving the image and restoring it, as may happen on some - bioses or in the LVM case. - - bmap_shift and blocks_per_page apply the effects of variations in blocks - per page settings for the filesystem and underlying bdev. For most - filesystems, these are the same, but for xfs, they can have independant - values. - - Combining these two structures together, we have everything we need to - record what devices and what blocks on each device are being used to - store the image, and to submit i/o using bio_submit. - - The last elements in the picture are a means of recording how the storage - is being used. - - We do this first and foremost by implementing a layer of abstraction on - top of the devices and extent chains which allows us to view however many - devices there might be as one long storage tape, with a single 'head' that - tracks a 'current position' on the tape: - - struct extent_iterate_state { - struct extent_chain *chains; - int num_chains; - int current_chain; - struct extent *current_extent; - unsigned long current_offset; - }; - - That is, *chains points to an array of size num_chains of extent chains. - For the filewriter, this is always a single chain. For the swapwriter, the - array is of size MAX_SWAPFILES. 
- - current_chain, current_extent and current_offset thus point to the current - index in the chains array (and into a matching array of struct - suspend_bdev_info), the current extent in that chain (to optimise access), - and the current value in the offset. - - The image is divided into three parts: - - The header - - Pageset 1 - - Pageset 2 - - The header always starts at the first device and first block. We know its - size before we begin to save the image because we carefully account for - everything that will be stored in it. - - The second pageset (LRU) is stored first. It begins on the next page after - the end of the header. - - The first pageset is stored second. It's start location is only known once - pageset2 has been saved, since pageset2 may be compressed as it is written. - This location is thus recorded at the end of saving pageset2. It is page - aligned also. - - Since this information is needed at resume time, and the location of extents - in memory will differ at resume time, this needs to be stored in a portable - way: - - struct extent_iterate_saved_state { - int chain_num; - int extent_num; - unsigned long offset; - }; - - We can thus implement a layer of abstraction wherein the core of TuxOnIce - doesn't have to worry about which device we're currently writing to or - where in the device we are. It simply requests that the next page in the - pageset or header be written, leaving the details to this layer, and - invokes the routines to remember and restore the position, without having - to worry about the details of how the data is arranged on disk or such like. - - c) Modules - - One aim in designing TuxOnIce was to make it flexible. We wanted to allow - for the implementation of different methods of transforming a page to be - written to disk and different methods of getting the pages stored. 
- - In early versions (the betas and perhaps Suspend1), compression support was - inlined in the image writing code, and the data structures and code for - managing swap were intertwined with the rest of the code. A number of people - had expressed interest in implementing image encryption, and alternative - methods of storing the image. - - In order to achieve this, TuxOnIce was given a modular design. - - A module is a single file which encapsulates the functionality needed - to transform a pageset of data (encryption or compression, for example), - or to write the pageset to a device. The former type of module is called - a 'page-transformer', the later a 'writer'. - - Modules are linked together in pipeline fashion. There may be zero or more - page transformers in a pipeline, and there is always exactly one writer. - The pipeline follows this pattern: - - --------------------------------- - | TuxOnIce Core | - --------------------------------- - | - | - --------------------------------- - | Page transformer 1 | - --------------------------------- - | - | - --------------------------------- - | Page transformer 2 | - --------------------------------- - | - | - --------------------------------- - | Writer | - --------------------------------- - - During the writing of an image, the core code feeds pages one at a time - to the first module. This module performs whatever transformations it - implements on the incoming data, completely consuming the incoming data and - feeding output in a similar manner to the next module. - - All routines are SMP safe, and the final result of the transformations is - written with an index (provided by the core) and size of the output by the - writer. As a result, we can have multithreaded I/O without needing to - worry about the sequence in which pages are written (or read). - - During reading, the pipeline works in the reverse direction. The core code - calls the first module with the address of a buffer which should be filled. 
- (Note that the buffer size is always PAGE_SIZE at this time). This module - will in turn request data from the next module and so on down until the - writer is made to read from the stored image. - - Part of definition of the structure of a module thus looks like this: - - int (*rw_init) (int rw, int stream_number); - int (*rw_cleanup) (int rw); - int (*write_chunk) (struct page *buffer_page); - int (*read_chunk) (struct page *buffer_page, int sync); - - It should be noted that the _cleanup routine may be called before the - full stream of data has been read or written. While writing the image, - the user may (depending upon settings) choose to abort suspending, and - if we are in the midst of writing the last portion of the image, a portion - of the second pageset may be reread. This may also happen if an error - occurs and we seek to abort the process of writing the image. - - The modular design is also useful in a number of other ways. It provides - a means where by we can add support for: - - - providing overall initialisation and cleanup routines; - - serialising configuration information in the image header; - - providing debugging information to the user; - - determining memory and image storage requirements; - - dis/enabling components at run-time; - - configuring the module (see below); - - ...and routines for writers specific to their work: - - Parsing a resume= location; - - Determining whether an image exists; - - Marking a resume as having been attempted; - - Invalidating an image; - - Since some parts of the core - the user interface and storage manager - support - have use for some of these functions, they are registered as - 'miscellaneous' modules as well. - - d) Sysfs data structures. - - This brings us naturally to support for configuring TuxOnIce. We desired to - provide a way to make TuxOnIce as flexible and configurable as possible. 
- The user shouldn't have to reboot just because they want to now hibernate to - a file instead of a partition, for example. - - To accomplish this, TuxOnIce implements a very generic means whereby the - core and modules can register new sysfs entries. All TuxOnIce entries use - a single _store and _show routine, both of which are found in - tuxonice_sysfs.c in the kernel/power directory. These routines handle the - most common operations - getting and setting the values of bits, integers, - longs, unsigned longs and strings in one place, and allow overrides for - customised get and set options as well as side-effect routines for all - reads and writes. - - When combined with some simple macros, a new sysfs entry can then be defined - in just a couple of lines: - - SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1, - 2048, 0, NULL), - - This defines a sysfs entry named "progress_granularity" which is rw and - allows the user to access an integer stored at &progress_granularity, giving - it a value between 1 and 2048 inclusive. - - Sysfs entries are registered under /sys/power/tuxonice, and entries for - modules are located in a subdirectory named after the module. - diff --git a/Documentation/power/tuxonice.txt b/Documentation/power/tuxonice.txt deleted file mode 100644 index 3bf0575ef..000000000 --- a/Documentation/power/tuxonice.txt +++ /dev/null @@ -1,948 +0,0 @@ - --- TuxOnIce, version 3.0 --- - -1. What is it? -2. Why would you want it? -3. What do you need to use it? -4. Why not just use the version already in the kernel? -5. How do you use it? -6. What do all those entries in /sys/power/tuxonice do? -7. How do you get support? -8. I think I've found a bug. What should I do? -9. When will XXX be supported? -10 How does it work? -11. Who wrote TuxOnIce? - -1. What is it? - - Imagine you're sitting at your computer, working away. For some reason, you - need to turn off your computer for a while - perhaps it's time to go home - for the day. 
When you come back to your computer next, you're going to want - to carry on where you left off. Now imagine that you could push a button and - have your computer store the contents of its memory to disk and power down. - Then, when you next start up your computer, it loads that image back into - memory and you can carry on from where you were, just as if you'd never - turned the computer off. You have far less time to start up, no reopening of - applications or finding what directory you put that file in yesterday. - That's what TuxOnIce does. - - TuxOnIce has a long heritage. It began life as work by Gabor Kuti, who, - with some help from Pavel Machek, got an early version going in 1999. The - project was then taken over by Florent Chabaud while still in alpha version - numbers. Nigel Cunningham came on the scene when Florent was unable to - continue, moving the project into betas, then 1.0, 2.0 and so on up to - the present series. During the 2.0 series, the name was contracted to - Suspend2 and the website suspend2.net created. Beginning around July 2007, - a transition to calling the software TuxOnIce was made, to seek to help - make it clear that TuxOnIce is more concerned with hibernation than suspend - to ram. - - Pavel Machek's swsusp code, which was merged around 2.5.17 retains the - original name, and was essentially a fork of the beta code until Rafael - Wysocki came on the scene in 2005 and began to improve it further. - -2. Why would you want it? - - Why wouldn't you want it? - - Being able to save the state of your system and quickly restore it improves - your productivity - you get a useful system in far less time than through - the normal boot process. You also get to be completely 'green', using zero - power, or as close to that as possible (the computer may still provide - minimal power to some devices, so they can initiate a power on, but that - will be the same amount of power as would be used if you told the computer - to shutdown. - -3. 
What do you need to use it? - - a. Kernel Support. - - i) The TuxOnIce patch. - - TuxOnIce is part of the Linux Kernel. This version is not part of Linus's - 2.6 tree at the moment, so you will need to download the kernel source and - apply the latest patch. Having done that, enable the appropriate options in - make [menu|x]config (under Power Management Options - look for "Enhanced - Hibernation"), compile and install your kernel. TuxOnIce works with SMP, - Highmem, preemption, fuse filesystems, x86-32, PPC and x86_64. - - TuxOnIce patches are available from http://tuxonice.net. - - ii) Compression support. - - Compression support is implemented via the cryptoapi. You will therefore want - to select any Cryptoapi transforms that you want to use on your image from - the Cryptoapi menu while configuring your kernel. We recommend the use of the - LZO compression method - it is very fast and still achieves good compression. - - You can also tell TuxOnIce to write its image to an encrypted and/or - compressed filesystem/swap partition. In that case, you don't need to do - anything special for TuxOnIce when it comes to kernel configuration. - - iii) Configuring other options. - - While you're configuring your kernel, try to configure as much as possible - to build as modules. We recommend this because there are a number of drivers - that are still in the process of implementing proper power management - support. In those cases, the best way to work around their current lack is - to build them as modules and remove the modules while hibernating. You might - also bug the driver authors to get their support up to speed, or even help! - - b. Storage. - - i) Swap. - - TuxOnIce can store the hibernation image in your swap partition, a swap file or - a combination thereof. Whichever combination you choose, you will probably - want to create enough swap space to store the largest image you could have, - plus the space you'd normally use for swap. 
A good rule of thumb would be - to calculate the amount of swap you'd want without using TuxOnIce, and then - add the amount of memory you have. This swapspace can be arranged in any way - you'd like. It can be in one partition or file, or spread over a number. The - only requirement is that they be active when you start a hibernation cycle. - - There is one exception to this requirement. TuxOnIce has the ability to turn - on one swap file or partition at the start of hibernating and turn it back off - at the end. If you want to ensure you have enough memory to store a image - when your memory is fully used, you might want to make one swap partition or - file for 'normal' use, and another for TuxOnIce to activate & deactivate - automatically. (Further details below). - - ii) Normal files. - - TuxOnIce includes a 'file allocator'. The file allocator can store your - image in a simple file. Since Linux has the concept of everything being a - file, this is more powerful than it initially sounds. If, for example, you - were to set up a network block device file, you could hibernate to a network - server. This has been tested and works to a point, but nbd itself isn't - stateless enough for our purposes. - - Take extra care when setting up the file allocator. If you just type - commands without thinking and then try to hibernate, you could cause - irreversible corruption on your filesystems! Make sure you have backups. - - Most people will only want to hibernate to a local file. To achieve that, do - something along the lines of: - - echo "TuxOnIce" > /hibernation-file - dd if=/dev/zero bs=1M count=512 >> /hibernation-file - - This will create a 512MB file called /hibernation-file. 
To get TuxOnIce to use - it: - - echo /hibernation-file > /sys/power/tuxonice/file/target - - Then - - cat /sys/power/tuxonice/resume - - Put the results of this into your bootloader's configuration (see also step - C, below): - - ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE--- - # cat /sys/power/tuxonice/resume - file:/dev/hda2:0x1e001 - - In this example, we would edit the append= line of our lilo.conf|menu.lst - so that it included: - - resume=file:/dev/hda2:0x1e001 - ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE--- - - For those who are thinking 'Could I make the file sparse?', the answer is - 'No!'. At the moment, there is no way for TuxOnIce to fill in the holes in - a sparse file while hibernating. In the longer term (post merge!), I'd like - to change things so that the file could be dynamically resized and have - holes filled as needed. Right now, however, that's not possible and not a - priority. - - c. Bootloader configuration. - - Using TuxOnIce also requires that you add an extra parameter to - your lilo.conf or equivalent. Here's an example for a swap partition: - - append="resume=swap:/dev/hda1" - - This would tell TuxOnIce that /dev/hda1 is a swap partition you - have. TuxOnIce will use the swap signature of this partition as a - pointer to your data when you hibernate. This means that (in this example) - /dev/hda1 doesn't need to be _the_ swap partition where all of your data - is actually stored. It just needs to be a swap partition that has a - valid signature. - - You don't need to have a swap partition for this purpose. TuxOnIce - can also use a swap file, but usage is a little more complex. Having made - your swap file, turn it on and do - - cat /sys/power/tuxonice/swap/headerlocations - - (this assumes you've already compiled your kernel with TuxOnIce - support and booted it). The results of the cat command will tell you - what you need to put in lilo.conf: - - For swap partitions like /dev/hda1, simply use resume=/dev/hda1. 
- For swapfile `swapfile`, use resume=swap:/dev/hda2:0x242d. - - If the swapfile changes for any reason (it is moved to a different - location, it is deleted and recreated, or the filesystem is - defragmented) then you will have to check - /sys/power/tuxonice/swap/headerlocations for a new resume_block value. - - Once you've compiled and installed the kernel and adjusted your bootloader - configuration, you should only need to reboot for the most basic part - of TuxOnIce to be ready. - - If you only compile in the swap allocator, or only compile in the file - allocator, you don't need to add the "swap:" part of the resume= - parameters above. resume=/dev/hda2:0x242d will work just as well. If you - have compiled both and your storage is on swap, you can also use this - format (the swap allocator is the default allocator). - - When compiling your kernel, one of the options in the 'Power Management - Support' menu, just above the 'Enhanced Hibernation (TuxOnIce)' entry is - called 'Default resume partition'. This can be used to set a default value - for the resume= parameter. - - d. The hibernate script. - - Since the driver model in 2.6 kernels is still being developed, you may need - to do more than just configure TuxOnIce. Users of TuxOnIce usually start the - process via a script which prepares for the hibernation cycle, tells the - kernel to do its stuff and then restore things afterwards. This script might - involve: - - - Switching to a text console and back if X doesn't like the video card - status on resume. - - Un/reloading drivers that don't play well with hibernation. - - Note that you might not be able to unload some drivers if there are - processes using them. You might have to kill off processes that hold - devices open. Hint: if your X server accesses an USB mouse, doing a - 'chvt' to a text console releases the device and you can unload the - module. - - Check out the latest script (available on tuxonice.net). - - e. The userspace user interface. 
- - TuxOnIce has very limited support for displaying status if you only apply - the kernel patch - it can printk messages, but that is all. In addition, - some of the functions mentioned in this document (such as cancelling a cycle - or performing interactive debugging) are unavailable. To utilise these - functions, or simply get a nice display, you need the 'userui' component. - Userui comes in three flavours, usplash, fbsplash and text. Text should - work on any console. Usplash and fbsplash require the appropriate - (distro specific?) support. - - To utilise a userui, TuxOnIce just needs to be told where to find the - userspace binary: - - echo "/usr/local/sbin/tuxoniceui_fbsplash" > /sys/power/tuxonice/user_interface/program - - The hibernate script can do this for you, and a default value for this - setting can be configured when compiling the kernel. This path is also - stored in the image header, so if you have an initrd or initramfs, you can - use the userui during the first part of resuming (prior to the atomic - restore) by putting the binary in the same path in your initrd/ramfs. - Alternatively, you can put it in a different location and do an echo - similar to the above prior to the echo > do_resume. The value saved in the - image header will then be ignored. - -4. Why not just use the version already in the kernel? - - The version in the vanilla kernel has a number of drawbacks. The most - serious of these are: - - it has a maximum image size of 1/2 total memory; - - it doesn't allocate storage until after it has snapshotted memory. 
- This means that you can't be sure hibernating will work until you - see it start to write the image; - - it does not allow you to press escape to cancel a cycle; - - it does not allow you to press escape to cancel resuming; - - it does not allow you to automatically swapon a file when - starting a cycle; - - it does not allow you to use multiple swap partitions or files; - - it does not allow you to use ordinary files; - - it just invalidates an image and continues to boot if you - accidentally boot the wrong kernel after hibernating; - - it doesn't support any sort of nice display while hibernating; - - it is moving toward requiring that you have an initrd/initramfs - to ever have a hope of resuming (uswsusp). While uswsusp will - address some of the concerns above, it won't address all of them, - and will be more complicated to get set up; - - it doesn't have support for suspend-to-both (write a hibernation - image, then suspend to ram; I think this is known as ReadySafe - under M$). - -5. How do you use it? - - A hibernation cycle can be started directly by doing: - - echo > /sys/power/tuxonice/do_hibernate - - In practice, though, you'll probably want to use the hibernate script - to unload modules, configure the kernel the way you like it and so on. - In that case, you'd do (as root): - - hibernate - - See the hibernate script's man page for more details on the options it - takes. - - If you're using the text or splash user interface modules, one feature of - TuxOnIce that you might find useful is that you can press Escape at any time - during hibernating, and the process will be aborted. - - Due to the way hibernation works, this means you'll have your system back and - perfectly usable almost instantly. The only exception is when it's at the - very end of writing the image. Then it will need to reload a small (usually - 4-50MBs, depending upon the image characteristics) portion first. 
- - Likewise, when resuming, you can press escape and resuming will be aborted. - The computer will then powerdown again according to settings at that time for - the powerdown method or rebooting. - - You can change the settings for powering down while the image is being - written by pressing 'R' to toggle rebooting and 'O' to toggle between - suspending to ram and powering down completely. - - If you run into problems with resuming, adding the "noresume" option to - the kernel command line will let you skip the resume step and recover your - system. This option shouldn't normally be needed, because TuxOnIce modifies - the image header prior to the atomic restore, and will thus prompt you - if it detects that you've tried to resume an image before (this flag is - removed if you press Escape to cancel a resume, so you won't be prompted - then). - - Recent kernels (2.6.24 onwards) add support for resuming from a different - kernel to the one that was hibernated (thanks to Rafael for his work on - this - I've just embraced and enhanced the support for TuxOnIce). This - should further reduce the need for you to use the noresume option. - -6. What do all those entries in /sys/power/tuxonice do? - - /sys/power/tuxonice is the directory which contains files you can use to - tune and configure TuxOnIce to your liking. The exact contents of - the directory will depend upon the version of TuxOnIce you're - running and the options you selected at compile time. In the following - descriptions, names in brackets refer to compile time options. - (Note that they're all dependent upon you having selected CONFIG_TUXONICE - in the first place!). - - Since the values of these settings can open potential security risks, the - writeable ones are accessible only to the root user. You may want to - configure sudo to allow you to invoke your hibernate script as an ordinary - user.
- - - alloc/failure_test - - This debugging option provides a way of testing TuxOnIce's handling of - memory allocation failures. Each allocation type that TuxOnIce makes has - been given a unique number (see the source code). Echo the appropriate - number into this entry, and when TuxOnIce attempts to do that allocation, - it will pretend there was a failure and act accordingly. - - - alloc/find_max_mem_allocated - - This debugging option will cause TuxOnIce to find the maximum amount of - memory it used during a cycle, and report that information in debugging - information at the end of the cycle. - - - alt_resume_param - - Instead of powering down after writing a hibernation image, TuxOnIce - supports resuming from a different image. This entry lets you set the - location of the signature for that image (the resume= value you'd use - for it). Using an alternate image and keep_image mode, you can do things - like using an alternate image to power down an uninterruptible power - supply. - - - block_io/target_outstanding_io - - This value controls the amount of memory that the block I/O code says it - needs when the core code is calculating how much memory is needed for - hibernating and for resuming. It doesn't directly control the amount of - I/O that is submitted at any one time - that depends on the amount of - available memory (we may have more available than we asked for), the - throughput that is being achieved and the ability of the CPU to keep up - with disk throughput (particularly where we're compressing pages). - - - checksum/enabled - - Use cryptoapi hashing routines to verify that Pageset2 pages don't change - while we're saving the first part of the image, and to get any pages that - do change resaved in the atomic copy. This should normally not be needed, - but if you're seeing issues, please enable this. If your issues stop you - being able to resume, enable this option, hibernate and cancel the cycle - after the atomic copy is done. 
If the debugging info shows a non-zero - number of pages resaved, please report this to Nigel. - - - compression/algorithm - - Set the cryptoapi algorithm used for compressing the image. - - - compression/expected_compression - - These values allow you to set an expected compression ratio, which TuxOnIce - will use in calculating whether it meets constraints on the image size. If - this expected compression ratio is not attained, the hibernation cycle will - abort, so it is wise to allow some spare. You can see what compression - ratio is achieved in the logs after hibernating. - - - debug_info: - - This file returns information about your configuration that may be helpful - in diagnosing problems with hibernating. - - - did_suspend_to_both: - - This file can be used when you hibernate with powerdown method 3 (ie suspend - to ram after writing the image). There can be two outcomes in this case. We - can resume from the suspend-to-ram before the battery runs out, or we can run - out of juice and end up resuming like normal. This entry lets you find out, - post resume, which way we went. If the value is 1, we resumed from suspend - to ram. This can be useful when actions need to be run post suspend-to-ram - that don't need to be run if we did the normal resume from power off. - - - do_hibernate: - - When anything is written to this file, the kernel side of TuxOnIce will - begin to attempt to write an image to disk and power down. You'll normally - want to run the hibernate script instead, to get modules unloaded first. - - - do_resume: - - When anything is written to this file TuxOnIce will attempt to read and - restore an image. If there is no image, it will return almost immediately. - If an image exists, the echo > will never return. Instead, the original - kernel context will be restored and the original echo > do_hibernate will - return. - - - */enabled - - These options can be used to temporarily disable various parts of TuxOnIce.
- - - extra_pages_allowance - - When TuxOnIce does its atomic copy, it calls the driver model suspend - and resume methods. If you have DRI enabled with a driver such as fglrx, - this can result in the driver allocating a substantial amount of memory - for storing its state. Extra_pages_allowance tells TuxOnIce how much - extra memory it should ensure is available for those allocations. If - your attempts at hibernating end with a message in dmesg indicating that - insufficient extra pages were allowed, you need to increase this value. - - - file/target: - - Read this value to get the current setting. Write to it to point TuxOnice - at a new storage location for the file allocator. See section 3.b.ii above - for details of how to set up the file allocator. - - - freezer_test - - This entry can be used to get TuxOnIce to just test the freezer and prepare - an image without actually doing a hibernation cycle. It is useful for - diagnosing freezing and image preparation issues. - - - full_pageset2 - - TuxOnIce divides the pages that are stored in an image into two sets. The - difference between the two sets is that pages in pageset 1 are atomically - copied, and pages in pageset 2 are written to disk without being copied - first. A page CAN be written to disk without being copied first if and only - if its contents will not be modified or used at any time after userspace - processes are frozen. A page MUST be in pageset 1 if its contents are - modified or used at any time after userspace processes have been frozen. - - Normally (ie if this option is enabled), TuxOnIce will put all pages on the - per-zone LRUs in pageset2, then remove those pages used by any userspace - user interface helper and TuxOnIce storage manager that are running, - together with pages used by the GEM memory manager introduced around 2.6.28 - kernels. - - If this option is disabled, a much more conservative approach will be taken. 
- The only pages in pageset2 will be those belonging to userspace processes, - with the exclusion of those belonging to the TuxOnIce userspace helpers - mentioned above. This will result in a much smaller pageset2, and will - therefore result in smaller images than are possible with this option - enabled. - - - ignore_rootfs - - TuxOnIce records which device is mounted as the root filesystem when - writing the hibernation image. It will normally check at resume time that - this device isn't already mounted - that would be a cause of filesystem - corruption. In some particular cases (RAM based root filesystems), you - might want to disable this check. This option allows you to do that. - - - image_exists: - - Can be used in a script to determine whether a valid image exists at the - location currently pointed to by resume=. Returns up to three lines. - The first is whether an image exists (-1 for unsure, otherwise 0 or 1). - If an image exists, additional lines will return the machine and version. - Echoing anything to this entry removes any current image. - - - image_size_limit: - - The maximum size of hibernation image written to disk, measured in megabytes - (1024*1024). - - - last_result: - - The result of the last hibernation cycle, as defined in - include/linux/suspend-debug.h with the values SUSPEND_ABORTED to - SUSPEND_KEPT_IMAGE. This is a bitmask. - - - late_cpu_hotplug: - - This sysfs entry controls whether cpu hotplugging is done - as normal - just - before (unplug) and after (replug) the atomic copy/restore (so that all - CPUs/cores are available for multithreaded I/O). The alternative is to - unplug all secondary CPUs/cores at the start of hibernating/resuming, and - replug them at the end of resuming. No multithreaded I/O will be possible in - this configuration, but the odd machine has been reported to require it.
- - - lid_file: - - This determines which ACPI button file we look in to determine whether the - lid is open or closed after resuming from suspend to disk or power off. - If the entry is set to "lid/LID", we'll open /proc/acpi/button/lid/LID/state - and check its contents at the appropriate moment. See post_wake_state below - for more details on how this entry is used. - - - log_everything (CONFIG_PM_DEBUG): - - Setting this option results in all messages printed being logged. Normally, - only a subset are logged, so as to not slow the process and not clutter the - logs. Useful for debugging. It can be toggled during a cycle by pressing - 'L'. - - - no_load_direct: - - This is a debugging option. If, when loading the atomically copied pages of - an image, TuxOnIce finds that the destination address for a page is free, - it will normally allocate the image, load the data directly into that - address and skip it in the atomic restore. If this option is disabled, the - page will be loaded somewhere else and atomically restored like other pages. - - - no_flusher_thread: - - When doing multithreaded I/O (see below), the first online CPU can be used - to _just_ submit compressed pages when writing the image, rather than - compressing and submitting data. This option is normally disabled, but has - been included because Nigel would like to see whether it will be more useful - as the number of cores/cpus in computers increases. - - - no_multithreaded_io: - - TuxOnIce will normally create one thread per cpu/core on your computer, - each of which will then perform I/O. This will generally result in - throughput that's the maximum the storage medium can handle. There - shouldn't be any reason to disable multithreaded I/O now, but this option - has been retained for debugging purposes. - - - no_pageset2 - - See the entry for full_pageset2 above for an explanation of pagesets. 
- Enabling this option causes TuxOnIce to do an atomic copy of all pages, - thereby limiting the maximum image size to 1/2 of memory, as swsusp does. - - - no_pageset2_if_unneeded - - See the entry for full_pageset2 above for an explanation of pagesets. - Enabling this option causes TuxOnIce to act like no_pageset2 was enabled - if and only if it isn't needed anyway. This option may still make TuxOnIce - less reliable because pageset2 pages are normally used to store the - atomic copy - drivers that want to do allocations of larger amounts of - memory in one shot will be more likely to find that those amounts aren't - available if this option is enabled. - - - pause_between_steps (CONFIG_PM_DEBUG): - - This option is used during debugging, to make TuxOnIce pause between - each step of the process. It is ignored when the nice display is on. - - - post_wake_state: - - TuxOnIce provides support for automatically waking after a user-selected - delay, and using a different powerdown method if the lid is still closed. - (Yes, we're assuming a laptop). This entry lets you choose what state - should be entered next. The values are those described under - powerdown_method, below. It can be used to suspend to RAM after hibernating, - then powerdown properly (say) 20 minutes later. It can also be used to power down - properly, then wake at (say) 6.30am and suspend to RAM until you're ready - to use the machine. - - - powerdown_method: - - Used to select a method by which TuxOnIce should powerdown after writing the - image. Currently: - - 0: Don't use ACPI to power off. - 3: Attempt to enter Suspend-to-ram. - 4: Attempt to enter ACPI S4 mode. - 5: Attempt to power down via ACPI S5 mode. - - Note that these options are highly dependent upon your hardware & software: - - 3: When successful, your machine suspends to ram instead of powering off. - The advantage of using this mode is that it doesn't matter whether your - battery has enough charge to make it through to your next resume.
If it - lasts, you will simply resume from suspend to ram (and the image on disk - will be discarded). If the battery runs out, you will resume from disk - instead. The disadvantage is that it takes longer than a normal - suspend-to-ram to enter the state, since the suspend-to-disk image needs - to be written first. - 4/5: When successful, your machine will be off and consume (almost) no power. - But it might still react to some external events like opening the lid or - traffic on a network or usb device. For the bios, resume is then the same - as warm boot, similar to a situation where you used the command `reboot' - to reboot your machine. If your machine has problems on warm boot or if - you want to protect your machine with the bios password, this is probably - not the right choice. Mode 4 may be necessary on some machines where ACPI - wake up methods need to be run to properly reinitialise hardware after a - hibernation cycle. - 0: Switch the machine completely off. The only possible wakeup is the power - button. For the bios, resume is then the same as a cold boot, in - particular you would have to provide your bios boot password if your - machine uses that feature for booting. - - - progressbar_granularity_limit: - - This option can be used to limit the granularity of the progress bar - displayed with a bootsplash screen. The value is the maximum number of - steps. That is, 10 will make the progress bar jump in 10% increments. - - - reboot: - - This option causes TuxOnIce to reboot rather than powering down - at the end of saving an image. It can be toggled during a cycle by pressing - 'R'. - - - resume: - - This sysfs entry can be used to read and set the location in which TuxOnIce - will look for the signature of an image - the value set using resume= at - boot time or CONFIG_PM_STD_PARTITION ("Default resume partition").
By - writing to this file as well as modifying your bootloader's configuration - file (eg menu.lst), you can set or reset the location of your image or the - method of storing the image without rebooting. - - - replace_swsusp (CONFIG_TOI_REPLACE_SWSUSP): - - This option makes - - echo disk > /sys/power/state - - activate TuxOnIce instead of swsusp. Regardless of whether this option is - enabled, any invocation of swsusp's resume time trigger will cause TuxOnIce - to check for an image too. This is due to the fact that at resume time, we - can't know whether this option was enabled until we see if an image is there - for us to resume from. (And when an image exists, we don't care whether we - did replace swsusp anyway - we just want to resume). - - - resume_commandline: - - This entry can be read after resuming to see the commandline that was used - when resuming began. You might use this to set up two bootloader entries - that are the same apart from the fact that one includes a extra append= - argument "at_work=1". You could then grep resume_commandline in your - post-resume scripts and configure networking (for example) differently - depending upon whether you're at home or work. resume_commandline can be - set to arbitrary text if you wish to remove sensitive contents. - - - swap/swapfilename: - - This entry is used to specify the swapfile or partition that - TuxOnIce will attempt to swapon/swapoff automatically. Thus, if - I normally use /dev/hda1 for swap, and want to use /dev/hda2 for specifically - for my hibernation image, I would - - echo /dev/hda2 > /sys/power/tuxonice/swap/swapfile - - /dev/hda2 would then be automatically swapon'd and swapoff'd. Note that the - swapon and swapoff occur while other processes are frozen (including kswapd) - so this swap file will not be used up when attempting to free memory. The - parition/file is also given the highest priority, so other swapfiles/partitions - will only be used to save the image when this one is filled. 
- - The value of this file is used by headerlocations along with any currently - activated swapfiles/partitions. - - - swap/headerlocations: - - This option tells you the resume= options to use for swap devices you - currently have activated. It is particularly useful when you only want to - use a swap file to store your image. See above for further details. - - - test_bio - - This is a debugging option. When enabled, TuxOnIce will not hibernate. - Instead, when asked to write an image, it will skip the atomic copy, - just doing the writing of the image and then returning control to the - user at the point where it would have powered off. This is useful for - testing throughput in different configurations. - - - test_filter_speed - - This is a debugging option. When enabled, TuxOnIce will not hibernate. - Instead, when asked to write an image, it will not write anything or do - an atomic copy, but will only run any enabled compression algorithm on the - data that would have been written (the source pages of the atomic copy in - the case of pageset 1). This is useful for comparing the performance of - compression algorithms and for determining the extent to which an upgrade - to your storage method would improve hibernation speed. - - - user_interface/debug_sections (CONFIG_PM_DEBUG): - - This value, together with the console log level, controls what debugging - information is displayed. The console log level determines the level of - detail, and this value determines what detail is displayed. This value is - a bit vector, and the meaning of the bits can be found in the kernel tree - in include/linux/tuxonice.h. It can be overridden using the kernel's - command line option suspend_dbg. - - - user_interface/default_console_level (CONFIG_PM_DEBUG): - - This determines the value of the console log level at the start of a - hibernation cycle. If debugging is compiled in, the console log level can be - changed during a cycle by pressing the digit keys. 
Meanings are: - - 0: Nice display. - 1: Nice display plus numerical progress. - 2: Errors only. - 3: Low level debugging info. - 4: Medium level debugging info. - 5: High level debugging info. - 6: Verbose debugging info. - - - user_interface/enable_escape: - - Setting this to "1" will enable you abort a hibernation cycle or resuming by - pressing escape, "0" (default) disables this feature. Note that enabling - this option means that you cannot initiate a hibernation cycle and then walk - away from your computer, expecting it to be secure. With feature disabled, - you can validly have this expectation once TuxOnice begins to write the - image to disk. (Prior to this point, it is possible that TuxOnice might - about because of failure to freeze all processes or because constraints - on its ability to save the image are not met). - - - user_interface/program - - This entry is used to tell TuxOnice what userspace program to use for - providing a user interface while hibernating. The program uses a netlink - socket to pass messages back and forward to the kernel, allowing all of the - functions formerly implemented in the kernel user interface components. - - - version: - - The version of TuxOnIce you have compiled into the currently running kernel. - - - wake_alarm_dir: - - As mentioned above (post_wake_state), TuxOnIce supports automatically waking - after some delay. This entry allows you to select which wake alarm to use. - It should contain the value "rtc0" if you're wanting to use - /sys/class/rtc/rtc0. - - - wake_delay: - - This value determines the delay from the end of writing the image until the - wake alarm is triggered. You can set an absolute time by writing the desired - time into /sys/class/rtc//wakealarm and leaving these values - empty. - - Note that for the wakeup to actually occur, you may need to modify entries - in /proc/acpi/wakeup. This is done by echoing the name of the button in the - first column (eg PBTN) into the file. - -7. 
How do you get support? - - Glad you asked. TuxOnIce is being actively maintained and supported - by Nigel (the guy doing most of the kernel coding at the moment), Bernard - (who maintains the hibernate script and userspace user interface components) - and its users. - - Resources availble include HowTos, FAQs and a Wiki, all available via - tuxonice.net. You can find the mailing lists there. - -8. I think I've found a bug. What should I do? - - By far and a way, the most common problems people have with TuxOnIce - related to drivers not having adequate power management support. In this - case, it is not a bug with TuxOnIce, but we can still help you. As we - mentioned above, such issues can usually be worked around by building the - functionality as modules and unloading them while hibernating. Please visit - the Wiki for up-to-date lists of known issues and work arounds. - - If this information doesn't help, try running: - - hibernate --bug-report - - ..and sending the output to the users mailing list. - - Good information on how to provide us with useful information from an - oops is found in the file REPORTING-BUGS, in the top level directory - of the kernel tree. If you get an oops, please especially note the - information about running what is printed on the screen through ksymoops. - The raw information is useless. - -9. When will XXX be supported? - - If there's a feature missing from TuxOnIce that you'd like, feel free to - ask. We try to be obliging, within reason. - - Patches are welcome. Please send to the list. - -10. How does it work? - - TuxOnIce does its work in a number of steps. - - a. Freezing system activity. - - The first main stage in hibernating is to stop all other activity. This is - achieved in stages. 
Processes are considered in fours groups, which we will - describe in reverse order for clarity's sake: Threads with the PF_NOFREEZE - flag, kernel threads without this flag, userspace processes with the - PF_SYNCTHREAD flag and all other processes. The first set (PF_NOFREEZE) are - untouched by the refrigerator code. They are allowed to run during hibernating - and resuming, and are used to support user interaction, storage access or the - like. Other kernel threads (those unneeded while hibernating) are frozen last. - This leaves us with userspace processes that need to be frozen. When a - process enters one of the *_sync system calls, we set a PF_SYNCTHREAD flag on - that process for the duration of that call. Processes that have this flag are - frozen after processes without it, so that we can seek to ensure that dirty - data is synced to disk as quickly as possible in a situation where other - processes may be submitting writes at the same time. Freezing the processes - that are submitting data stops new I/O from being submitted. Syncthreads can - then cleanly finish their work. So the order is: - - - Userspace processes without PF_SYNCTHREAD or PF_NOFREEZE; - - Userspace processes with PF_SYNCTHREAD (they won't have NOFREEZE); - - Kernel processes without PF_NOFREEZE. - - b. Eating memory. - - For a successful hibernation cycle, you need to have enough disk space to store the - image and enough memory for the various limitations of TuxOnIce's - algorithm. You can also specify a maximum image size. In order to attain - to those constraints, TuxOnIce may 'eat' memory. If, after freezing - processes, the constraints aren't met, TuxOnIce will thaw all the - other processes and begin to eat memory until its calculations indicate - the constraints are met. It will then freeze processes again and recheck - its calculations. - - c. Allocation of storage. - - Next, TuxOnIce allocates the storage that will be used to save - the image. 
- - The core of TuxOnIce knows nothing about how or where pages are stored. We - therefore request the active allocator (remember you might have compiled in - more than one!) to allocate enough storage for our expect image size. If - this request cannot be fulfilled, we eat more memory and try again. If it - is fulfiled, we seek to allocate additional storage, just in case our - expected compression ratio (if any) isn't achieved. This time, however, we - just continue if we can't allocate enough storage. - - If these calls to our allocator change the characteristics of the image - such that we haven't allocated enough memory, we also loop. (The allocator - may well need to allocate space for its storage information). - - d. Write the first part of the image. - - TuxOnIce stores the image in two sets of pages called 'pagesets'. - Pageset 2 contains pages on the active and inactive lists; essentially - the page cache. Pageset 1 contains all other pages, including the kernel. - We use two pagesets for one important reason: We need to make an atomic copy - of the kernel to ensure consistency of the image. Without a second pageset, - that would limit us to an image that was at most half the amount of memory - available. Using two pagesets allows us to store a full image. Since pageset - 2 pages won't be needed in saving pageset 1, we first save pageset 2 pages. - We can then make our atomic copy of the remaining pages using both pageset 2 - pages and any other pages that are free. While saving both pagesets, we are - careful not to corrupt the image. Among other things, we use lowlevel block - I/O routines that don't change the pagecache contents. - - The next step, then, is writing pageset 2. - - e. Suspending drivers and storing processor context. - - Having written pageset2, TuxOnIce calls the power management functions to - notify drivers of the hibernation, and saves the processor state in preparation - for the atomic copy of memory we are about to make. - - f. 
Atomic copy. - - At this stage, everything else but the TuxOnIce code is halted. Processes - are frozen or idling, drivers are quiesced and have stored (ideally and where - necessary) their configuration in memory we are about to atomically copy. - In our lowlevel architecture specific code, we have saved the CPU state. - We can therefore now do our atomic copy before resuming drivers etc. - - g. Save the atomic copy (pageset 1). - - TuxOnice can then write the atomic copy of the remaining pages. Since we - have copied the pages into other locations, we can continue to use the - normal block I/O routines without fear of corruption our image. - - f. Save the image header. - - Nearly there! We save our settings and other parameters needed for - reloading pageset 1 in an 'image header'. We also tell our allocator to - serialise its data at this stage, so that it can reread the image at resume - time. - - g. Set the image header. - - Finally, we edit the header at our resume= location. The signature is - changed by the allocator to reflect the fact that an image exists, and to - point to the start of that data if necessary (swap allocator). - - h. Power down. - - Or reboot if we're debugging and the appropriate option is selected. - - Whew! - - Reloading the image. - -------------------- - - Reloading the image is essentially the reverse of all the above. We load - our copy of pageset 1, being careful to choose locations that aren't going - to be overwritten as we copy it back (We start very early in the boot - process, so there are no other processes to quiesce here). We then copy - pageset 1 back to its original location in memory and restore the process - context. We are now running with the original kernel. Next, we reload the - pageset 2 pages, free the memory and swap used by TuxOnIce, restore - the pageset header and restart processes. Sounds easy in comparison to - hibernating, doesn't it! 
- - There is of course more to TuxOnIce than this, but this explanation - should be a good start. If there's interest, I'll write further - documentation on range pages and the low level I/O. - -11. Who wrote TuxOnIce? - - (Answer based on the writings of Florent Chabaud, credits in files and - Nigel's limited knowledge; apologies to anyone missed out!) - - The main developers of TuxOnIce have been... - - Gabor Kuti - Pavel Machek - Florent Chabaud - Bernard Blackham - Nigel Cunningham - - Significant portions of swsusp, the code in the vanilla kernel which - TuxOnIce enhances, have been worked on by Rafael Wysocki. Thanks should - also be expressed to him. - - The above mentioned developers have been aided in their efforts by a host - of hundreds, if not thousands of testers and people who have submitted bug - fixes & suggestions. Of special note are the efforts of Michael Frank, who - had his computers repetitively hibernate and resume for literally tens of - thousands of cycles and developed scripts to stress the system and test - TuxOnIce far beyond the point most of us (Nigel included!) would consider - testing. His efforts have contributed as much to TuxOnIce as any of the - names above. diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index b784c2701..5d1128bf0 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -250,6 +250,12 @@ dentry names: Passed by reference. +block_device names: + + %pg sda, sda1 or loop0p1 + + For printing name of block_device pointers. + struct va_format: %pV @@ -300,15 +306,6 @@ Network device features: Passed by reference. -Command from struct task_struct - - %pT ls - - For printing executable name excluding path from struct - task_struct. - - Passed by reference. - If you add other %p extensions, please extend lib/test_printf.c with one or more test cases, if at all feasible. 
diff --git a/Documentation/s390/zfcpdump.txt b/Documentation/s390/zfcpdump.txt index dc929be96..b064aa597 100644 --- a/Documentation/s390/zfcpdump.txt +++ b/Documentation/s390/zfcpdump.txt @@ -15,19 +15,15 @@ the s390-tools package) to make the device bootable. The operator of a Linux system can then trigger a SCSI dump by booting the SCSI disk, where zfcpdump resides on. -The kernel part of zfcpdump is implemented as a debugfs file under "zcore/mem", -which exports memory and registers of the crashed Linux in an s390 -standalone dump format. It can be used in the same way as e.g. /dev/mem. The -dump format defines a 4K header followed by plain uncompressed memory. The -register sets are stored in the prefix pages of the respective CPUs. To build a -dump enabled kernel with the zcore driver, the kernel config option -CONFIG_CRASH_DUMP has to be set. When reading from "zcore/mem", the part of -memory, which has been saved by hardware is read by the driver via the SCLP -hardware interface. The second part is just copied from the non overwritten real -memory. - -Since kernel version 3.12 also the /proc/vmcore file can also be used to access -the dump. +The user space dump tool accesses the memory of the crashed system by means +of the /proc/vmcore interface. This interface exports the crashed system's +memory and registers in ELF core dump format. To access the memory which has +been saved by the hardware, SCLP requests will be created at the time the data +is needed by /proc/vmcore. The tail part of the crashed system's memory which +has not been stashed by hardware can just be copied from real memory. + +To build a dump enabled kernel the kernel config option CONFIG_CRASH_DUMP +has to be set. To get a valid zfcpdump kernel configuration use "make zfcpdump_defconfig".
diff --git a/Documentation/scsi/link_power_management_policy.txt b/Documentation/scsi/link_power_management_policy.txt index 0285601a6..d18993d01 100644 --- a/Documentation/scsi/link_power_management_policy.txt +++ b/Documentation/scsi/link_power_management_policy.txt @@ -1,11 +1,8 @@ This parameter allows the user to set the link (interface) power management. -There are 4 possible options: +There are 3 possible options: Value Effect ---------------------------------------------------------------------------- -firmware_defaults Inherit configuration from the state programmed by - the firmware during system init. - min_power Tell the controller to try to make the link use the least possible power when possible. This may sacrifice some performance due to increased latency diff --git a/Documentation/security/keys-trusted-encrypted.txt b/Documentation/security/keys-trusted-encrypted.txt index e105ae97a..324ddf522 100644 --- a/Documentation/security/keys-trusted-encrypted.txt +++ b/Documentation/security/keys-trusted-encrypted.txt @@ -27,17 +27,26 @@ Usage: keyctl print keyid options: - keyhandle= ascii hex value of sealing key default 0x40000000 (SRK) - keyauth= ascii hex auth for sealing key default 0x00...i - (40 ascii zeros) - blobauth= ascii hex auth for sealed data default 0x00... - (40 ascii zeros) - blobauth= ascii hex auth for sealed data default 0x00... - (40 ascii zeros) - pcrinfo= ascii hex of PCR_INFO or PCR_INFO_LONG (no default) - pcrlock= pcr number to be extended to "lock" blob - migratable= 0|1 indicating permission to reseal to new PCR values, - default 1 (resealing allowed) + keyhandle= ascii hex value of sealing key default 0x40000000 (SRK) + keyauth= ascii hex auth for sealing key default 0x00...i + (40 ascii zeros) + blobauth= ascii hex auth for sealed data default 0x00... + (40 ascii zeros) + blobauth= ascii hex auth for sealed data default 0x00... 
+ (40 ascii zeros) + pcrinfo= ascii hex of PCR_INFO or PCR_INFO_LONG (no default) + pcrlock= pcr number to be extended to "lock" blob + migratable= 0|1 indicating permission to reseal to new PCR values, + default 1 (resealing allowed) + hash= hash algorithm name as a string. For TPM 1.x the only + allowed value is sha1. For TPM 2.x the allowed values + are sha1, sha256, sha384, sha512 and sm3-256. + policydigest= digest for the authorization policy. must be calculated + with the same hash algorithm as specified by the 'hash=' + option. + policyhandle= handle to an authorization policy session that defines the + same policy and with the same hash algorithm as was used to + seal the key. "keyctl print" returns an ascii hex copy of the sealed key, which is in standard TPM_STORED_DATA format. The key length for new keys are always in bytes. diff --git a/Documentation/sound/alsa/img,spdif-in.txt b/Documentation/sound/alsa/img,spdif-in.txt new file mode 100644 index 000000000..8b7505785 --- /dev/null +++ b/Documentation/sound/alsa/img,spdif-in.txt @@ -0,0 +1,49 @@ +The Imagination Technologies SPDIF Input controller contains the following +controls: + +name='IEC958 Capture Mask',index=0 + +This control returns a mask that shows which of the IEC958 status bits +can be read using the 'IEC958 Capture Default' control. + +name='IEC958 Capture Default',index=0 + +This control returns the status bits contained within the SPDIF stream that +is being received. The 'IEC958 Capture Mask' shows which bits can be read +from this control. + +name='SPDIF In Multi Frequency Acquire',index=0 +name='SPDIF In Multi Frequency Acquire',index=1 +name='SPDIF In Multi Frequency Acquire',index=2 +name='SPDIF In Multi Frequency Acquire',index=3 + +This control is used to attempt acquisition of up to four different sample +rates. The active rate can be obtained by reading the 'SPDIF In Lock Frequency' +control. 
+ +When the value of this control is set to {0,0,0,0}, the rate given to hw_params +will determine the single rate the block will capture. Otherwise, the rate given to +hw_params will be ignored, and the block will attempt capture for each of the +four sample rates set here. + +If fewer than four rates are required, the same rate can be specified more than +once. + +name='SPDIF In Lock Frequency',index=0 + +This control returns the active capture rate, or 0 if a lock has not been +acquired. + +name='SPDIF In Lock TRK',index=0 + +This control is used to modify the locking/jitter rejection characteristics +of the block. Larger values increase the locking range, but reduce jitter +rejection. + +name='SPDIF In Lock Acquire Threshold',index=0 + +This control is used to change the threshold at which a lock is acquired. + +name='SPDIF In Lock Release Threshold',index=0 + +This control is used to change the threshold at which a lock is released. diff --git a/Documentation/spi/.gitignore b/Documentation/spi/.gitignore deleted file mode 100644 index 428057639..000000000 --- a/Documentation/spi/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -spidev_fdx -spidev_test diff --git a/Documentation/spi/00-INDEX b/Documentation/spi/00-INDEX index a128fa835..4644bf0d9 100644 --- a/Documentation/spi/00-INDEX +++ b/Documentation/spi/00-INDEX @@ -10,13 +10,9 @@ pxa2xx - PXA2xx SPI master controller build by spi_message fifo wq spidev - Intro to the userspace API for spi devices -spidev_fdx.c - - spidev example file spi-lm70llp - Connecting an LM70-LLP sensor to the kernel via the SPI subsys. spi-sc18is602 - NXP SC18IS602/603 I2C-bus to SPI bridge spi-summary - (Linux) SPI overview. If unsure about SPI or SPI in Linux, start here. -spidev_test.c - - SPI testing utility.
diff --git a/Documentation/spi/Makefile b/Documentation/spi/Makefile deleted file mode 100644 index efa255813..000000000 --- a/Documentation/spi/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -# List of programs to build -hostprogs-y := spidev_test spidev_fdx - -# Tell kbuild to always build the programs -always := $(hostprogs-y) - -HOSTCFLAGS_spidev_test.o += -I$(objtree)/usr/include -HOSTCFLAGS_spidev_fdx.o += -I$(objtree)/usr/include diff --git a/Documentation/spi/spidev_fdx.c b/Documentation/spi/spidev_fdx.c deleted file mode 100644 index 0ea3e5129..000000000 --- a/Documentation/spi/spidev_fdx.c +++ /dev/null @@ -1,158 +0,0 @@ -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - - -static int verbose; - -static void do_read(int fd, int len) -{ - unsigned char buf[32], *bp; - int status; - - /* read at least 2 bytes, no more than 32 */ - if (len < 2) - len = 2; - else if (len > sizeof(buf)) - len = sizeof(buf); - memset(buf, 0, sizeof buf); - - status = read(fd, buf, len); - if (status < 0) { - perror("read"); - return; - } - if (status != len) { - fprintf(stderr, "short read\n"); - return; - } - - printf("read(%2d, %2d): %02x %02x,", len, status, - buf[0], buf[1]); - status -= 2; - bp = buf + 2; - while (status-- > 0) - printf(" %02x", *bp++); - printf("\n"); -} - -static void do_msg(int fd, int len) -{ - struct spi_ioc_transfer xfer[2]; - unsigned char buf[32], *bp; - int status; - - memset(xfer, 0, sizeof xfer); - memset(buf, 0, sizeof buf); - - if (len > sizeof buf) - len = sizeof buf; - - buf[0] = 0xaa; - xfer[0].tx_buf = (unsigned long)buf; - xfer[0].len = 1; - - xfer[1].rx_buf = (unsigned long) buf; - xfer[1].len = len; - - status = ioctl(fd, SPI_IOC_MESSAGE(2), xfer); - if (status < 0) { - perror("SPI_IOC_MESSAGE"); - return; - } - - printf("response(%2d, %2d): ", len, status); - for (bp = buf; len; len--) - printf(" %02x", *bp++); - printf("\n"); -} - -static void dumpstat(const char *name, int fd) -{ - __u8 
lsb, bits; - __u32 mode, speed; - - if (ioctl(fd, SPI_IOC_RD_MODE32, &mode) < 0) { - perror("SPI rd_mode"); - return; - } - if (ioctl(fd, SPI_IOC_RD_LSB_FIRST, &lsb) < 0) { - perror("SPI rd_lsb_fist"); - return; - } - if (ioctl(fd, SPI_IOC_RD_BITS_PER_WORD, &bits) < 0) { - perror("SPI bits_per_word"); - return; - } - if (ioctl(fd, SPI_IOC_RD_MAX_SPEED_HZ, &speed) < 0) { - perror("SPI max_speed_hz"); - return; - } - - printf("%s: spi mode 0x%x, %d bits %sper word, %d Hz max\n", - name, mode, bits, lsb ? "(lsb first) " : "", speed); -} - -int main(int argc, char **argv) -{ - int c; - int readcount = 0; - int msglen = 0; - int fd; - const char *name; - - while ((c = getopt(argc, argv, "hm:r:v")) != EOF) { - switch (c) { - case 'm': - msglen = atoi(optarg); - if (msglen < 0) - goto usage; - continue; - case 'r': - readcount = atoi(optarg); - if (readcount < 0) - goto usage; - continue; - case 'v': - verbose++; - continue; - case 'h': - case '?': -usage: - fprintf(stderr, - "usage: %s [-h] [-m N] [-r N] /dev/spidevB.D\n", - argv[0]); - return 1; - } - } - - if ((optind + 1) != argc) - goto usage; - name = argv[optind]; - - fd = open(name, O_RDWR); - if (fd < 0) { - perror("open"); - return 1; - } - - dumpstat(name, fd); - - if (msglen) - do_msg(fd, msglen); - - if (readcount) - do_read(fd, readcount); - - close(fd); - return 0; -} diff --git a/Documentation/spi/spidev_test.c b/Documentation/spi/spidev_test.c deleted file mode 100644 index 135b3f592..000000000 --- a/Documentation/spi/spidev_test.c +++ /dev/null @@ -1,318 +0,0 @@ -/* - * SPI testing utility (using spidev driver) - * - * Copyright (c) 2007 MontaVista Software, Inc. - * Copyright (c) 2007 Anton Vorontsov - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License. 
- * - * Cross-compile with cross-gcc -I/path/to/cross-kernel/include - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0])) - -static void pabort(const char *s) -{ - perror(s); - abort(); -} - -static const char *device = "/dev/spidev1.1"; -static uint32_t mode; -static uint8_t bits = 8; -static uint32_t speed = 500000; -static uint16_t delay; -static int verbose; - -uint8_t default_tx[] = { - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0x40, 0x00, 0x00, 0x00, 0x00, 0x95, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xF0, 0x0D, -}; - -uint8_t default_rx[ARRAY_SIZE(default_tx)] = {0, }; -char *input_tx; - -static void hex_dump(const void *src, size_t length, size_t line_size, char *prefix) -{ - int i = 0; - const unsigned char *address = src; - const unsigned char *line = address; - unsigned char c; - - printf("%s | ", prefix); - while (length-- > 0) { - printf("%02X ", *address++); - if (!(++i % line_size) || (length == 0 && i % line_size)) { - if (length == 0) { - while (i++ % line_size) - printf("__ "); - } - printf(" | "); /* right close */ - while (line < address) { - c = *line++; - printf("%c", (c < 33 || c == 255) ? 
0x2E : c); - } - printf("\n"); - if (length > 0) - printf("%s | ", prefix); - } - } -} - -/* - * Unescape - process hexadecimal escape character - * converts shell input "\x23" -> 0x23 - */ -static int unescape(char *_dst, char *_src, size_t len) -{ - int ret = 0; - char *src = _src; - char *dst = _dst; - unsigned int ch; - - while (*src) { - if (*src == '\\' && *(src+1) == 'x') { - sscanf(src + 2, "%2x", &ch); - src += 4; - *dst++ = (unsigned char)ch; - } else { - *dst++ = *src++; - } - ret++; - } - return ret; -} - -static void transfer(int fd, uint8_t const *tx, uint8_t const *rx, size_t len) -{ - int ret; - - struct spi_ioc_transfer tr = { - .tx_buf = (unsigned long)tx, - .rx_buf = (unsigned long)rx, - .len = len, - .delay_usecs = delay, - .speed_hz = speed, - .bits_per_word = bits, - }; - - if (mode & SPI_TX_QUAD) - tr.tx_nbits = 4; - else if (mode & SPI_TX_DUAL) - tr.tx_nbits = 2; - if (mode & SPI_RX_QUAD) - tr.rx_nbits = 4; - else if (mode & SPI_RX_DUAL) - tr.rx_nbits = 2; - if (!(mode & SPI_LOOP)) { - if (mode & (SPI_TX_QUAD | SPI_TX_DUAL)) - tr.rx_buf = 0; - else if (mode & (SPI_RX_QUAD | SPI_RX_DUAL)) - tr.tx_buf = 0; - } - - ret = ioctl(fd, SPI_IOC_MESSAGE(1), &tr); - if (ret < 1) - pabort("can't send spi message"); - - if (verbose) - hex_dump(tx, len, 32, "TX"); - hex_dump(rx, len, 32, "RX"); -} - -static void print_usage(const char *prog) -{ - printf("Usage: %s [-DsbdlHOLC3]\n", prog); - puts(" -D --device device to use (default /dev/spidev1.1)\n" - " -s --speed max speed (Hz)\n" - " -d --delay delay (usec)\n" - " -b --bpw bits per word \n" - " -l --loop loopback\n" - " -H --cpha clock phase\n" - " -O --cpol clock polarity\n" - " -L --lsb least significant bit first\n" - " -C --cs-high chip select active high\n" - " -3 --3wire SI/SO signals shared\n" - " -v --verbose Verbose (show tx buffer)\n" - " -p Send data (e.g. 
\"1234\\xde\\xad\")\n" - " -N --no-cs no chip select\n" - " -R --ready slave pulls low to pause\n" - " -2 --dual dual transfer\n" - " -4 --quad quad transfer\n"); - exit(1); -} - -static void parse_opts(int argc, char *argv[]) -{ - while (1) { - static const struct option lopts[] = { - { "device", 1, 0, 'D' }, - { "speed", 1, 0, 's' }, - { "delay", 1, 0, 'd' }, - { "bpw", 1, 0, 'b' }, - { "loop", 0, 0, 'l' }, - { "cpha", 0, 0, 'H' }, - { "cpol", 0, 0, 'O' }, - { "lsb", 0, 0, 'L' }, - { "cs-high", 0, 0, 'C' }, - { "3wire", 0, 0, '3' }, - { "no-cs", 0, 0, 'N' }, - { "ready", 0, 0, 'R' }, - { "dual", 0, 0, '2' }, - { "verbose", 0, 0, 'v' }, - { "quad", 0, 0, '4' }, - { NULL, 0, 0, 0 }, - }; - int c; - - c = getopt_long(argc, argv, "D:s:d:b:lHOLC3NR24p:v", lopts, NULL); - - if (c == -1) - break; - - switch (c) { - case 'D': - device = optarg; - break; - case 's': - speed = atoi(optarg); - break; - case 'd': - delay = atoi(optarg); - break; - case 'b': - bits = atoi(optarg); - break; - case 'l': - mode |= SPI_LOOP; - break; - case 'H': - mode |= SPI_CPHA; - break; - case 'O': - mode |= SPI_CPOL; - break; - case 'L': - mode |= SPI_LSB_FIRST; - break; - case 'C': - mode |= SPI_CS_HIGH; - break; - case '3': - mode |= SPI_3WIRE; - break; - case 'N': - mode |= SPI_NO_CS; - break; - case 'v': - verbose = 1; - break; - case 'R': - mode |= SPI_READY; - break; - case 'p': - input_tx = optarg; - break; - case '2': - mode |= SPI_TX_DUAL; - break; - case '4': - mode |= SPI_TX_QUAD; - break; - default: - print_usage(argv[0]); - break; - } - } - if (mode & SPI_LOOP) { - if (mode & SPI_TX_DUAL) - mode |= SPI_RX_DUAL; - if (mode & SPI_TX_QUAD) - mode |= SPI_RX_QUAD; - } -} - -int main(int argc, char *argv[]) -{ - int ret = 0; - int fd; - uint8_t *tx; - uint8_t *rx; - int size; - - parse_opts(argc, argv); - - fd = open(device, O_RDWR); - if (fd < 0) - pabort("can't open device"); - - /* - * spi mode - */ - ret = ioctl(fd, SPI_IOC_WR_MODE32, &mode); - if (ret == -1) - pabort("can't set 
spi mode"); - - ret = ioctl(fd, SPI_IOC_RD_MODE32, &mode); - if (ret == -1) - pabort("can't get spi mode"); - - /* - * bits per word - */ - ret = ioctl(fd, SPI_IOC_WR_BITS_PER_WORD, &bits); - if (ret == -1) - pabort("can't set bits per word"); - - ret = ioctl(fd, SPI_IOC_RD_BITS_PER_WORD, &bits); - if (ret == -1) - pabort("can't get bits per word"); - - /* - * max speed hz - */ - ret = ioctl(fd, SPI_IOC_WR_MAX_SPEED_HZ, &speed); - if (ret == -1) - pabort("can't set max speed hz"); - - ret = ioctl(fd, SPI_IOC_RD_MAX_SPEED_HZ, &speed); - if (ret == -1) - pabort("can't get max speed hz"); - - printf("spi mode: 0x%x\n", mode); - printf("bits per word: %d\n", bits); - printf("max speed: %d Hz (%d KHz)\n", speed, speed/1000); - - if (input_tx) { - size = strlen(input_tx+1); - tx = malloc(size); - rx = malloc(size); - size = unescape((char *)tx, input_tx, size); - transfer(fd, tx, rx, size); - free(rx); - free(tx); - } else { - transfer(fd, default_tx, default_rx, sizeof(default_tx)); - } - - close(fd); - - return ret; -} diff --git a/Documentation/stable_kernel_rules.txt b/Documentation/stable_kernel_rules.txt index 3049a6122..ffd4575ec 100644 --- a/Documentation/stable_kernel_rules.txt +++ b/Documentation/stable_kernel_rules.txt @@ -93,7 +93,7 @@ format in the sign-off area: Also, some patches may have kernel version prerequisites. This can be specified in the following format in the sign-off area: - Cc: # 3.3.x- + Cc: # 3.3.x- The tag has the meaning of: git cherry-pick diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt index 88152f214..302b5ed61 100644 --- a/Documentation/sysctl/fs.txt +++ b/Documentation/sysctl/fs.txt @@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/fs: - nr_open - overflowuid - overflowgid +- pipe-user-pages-hard +- pipe-user-pages-soft - protected_hardlinks - protected_symlinks - suid_dumpable @@ -159,6 +161,27 @@ The default is 65534. 
============================================================== +pipe-user-pages-hard: + +Maximum total number of pages a non-privileged user may allocate for pipes. +Once this limit is reached, no new pipes may be allocated until usage goes +below the limit again. When set to 0, no limit is applied, which is the default +setting. + +============================================================== + +pipe-user-pages-soft: + +Maximum total number of pages a non-privileged user may allocate for pipes +before the pipe size gets limited to a single page. Once this limit is reached, +new pipes will be limited to a single page in size for this user in order to +limit total memory usage, and trying to increase their size using fcntl() will be +denied until usage goes below the limit again. The default value allows +allocating up to 1024 pipes at their default size. When set to 0, no limit is +applied. + +============================================================== + protected_hardlinks: A long-standing class of security issues is the hardlink-based diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index af70d1541..87119dc9b 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -551,6 +551,21 @@ the recommended setting is 60. ============================================================== +panic_on_io_nmi: + +Controls the kernel's behavior when a CPU receives an NMI caused by +an IO error. + +0: try to continue operation (default) + +1: panic immediately. The IO error triggered an NMI. This indicates a + serious system condition which could result in IO data corruption. + Rather than continuing, panicking might be a better choice. Some + servers issue this sort of NMI when the dump button is pushed, + and you can use this option to take a crash dump. + +============================================================== + panic_on_oops: Controls the kernel's behaviour when an oops or BUG is encountered.
@@ -745,6 +760,14 @@ rtsig-nr shows the number of RT signals currently queued. ============================================================== +sched_schedstats: + +Enables/disables scheduler statistics. Enabling this feature +incurs a small amount of overhead in the scheduler but is +useful for debugging and performance tuning. + +============================================================== + sg-big-buff: This file shows the size of the generic SCSI (sg) buffer. @@ -810,14 +833,13 @@ via the /proc/sys interface: Each write syscall must fully contain the sysctl value to be written, and multiple writes on the same sysctl file descriptor will rewrite the sysctl value, regardless of file position. - 0 - (default) Same behavior as above, but warn about processes that - perform writes to a sysctl file descriptor when the file position - is not 0. - 1 - Respect file position when writing sysctl strings. Multiple writes - will append to the sysctl value buffer. Anything past the max length - of the sysctl value buffer will be ignored. Writes to numeric sysctl - entries must always be at file position 0 and the value must be - fully contained in the buffer sent in the write syscall. + 0 - Same behavior as above, but warn about processes that perform writes + to a sysctl file descriptor when the file position is not 0. + 1 - (default) Respect file position when writing sysctl strings. Multiple + writes will append to the sysctl value buffer. Anything past the max + length of the sysctl value buffer will be ignored. Writes to numeric + sysctl entries must always be at file position 0 and the value must + be fully contained in the buffer sent in the write syscall. 
============================================================== diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index f72370b44..89a887c76 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -42,6 +42,8 @@ Currently, these files are in /proc/sys/vm: - min_slab_ratio - min_unmapped_ratio - mmap_min_addr +- mmap_rnd_bits +- mmap_rnd_compat_bits - nr_hugepages - nr_overcommit_hugepages - nr_trim_pages (only if CONFIG_MMU=n) @@ -135,7 +137,7 @@ Contains, as a percentage of total available memory that contains free pages and reclaimable pages, the number of pages at which the background kernel flusher threads will start writing out dirty data. -The total avaiable memory is not equal to total system memory. +The total available memory is not equal to total system memory. ============================================================== @@ -170,7 +172,7 @@ Contains, as a percentage of total available memory that contains free pages and reclaimable pages, the number of pages at which a process which is generating disk writes will itself start writing out dirty data. -The total avaiable memory is not equal to total system memory. +The total available memory is not equal to total system memory. ============================================================== @@ -485,6 +487,33 @@ against future potential kernel bugs. ============================================================== +mmap_rnd_bits: + +This value can be used to select the number of bits to use to +determine the random offset to the base address of vma regions +resulting from mmap allocations on architectures which support +tuning address space randomization. This value will be bounded +by the architecture's minimum and maximum supported values. 
+ +This value can be changed after boot using the +/proc/sys/vm/mmap_rnd_bits tunable + +============================================================== + +mmap_rnd_compat_bits: + +This value can be used to select the number of bits to use to +determine the random offset to the base address of vma regions +resulting from mmap allocations for applications run in +compatibility mode on architectures which support tuning address +space randomization. This value will be bounded by the +architecture's minimum and maximum supported values. + +This value can be changed after boot using the +/proc/sys/vm/mmap_rnd_compat_bits tunable + +============================================================== + nr_hugepages Change the minimum size of the hugepage pool. diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt index 10f062ea6..8c745c893 100644 --- a/Documentation/thermal/sysfs-api.txt +++ b/Documentation/thermal/sysfs-api.txt @@ -364,6 +364,7 @@ integral_cutoff accumulates error when temperature is above the desired temperature trip point. For more information see Documentation/thermal/power_allocator.txt + Unit: millidegree Celsius RW, Optional slope diff --git a/Documentation/timers/hpet.txt b/Documentation/timers/hpet.txt index 767392ffd..a484d2c10 100644 --- a/Documentation/timers/hpet.txt +++ b/Documentation/timers/hpet.txt @@ -1,9 +1,7 @@ High Precision Event Timer Driver for Linux The High Precision Event Timer (HPET) hardware follows a specification -by Intel and Microsoft which can be found at - - http://www.intel.com/hardwaredesign/hpetspec_1.pdf +by Intel and Microsoft, revision 1. Each HPET has one fixed-rate counter (at 10+ MHz, hence "High Precision") and up to 32 comparators. 
Normally three or more comparators are provided, diff --git a/Documentation/trace/events-msr.txt b/Documentation/trace/events-msr.txt new file mode 100644 index 000000000..78c383bf0 --- /dev/null +++ b/Documentation/trace/events-msr.txt @@ -0,0 +1,37 @@ + +The x86 kernel supports tracing most MSR (Model Specific Register) accesses. +To see the definition of the MSRs on Intel systems please see the SDM +at http://www.intel.com/sdm (Volume 3) + +Available trace points: + +/sys/kernel/debug/tracing/events/msr/ + +Trace MSR reads + +read_msr + +msr: MSR number +val: Value read +failed: 1 if the access failed, otherwise 0 + + +Trace MSR writes + +write_msr + +msr: MSR number +val: Value written +failed: 1 if the access failed, otherwise 0 + + +Trace RDPMC in kernel + +rdpmc + +The trace data can be post processed with the postprocess/decode_msr.py script + +cat /sys/kernel/debug/tracing/trace | decode_msr.py /usr/src/linux/include/asm/msr-index.h + +to add symbolic MSR names. + diff --git a/Documentation/trace/postprocess/decode_msr.py b/Documentation/trace/postprocess/decode_msr.py new file mode 100644 index 000000000..0ab40e0db --- /dev/null +++ b/Documentation/trace/postprocess/decode_msr.py @@ -0,0 +1,37 @@ +#!/usr/bin/python +# add symbolic names to read_msr / write_msr in trace +# decode_msr msr-index.h < trace +import sys +import re + +msrs = dict() + +with open(sys.argv[1] if len(sys.argv) > 1 else "msr-index.h", "r") as f: + for j in f: + m = re.match(r'#define (MSR_\w+)\s+(0x[0-9a-fA-F]+)', j) + if m: + msrs[int(m.group(2), 16)] = m.group(1) + +extra_ranges = ( + ( "MSR_LASTBRANCH_%d_FROM_IP", 0x680, 0x69F ), + ( "MSR_LASTBRANCH_%d_TO_IP", 0x6C0, 0x6DF ), + ( "LBR_INFO_%d", 0xdc0, 0xddf ), +) + +for j in sys.stdin: + m = re.search(r'(read|write)_msr:\s+([0-9a-f]+)', j) + if m: + r = None + num = int(m.group(2), 16) + if num in msrs: + r = msrs[num] + else: + for er in extra_ranges: + if er[1] <= num <= er[2]: + r = er[0] % (num - er[1],) + break + if r: + j
= j.replace(" " + m.group(2), " " + r + "(" + m.group(2) + ")") + print j, + + diff --git a/Documentation/ubsan.txt b/Documentation/ubsan.txt new file mode 100644 index 000000000..f58215ef5 --- /dev/null +++ b/Documentation/ubsan.txt @@ -0,0 +1,84 @@ +Undefined Behavior Sanitizer - UBSAN + +Overview +-------- + +UBSAN is a runtime undefined behaviour checker. + +UBSAN uses compile-time instrumentation to catch undefined behavior (UB). +Compiler inserts code that perform certain kinds of checks before operations +that may cause UB. If check fails (i.e. UB detected) __ubsan_handle_* +function called to print error message. + +GCC has that feature since 4.9.x [1] (see -fsanitize=undefined option and +its suboptions). GCC 5.x has more checkers implemented [2]. + +Report example +--------------- + + ================================================================================ + UBSAN: Undefined behaviour in ../include/linux/bitops.h:110:33 + shift exponent 32 is to large for 32-bit type 'unsigned int' + CPU: 0 PID: 0 Comm: swapper Not tainted 4.4.0-rc1+ #26 + 0000000000000000 ffffffff82403cc8 ffffffff815e6cd6 0000000000000001 + ffffffff82403cf8 ffffffff82403ce0 ffffffff8163a5ed 0000000000000020 + ffffffff82403d78 ffffffff8163ac2b ffffffff815f0001 0000000000000002 + Call Trace: + [] dump_stack+0x45/0x5f + [] ubsan_epilogue+0xd/0x40 + [] __ubsan_handle_shift_out_of_bounds+0xeb/0x130 + [] ? radix_tree_gang_lookup_slot+0x51/0x150 + [] _mix_pool_bytes+0x1e6/0x480 + [] ? dmi_walk_early+0x48/0x5c + [] add_device_randomness+0x61/0x130 + [] ? dmi_save_one_device+0xaa/0xaa + [] dmi_walk_early+0x48/0x5c + [] dmi_scan_machine+0x278/0x4b4 + [] ? vprintk_default+0x1a/0x20 + [] ? early_idt_handler_array+0x120/0x120 + [] setup_arch+0x405/0xc2c + [] ? early_idt_handler_array+0x120/0x120 + [] start_kernel+0x83/0x49a + [] ? 
early_idt_handler_array+0x120/0x120 + [] x86_64_start_reservations+0x2a/0x2c + [] x86_64_start_kernel+0x16b/0x17a + ================================================================================ + +Usage +----- + +To enable UBSAN configure kernel with: + + CONFIG_UBSAN=y + +and to check the entire kernel: + + CONFIG_UBSAN_SANITIZE_ALL=y + +To enable instrumentation for specific files or directories, add a line +similar to the following to the respective kernel Makefile: + + For a single file (e.g. main.o): + UBSAN_SANITIZE_main.o := y + + For all files in one directory: + UBSAN_SANITIZE := y + +To exclude files from being instrumented even if +CONFIG_UBSAN_SANITIZE_ALL=y, use: + + UBSAN_SANITIZE_main.o := n + and: + UBSAN_SANITIZE := n + +Detection of unaligned accesses controlled through the separate option - +CONFIG_UBSAN_ALIGNMENT. It's off by default on architectures that support +unaligned accesses (CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y). One could +still enable it in config, just note that it will produce a lot of UBSAN +reports. + +References +---------- + +[1] - https://gcc.gnu.org/onlinedocs/gcc-4.9.0/gcc/Debugging-Options.html +[2] - https://gcc.gnu.org/onlinedocs/gcc/Debugging-Options.html diff --git a/Documentation/usb/chipidea.txt b/Documentation/usb/chipidea.txt index 3f848c1f2..05f735a1b 100644 --- a/Documentation/usb/chipidea.txt +++ b/Documentation/usb/chipidea.txt @@ -7,8 +7,8 @@ with 2 Freescale i.MX6Q sabre SD boards. --------------------------------------- Select CONFIG_USB_OTG_FSM, rebuild kernel Image and modules. 
If you want to check some internal variables for otg fsm, -select CONFIG_USB_CHIPIDEA_DEBUG, there are 2 files which -can show otg fsm variables and some controller registers value: +mount debugfs, there are 2 files which can show otg fsm +variables and some controller registers value: cat /sys/kernel/debug/ci_hdrc.0/otg cat /sys/kernel/debug/ci_hdrc.0/registers diff --git a/Documentation/usb/gadget-testing.txt b/Documentation/usb/gadget-testing.txt index b24d3ef89..581960574 100644 --- a/Documentation/usb/gadget-testing.txt +++ b/Documentation/usb/gadget-testing.txt @@ -434,7 +434,7 @@ On host: serialc -v -p -i -a1 -s1024 \ where seriald and serialc are Felipe's utilities found here: -https://git.gitorious.org/usb/usb-tools.git master +https://github.com/felipebalbi/usb-tools.git master 12. PHONET function =================== @@ -579,6 +579,8 @@ The SOURCESINK function provides these attributes in its function directory: isoc_mult - 0..2 (hs/ss only) isoc_maxburst - 0..15 (ss only) bulk_buflen - buffer length + bulk_qlen - depth of queue for bulk + iso_qlen - depth of queue for iso Testing the SOURCESINK function ------------------------------- diff --git a/Documentation/video4linux/API.html b/Documentation/video4linux/API.html index 256f8efa9..eaf948cf1 100644 --- a/Documentation/video4linux/API.html +++ b/Documentation/video4linux/API.html @@ -9,7 +9,7 @@
- V4L original API + V4L original API Obsoleted by V4L2 API diff --git a/Documentation/video4linux/CARDLIST.em28xx b/Documentation/video4linux/CARDLIST.em28xx index 9e57ce43c..67209998a 100644 --- a/Documentation/video4linux/CARDLIST.em28xx +++ b/Documentation/video4linux/CARDLIST.em28xx @@ -41,8 +41,8 @@ 40 -> Plextor ConvertX PX-TV100U (em2861) [093b:a005] 41 -> Kworld 350 U DVB-T (em2870) [eb1a:e350] 42 -> Kworld 355 U DVB-T (em2870) [eb1a:e355,eb1a:e357,eb1a:e359] - 43 -> Terratec Cinergy T XS (em2870) [0ccd:0043] - 44 -> Terratec Cinergy T XS (MT2060) (em2870) + 43 -> Terratec Cinergy T XS (em2870) + 44 -> Terratec Cinergy T XS (MT2060) (em2870) [0ccd:0043] 45 -> Pinnacle PCTV DVB-T (em2870) 46 -> Compro, VideoMate U3 (em2870) [185b:2870] 47 -> KWorld DVB-T 305U (em2880) [eb1a:e305] diff --git a/Documentation/video4linux/fimc.txt b/Documentation/video4linux/fimc.txt index e0c6b8bc4..4fab231be 100644 --- a/Documentation/video4linux/fimc.txt +++ b/Documentation/video4linux/fimc.txt @@ -58,7 +58,7 @@ Not currently supported: 4.1. Media device interface The driver supports Media Controller API as defined at -http://linuxtv.org/downloads/v4l-dvb-apis/media_common.html +https://linuxtv.org/downloads/v4l-dvb-apis/media_common.html The media device driver name is "SAMSUNG S5P FIMC". The purpose of this interface is to allow changing assignment of FIMC instances @@ -83,11 +83,11 @@ undefined behaviour. 4.3. Capture video node The driver supports V4L2 Video Capture Interface as defined at: -http://linuxtv.org/downloads/v4l-dvb-apis/devices.html +https://linuxtv.org/downloads/v4l-dvb-apis/devices.html At the capture and mem-to-mem video nodes only the multi-planar API is supported. For more details see: -http://linuxtv.org/downloads/v4l-dvb-apis/planar-apis.html +https://linuxtv.org/downloads/v4l-dvb-apis/planar-apis.html 4.4. 
Camera capture subdevs diff --git a/Documentation/video4linux/omap4_camera.txt b/Documentation/video4linux/omap4_camera.txt index 25d9b40a4..a6734aa77 100644 --- a/Documentation/video4linux/omap4_camera.txt +++ b/Documentation/video4linux/omap4_camera.txt @@ -47,7 +47,7 @@ Tested platforms File list --------- drivers/staging/media/omap4iss/ -include/media/omap4iss.h +include/linux/platform_data/media/omap4iss.h References ---------- diff --git a/Documentation/video4linux/si4713.txt b/Documentation/video4linux/si4713.txt index 2e7392a4f..2ddc6b095 100644 --- a/Documentation/video4linux/si4713.txt +++ b/Documentation/video4linux/si4713.txt @@ -157,7 +157,7 @@ int main (int argc, char *argv[]) } The struct si4713_rnl and SI4713_IOC_MEASURE_RNL are defined under -include/media/si4713.h. +include/linux/platform_data/media/si4713.h. Stereo/Mono and RDS subchannels =============================== diff --git a/Documentation/video4linux/v4l2-framework.txt b/Documentation/video4linux/v4l2-framework.txt index 75d5c18d6..fa41608ab 100644 --- a/Documentation/video4linux/v4l2-framework.txt +++ b/Documentation/video4linux/v4l2-framework.txt @@ -295,16 +295,16 @@ module owner. This is done for you if you use the i2c helper functions. If integration with the media framework is needed, you must initialize the media_entity struct embedded in the v4l2_subdev struct (entity field) by -calling media_entity_init(): +calling media_entity_pads_init(), if the entity has pads: struct media_pad *pads = &my_sd->pads; int err; - err = media_entity_init(&sd->entity, npads, pads, 0); + err = media_entity_pads_init(&sd->entity, npads, pads); The pads array must have been previously initialized. There is no need to -manually set the struct media_entity type and name fields, but the revision -field must be initialized if needed. +manually set the struct media_entity function and name fields, but the +revision field must be initialized if needed. 
A reference to the entity will be automatically acquired/released when the subdev device node (if any) is opened/closed. @@ -695,12 +695,12 @@ difference is that the inode argument is omitted since it is never used. If integration with the media framework is needed, you must initialize the media_entity struct embedded in the video_device struct (entity field) by -calling media_entity_init(): +calling media_entity_pads_init(): struct media_pad *pad = &my_vdev->pad; int err; - err = media_entity_init(&vdev->entity, 1, pad, 0); + err = media_entity_pads_init(&vdev->entity, 1, pad); The pads array must have been previously initialized. There is no need to manually set the struct media_entity type and name fields. diff --git a/Documentation/video4linux/v4l2-pci-skeleton.c b/Documentation/video4linux/v4l2-pci-skeleton.c index 95ae82860..79af0c041 100644 --- a/Documentation/video4linux/v4l2-pci-skeleton.c +++ b/Documentation/video4linux/v4l2-pci-skeleton.c @@ -163,11 +163,10 @@ static irqreturn_t skeleton_irq(int irq, void *dev_id) * minimum number: many DMA engines need a minimum of 2 buffers in the * queue and you need to have another available for userspace processing. */ -static int queue_setup(struct vb2_queue *vq, const void *parg, +static int queue_setup(struct vb2_queue *vq, unsigned int *nbuffers, unsigned int *nplanes, unsigned int sizes[], void *alloc_ctxs[]) { - const struct v4l2_format *fmt = parg; struct skeleton *skel = vb2_get_drv_priv(vq); skel->field = skel->format.field; @@ -183,12 +182,12 @@ static int queue_setup(struct vb2_queue *vq, const void *parg, if (vq->num_buffers + *nbuffers < 3) *nbuffers = 3 - vq->num_buffers; + alloc_ctxs[0] = skel->alloc_ctx; - if (fmt && fmt->fmt.pix.sizeimage < skel->format.sizeimage) - return -EINVAL; + if (*nplanes) + return sizes[0] < skel->format.sizeimage ? -EINVAL : 0; *nplanes = 1; - sizes[0] = fmt ? 
fmt->fmt.pix.sizeimage : skel->format.sizeimage; - alloc_ctxs[0] = skel->alloc_ctx; + sizes[0] = skel->format.sizeimage; return 0; } @@ -509,7 +508,7 @@ static int skeleton_s_dv_timings(struct file *file, void *_fh, return -EINVAL; /* Return 0 if the new timings are the same as the current timings. */ - if (v4l2_match_dv_timings(timings, &skel->timings, 0)) + if (v4l2_match_dv_timings(timings, &skel->timings, 0, false)) return 0; /* diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 092ee9fba..07e4cdf02 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1451,6 +1451,7 @@ struct kvm_irq_routing_entry { struct kvm_irq_routing_irqchip irqchip; struct kvm_irq_routing_msi msi; struct kvm_irq_routing_s390_adapter adapter; + struct kvm_irq_routing_hv_sint hv_sint; __u32 pad[8]; } u; }; @@ -1459,6 +1460,7 @@ struct kvm_irq_routing_entry { #define KVM_IRQ_ROUTING_IRQCHIP 1 #define KVM_IRQ_ROUTING_MSI 2 #define KVM_IRQ_ROUTING_S390_ADAPTER 3 +#define KVM_IRQ_ROUTING_HV_SINT 4 No flags are specified so far, the corresponding field must be set to zero. @@ -1482,6 +1484,10 @@ struct kvm_irq_routing_s390_adapter { __u32 adapter_id; }; +struct kvm_irq_routing_hv_sint { + __u32 vcpu; + __u32 sint; +}; 4.53 KVM_ASSIGN_SET_MSIX_NR (deprecated) @@ -3019,7 +3025,7 @@ len must be a multiple of sizeof(struct kvm_s390_irq). It must be > 0 and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq), which is the maximum number of possibly pending cpu-local interrupts. -4.90 KVM_SMI +4.96 KVM_SMI Capability: KVM_CAP_X86_SMM Architectures: x86 @@ -3331,6 +3337,28 @@ the userspace IOAPIC should process the EOI and retrigger the interrupt if it is still asserted. Vector is the LAPIC interrupt vector for which the EOI was received. 
+ struct kvm_hyperv_exit { +#define KVM_EXIT_HYPERV_SYNIC 1 + __u32 type; + union { + struct { + __u32 msr; + __u64 control; + __u64 evt_page; + __u64 msg_page; + } synic; + } u; + }; + /* KVM_EXIT_HYPERV */ + struct kvm_hyperv_exit hyperv; +Indicates that the VCPU exits into userspace to process some tasks +related to Hyper-V emulation. +Valid values for 'type' are: + KVM_EXIT_HYPERV_SYNIC -- synchronously notify user-space about +Hyper-V SynIC state change. Notification is used to remap SynIC +event/message pages and to enable/disable SynIC messages/events processing +in userspace. + /* Fix the size of the union. */ char padding[256]; }; @@ -3685,3 +3713,16 @@ available, means that that the kernel has an implementation of the H_RANDOM hypercall backed by a hardware random-number generator. If present, the kernel H_RANDOM handler can be enabled for guest use with the KVM_CAP_PPC_ENABLE_HCALL capability. + +8.2 KVM_CAP_HYPERV_SYNIC + +Architectures: x86 +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that the kernel has an implementation of the +Hyper-V Synthetic interrupt controller(SynIC). Hyper-V SynIC is +used to support Windows Hyper-V based guest paravirt drivers(VMBus). + +In order to use SynIC, it has to be activated by setting this +capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this +will disable the use of APIC hardware virtualization even if supported +by the CPU, as it's incompatible with SynIC auto-EOI behavior. diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt index 2d09d1ed8..f083a168e 100644 --- a/Documentation/virtual/kvm/devices/vm.txt +++ b/Documentation/virtual/kvm/devices/vm.txt @@ -37,7 +37,8 @@ Returns: -EFAULT if the given address is not accessible Allows userspace to query the actual limit and set a new limit for the maximum guest memory size.
The limit will be rounded up to 2048 MB, 4096 GB, 8192 TB respectively, as this limit is governed by -the number of page table levels. +the number of page table levels. In the case that there is no limit we will set +the limit to KVM_S390_NO_MEM_LIMIT (U64_MAX). 2. GROUP: KVM_S390_VM_CPU_MODEL Architectures: s390 diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt index 3a4d681c3..c81731096 100644 --- a/Documentation/virtual/kvm/mmu.txt +++ b/Documentation/virtual/kvm/mmu.txt @@ -203,10 +203,10 @@ Shadow pages contain the following information: page cannot be destroyed. See role.invalid. parent_ptes: The reverse mapping for the pte/ptes pointing at this page's spt. If - parent_ptes bit 0 is zero, only one spte points at this pages and + parent_ptes bit 0 is zero, only one spte points at this page and parent_ptes points at this single spte, otherwise, there exists multiple sptes pointing at this page and (parent_ptes & ~0x1) points at a data - structure with a list of parent_ptes. + structure with a list of parent sptes. unsync: If true, then the translations in this page may not match the guest's translation. This is equivalent to the state of the tlb when a pte is @@ -358,7 +358,8 @@ In the first case there are two additional complications: - if CR4.SMEP is enabled: since we've turned the page into a kernel page, the kernel may now execute it. We handle this by also setting spte.nx. If we get a user fetch or read fault, we'll change spte.u=1 and - spte.nx=gpte.nx back. + spte.nx=gpte.nx back. For this to work, KVM forces EFER.NX to 1 when + shadow paging is in use. - if CR4.SMAP is disabled: since the page has been changed to a kernel page, it can not be reused when CR4.SMAP is enabled. We set CR4.SMAP && !CR0.WP into shadow page's role to avoid this case. 
Note, diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt index 699d8ea5c..f0d340959 100644 --- a/Documentation/vm/slub.txt +++ b/Documentation/vm/slub.txt @@ -8,7 +8,7 @@ SLUB can enable debugging only for selected slabs in order to avoid an impact on overall system performance which may make a bug more difficult to find. -In order to switch debugging on one can add a option "slub_debug" +In order to switch debugging on one can add an option "slub_debug" to the kernel command line. That will enable full debugging for all slabs. diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index 8a282687e..21cf34f3d 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -35,10 +35,10 @@ miss is going to run faster. == Design == -- "graceful fallback": mm components which don't have transparent - hugepage knowledge fall back to breaking a transparent hugepage and - working on the regular pages and their respective regular pmd/pte - mappings +- "graceful fallback": mm components which don't have transparent hugepage + knowledge fall back to breaking huge pmd mapping into table of ptes and, + if necessary, split a transparent hugepage. Therefore these components + can continue working on the regular pages or regular pte mappings. - if a hugepage allocation fails because of memory fragmentation, regular pages should be gracefully allocated instead and mixed in @@ -221,9 +221,18 @@ thp_collapse_alloc_failed is incremented if khugepaged found a range of pages that should be collapsed into one huge page but failed the allocation. -thp_split is incremented every time a huge page is split into base +thp_split_page is incremented every time a huge page is split into base pages. This can happen for a variety of reasons but a common reason is that a huge page is old and is being reclaimed. + This action implies splitting all PMD the page mapped with. 
+ +thp_split_page_failed is incremented if kernel fails to split huge + page. This can happen if the page was pinned by somebody. + +thp_split_pmd is incremented every time a PMD is split into table of PTEs. + This can happen, for instance, when application calls mprotect() or + munmap() on part of huge page. It doesn't split huge page, only + page table entry. thp_zero_page_alloc is incremented every time a huge zero page is successfully allocated. It includes allocations which where @@ -274,10 +283,8 @@ is complete, so they won't ever notice the fact the page is huge. But if any driver is going to mangle over the page structure of the tail page (like for checking page->mapping or other bits that are relevant for the head page and not the tail page), it should be updated to jump -to check head page instead (while serializing properly against -split_huge_page() to avoid the head and tail pages to disappear from -under it, see the futex code to see an example of that, hugetlbfs also -needed special handling in futex code for similar reasons). +to check head page instead. Taking reference on any head/tail page would +prevent page from being split by anyone. NOTE: these aren't new constraints to the GUP API, and they match the same constrains that applies to hugetlbfs too, so any driver capable @@ -312,9 +319,9 @@ unaffected. libhugetlbfs will also work fine as usual. == Graceful fallback == Code walking pagetables but unware about huge pmds can simply call -split_huge_page_pmd(vma, addr, pmd) where the pmd is the one returned by +split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by pmd_offset. It's trivial to make the code transparent hugepage aware -by just grepping for "pmd_offset" and adding split_huge_page_pmd where +by just grepping for "pmd_offset" and adding split_huge_pmd where missing after pmd_offset returns the pmd.
Thanks to the graceful fallback design, with a one liner change, you can avoid to write hundred if not thousand of lines of complex code to make your code @@ -323,7 +330,8 @@ hugepage aware. If you're not walking pagetables but you run into a physical hugepage but you can't handle it natively in your code, you can split it by calling split_huge_page(page). This is what the Linux VM does before -it tries to swapout the hugepage for example. +it tries to swapout the hugepage for example. split_huge_page() can fail +if the page is pinned and you must handle this correctly. Example to make mremap.c transparent hugepage aware with a one liner change: @@ -335,14 +343,14 @@ diff --git a/mm/mremap.c b/mm/mremap.c return NULL; pmd = pmd_offset(pud, addr); -+ split_huge_page_pmd(vma, addr, pmd); ++ split_huge_pmd(vma, pmd, addr); if (pmd_none_or_clear_bad(pmd)) return NULL; == Locking in hugepage aware code == We want as much code as possible hugepage aware, as calling -split_huge_page() or split_huge_page_pmd() has a cost. +split_huge_page() or split_huge_pmd() has a cost. To make pagetable walks huge pmd aware, all you need to do is to call pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the @@ -351,47 +359,80 @@ created from under you by khugepaged (khugepaged collapse_huge_page takes the mmap_sem in write mode in addition to the anon_vma lock). If pmd_trans_huge returns false, you just fallback in the old code paths. If instead pmd_trans_huge returns true, you have to take the -mm->page_table_lock and re-run pmd_trans_huge. Taking the -page_table_lock will prevent the huge pmd to be converted into a -regular pmd from under you (split_huge_page can run in parallel to the +page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the +page table lock will prevent the huge pmd to be converted into a +regular pmd from under you (split_huge_pmd can run in parallel to the pagetable walk). 
If the second pmd_trans_huge returns false, you -should just drop the page_table_lock and fallback to the old code as -before. Otherwise you should run pmd_trans_splitting on the pmd. In -case pmd_trans_splitting returns true, it means split_huge_page is -already in the middle of splitting the page. So if pmd_trans_splitting -returns true it's enough to drop the page_table_lock and call -wait_split_huge_page and then fallback the old code paths. You are -guaranteed by the time wait_split_huge_page returns, the pmd isn't -huge anymore. If pmd_trans_splitting returns false, you can proceed to -process the huge pmd and the hugepage natively. Once finished you can -drop the page_table_lock. - -== compound_lock, get_user_pages and put_page == +should just drop the page table lock and fallback to the old code as +before. Otherwise you can proceed to process the huge pmd and the +hugepage natively. Once finished you can drop the page table lock. + +== Refcounts and transparent huge pages == + +Refcounting on THP is mostly consistent with refcounting on other compound +pages: + + - get_page()/put_page() and GUP operate in head page's ->_count. + + - ->_count in tail pages is always zero: get_page_unless_zero() never + succeed on tail pages. + + - map/unmap of the pages with PTE entry increment/decrement ->_mapcount + on relevant sub-page of the compound page. + + - map/unmap of the whole compound page accounted in compound_mapcount + (stored in first tail page). + +PageDoubleMap() indicates that ->_mapcount in all subpages is offset up by one. +This additional reference is required to get race-free detection of unmap of +subpages when we have them mapped with both PMDs and PTEs. + +This is optimization required to lower overhead of per-subpage mapcount +tracking. The alternative is alter ->_mapcount in all subpages on each +map/unmap of the whole compound page. + +We set PG_double_map when a PMD of the page got split for the first time, +but still have PMD mapping. 
The additional references go away with last +compound_mapcount. split_huge_page internally has to distribute the refcounts in the head -page to the tail pages before clearing all PG_head/tail bits from the -page structures. It can do that easily for refcounts taken by huge pmd -mappings. But the GUI API as created by hugetlbfs (that returns head -and tail pages if running get_user_pages on an address backed by any -hugepage), requires the refcount to be accounted on the tail pages and -not only in the head pages, if we want to be able to run -split_huge_page while there are gup pins established on any tail -page. Failure to be able to run split_huge_page if there's any gup pin -on any tail page, would mean having to split all hugepages upfront in -get_user_pages which is unacceptable as too many gup users are -performance critical and they must work natively on hugepages like -they work natively on hugetlbfs already (hugetlbfs is simpler because -hugetlbfs pages cannot be split so there wouldn't be requirement of -accounting the pins on the tail pages for hugetlbfs). If we wouldn't -account the gup refcounts on the tail pages during gup, we won't know -anymore which tail page is pinned by gup and which is not while we run -split_huge_page. But we still have to add the gup pin to the head page -too, to know when we can free the compound page in case it's never -split during its lifetime. That requires changing not just -get_page, but put_page as well so that when put_page runs on a tail -page (and only on a tail page) it will find its respective head page, -and then it will decrease the head page refcount in addition to the -tail page refcount. To obtain a head page reliably and to decrease its -refcount without race conditions, put_page has to serialize against -__split_huge_page_refcount using a special per-page lock called -compound_lock. +page to the tail pages before clearing all PG_head/tail bits from the page +structures.
It can be done easily for refcounts taken by page table +entries. But we don't have enough information on how to distribute any +additional pins (i.e. from get_user_pages). split_huge_page() fails any +requests to split pinned huge page: it expects page count to be equal to +sum of mapcount of all sub-pages plus one (split_huge_page caller must +have reference for head page). + +split_huge_page uses migration entries to stabilize page->_count and +page->_mapcount. + +We are safe against physical memory scanners too: the only legitimate way +scanner can get reference to a page is get_page_unless_zero(). + +All tail pages have zero ->_count until atomic_add(). It prevents scanner +from getting reference to tail page up to the point. After the atomic_add() +we don't care about ->_count value. We already know how many references +we should uncharge from head page. + +For head page get_page_unless_zero() will succeed and we don't mind. It's +clear where reference should go after split: it will stay on head page. + +Note that split_huge_pmd() doesn't have any limitation on refcounting: +pmd can be split at any point and never fails. + +== Partial unmap and deferred_split_huge_page() == + +Unmapping part of THP (with munmap() or other way) is not going to free +memory immediately. Instead, we detect that a subpage of THP is not in use +in page_remove_rmap() and queue the THP for splitting if memory pressure +comes. Splitting will free up unused subpages. + +Splitting the page right away is not an option due to locking context in +the place where we can detect partial unmap. It also might be +counterproductive since in many cases partial unmap happens during +exit(2) if a THP crosses VMA boundary. + +Function deferred_split_huge_page() is used to queue page for splitting. +The splitting itself will happen when we get memory pressure via shrinker +interface.
diff --git a/Documentation/watchdog/watchdog-kernel-api.txt b/Documentation/watchdog/watchdog-kernel-api.txt index d8b0d3367..55120a055 100644 --- a/Documentation/watchdog/watchdog-kernel-api.txt +++ b/Documentation/watchdog/watchdog-kernel-api.txt @@ -44,17 +44,18 @@ The watchdog device structure looks like this: struct watchdog_device { int id; - struct cdev cdev; - struct device *dev; struct device *parent; + const struct attribute_group **groups; const struct watchdog_info *info; const struct watchdog_ops *ops; unsigned int bootstatus; unsigned int timeout; unsigned int min_timeout; unsigned int max_timeout; + struct notifier_block reboot_nb; + struct notifier_block restart_nb; void *driver_data; - struct mutex lock; + struct watchdog_core_data *wd_data; unsigned long status; struct list_head deferred; }; @@ -64,27 +65,32 @@ It contains following fields: /dev/watchdog0 cdev (dynamic major, minor 0) as well as the old /dev/watchdog miscdev. The id is set automatically when calling watchdog_register_device. -* cdev: cdev for the dynamic /dev/watchdog device nodes. This - field is also populated by watchdog_register_device. -* dev: device under the watchdog class (created by watchdog_register_device). * parent: set this to the parent device (or NULL) before calling watchdog_register_device. +* groups: List of sysfs attribute groups to create when creating the watchdog + device. * info: a pointer to a watchdog_info structure. This structure gives some additional information about the watchdog timer itself. (Like it's unique name) * ops: a pointer to the list of watchdog operations that the watchdog supports. * timeout: the watchdog timer's timeout value (in seconds). * min_timeout: the watchdog timer's minimum timeout value (in seconds). * max_timeout: the watchdog timer's maximum timeout value (in seconds). +* reboot_nb: notifier block that is registered for reboot notifications, for + internal use only. 
If the driver calls watchdog_stop_on_reboot, watchdog core + will stop the watchdog on such notifications. +* restart_nb: notifier block that is registered for machine restart, for + internal use only. If a watchdog is capable of restarting the machine, it + should define ops->restart. Priority can be changed through + watchdog_set_restart_priority. * bootstatus: status of the device after booting (reported with watchdog WDIOF_* status bits). * driver_data: a pointer to the drivers private data of a watchdog device. This data should only be accessed via the watchdog_set_drvdata and watchdog_get_drvdata routines. -* lock: Mutex for WatchDog Timer Driver Core internal use only. +* wd_data: a pointer to watchdog core internal data. * status: this field contains a number of status bits that give extra information about the status of the device (Like: is the watchdog timer - running/active, is the nowayout bit set, is the device opened via - the /dev/watchdog interface or not, ...). + running/active, or is the nowayout bit set). * deferred: entry in wtd_deferred_reg_list which is used to register early initialized watchdogs. @@ -100,8 +106,9 @@ struct watchdog_ops { unsigned int (*status)(struct watchdog_device *); int (*set_timeout)(struct watchdog_device *, unsigned int); unsigned int (*get_timeleft)(struct watchdog_device *); - void (*ref)(struct watchdog_device *); - void (*unref)(struct watchdog_device *); + int (*restart)(struct watchdog_device *); + void (*ref)(struct watchdog_device *) __deprecated; + void (*unref)(struct watchdog_device *) __deprecated; long (*ioctl)(struct watchdog_device *, unsigned int, unsigned long); }; @@ -110,20 +117,6 @@ driver's operations. This module owner will be used to lock the module when the watchdog is active. (This to avoid a system crash when you unload the module and /dev/watchdog is still open). 
-If the watchdog_device struct is dynamically allocated, just locking the module -is not enough and a driver also needs to define the ref and unref operations to -ensure the structure holding the watchdog_device does not go away. - -The simplest (and usually sufficient) implementation of this is to: -1) Add a kref struct to the same structure which is holding the watchdog_device -2) Define a release callback for the kref which frees the struct holding both -3) Call kref_init on this kref *before* calling watchdog_register_device() -4) Define a ref operation calling kref_get on this kref -5) Define a unref operation calling kref_put on this kref -6) When it is time to cleanup: - * Do not kfree() the struct holding both, the last kref_put will do this! - * *After* calling watchdog_unregister_device() call kref_put on the kref - Some operations are mandatory and some are optional. The mandatory operations are: * start: this is a pointer to the routine that starts the watchdog timer @@ -164,34 +157,23 @@ they are supported. These optional routines/operations are: (Note: the WDIOF_SETTIMEOUT needs to be set in the options field of the watchdog's info structure). * get_timeleft: this routines returns the time that's left before a reset. -* ref: the operation that calls kref_get on the kref of a dynamically - allocated watchdog_device struct. -* unref: the operation that calls kref_put on the kref of a dynamically - allocated watchdog_device struct. +* restart: this routine restarts the machine. It returns 0 on success or a + negative errno code for failure. * ioctl: if this routine is present then it will be called first before we do our own internal ioctl call handling. This routine should return -ENOIOCTLCMD if a command is not supported. The parameters that are passed to the ioctl call are: watchdog_device, cmd and arg. +The 'ref' and 'unref' operations are no longer used and deprecated. 
+ The status bits should (preferably) be set with the set_bit and clear_bit alike bit-operations. The status bits that are defined are: * WDOG_ACTIVE: this status bit indicates whether or not a watchdog timer device is active or not. When the watchdog is active after booting, then you should set this status bit (Note: when you register the watchdog timer device with this bit set, then opening /dev/watchdog will skip the start operation) -* WDOG_DEV_OPEN: this status bit shows whether or not the watchdog device - was opened via /dev/watchdog. - (This bit should only be used by the WatchDog Timer Driver Core). -* WDOG_ALLOW_RELEASE: this bit stores whether or not the magic close character - has been sent (so that we can support the magic close feature). - (This bit should only be used by the WatchDog Timer Driver Core). * WDOG_NO_WAY_OUT: this bit stores the nowayout setting for the watchdog. If this bit is set then the watchdog timer will not be able to stop. -* WDOG_UNREGISTERED: this bit gets set by the WatchDog Timer Driver Core - after calling watchdog_unregister_device, and then checked before calling - any watchdog_ops, so that you can be sure that no operations (other then - unref) will get called after unregister, even if userspace still holds a - reference to /dev/watchdog To set the WDOG_NO_WAY_OUT status bit (before registering your watchdog timer device) you can either: @@ -231,3 +213,18 @@ the device tree (if the module timeout parameter is invalid). Best practice is to set the default timeout value as timeout value in the watchdog_device and then use this function to set the user "preferred" timeout value. This routine returns zero on success and a negative errno code for failure. 
+ +To disable the watchdog on reboot, the user must call the following helper: + +static inline void watchdog_stop_on_reboot(struct watchdog_device *wdd); + +To change the priority of the restart handler the following helper should be +used: + +void watchdog_set_restart_priority(struct watchdog_device *wdd, int priority); + +User should follow the following guidelines for setting the priority: +* 0: should be called in last resort, has limited restart capabilities +* 128: default restart handler, use if no other handler is expected to be + available, and/or if restart is sufficient to restart the entire system +* 255: highest priority, will preempt all other restart handlers diff --git a/Documentation/watchdog/watchdog-parameters.txt b/Documentation/watchdog/watchdog-parameters.txt index 9f9ec9f76..4e4b6f10d 100644 --- a/Documentation/watchdog/watchdog-parameters.txt +++ b/Documentation/watchdog/watchdog-parameters.txt @@ -400,3 +400,7 @@ wm8350_wdt: nowayout: Watchdog cannot be stopped once started (default=kernel config parameter) ------------------------------------------------- +sun4v_wdt: +timeout_ms: Watchdog timeout in milliseconds (1..180000, default=60000) +nowayout: Watchdog cannot be stopped once started +------------------------------------------------- diff --git a/Documentation/zh_CN/video4linux/v4l2-framework.txt b/Documentation/zh_CN/video4linux/v4l2-framework.txt index 2b828e631..698660b7f 100644 --- a/Documentation/zh_CN/video4linux/v4l2-framework.txt +++ b/Documentation/zh_CN/video4linux/v4l2-framework.txt @@ -289,13 +289,13 @@ struct v4l2_subdev_ops { 然后,你必须用一个唯一的名字初始化 subdev->name,并初始化模块的 owner 域。若使用 i2c 辅助函数,这些都会帮你处理好。 -若需同媒体框架整合,你必须调用 media_entity_init() 初始化 v4l2_subdev +若需同媒体框架整合,你必须调用 media_entity_pads_init() 初始化 v4l2_subdev 结构体中的 media_entity 结构体(entity 域): struct media_pad *pads = &my_sd->pads; int err; - err = media_entity_init(&sd->entity, npads, pads, 0); + err = media_entity_pads_init(&sd->entity, npads, pads); pads 数组必须预先初始化。无须手动设置
media_entity 的 type 和 name 域,但如有必要,revision 域必须初始化。 @@ -596,13 +596,13 @@ void v4l2_disable_ioctl(struct video_device *vdev, unsigned int cmd); v4l2_file_operations 结构体是 file_operations 的一个子集。其主要 区别在于:因 inode 参数从未被使用,它将被忽略。 -如果需要与媒体框架整合,你必须通过调用 media_entity_init() 初始化 +如果需要与媒体框架整合,你必须通过调用 media_entity_pads_init() 初始化 嵌入在 video_device 结构体中的 media_entity(entity 域)结构体: struct media_pad *pad = &my_vdev->pad; int err; - err = media_entity_init(&vdev->entity, 1, pad, 0); + err = media_entity_pads_init(&vdev->entity, 1, pad); pads 数组必须预先初始化。没有必要手动设置 media_entity 的 type 和 name 域。 -- cgit v1.2.3-54-g00ecf