From 59914a02ccdeb88b6370d9a202f40435d5d04feb Mon Sep 17 00:00:00 2001 From: Gregor Kleen Date: Tue, 3 Aug 2021 17:49:13 +0200 Subject: stage-1: dereference secrets --- modules/stage-1/default.nix | 669 ++++++++++++++++++++++++++++++++++++++++ modules/stage-1/stage-1-init.sh | 638 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 1307 insertions(+) create mode 100644 modules/stage-1/default.nix create mode 100644 modules/stage-1/stage-1-init.sh (limited to 'modules') diff --git a/modules/stage-1/default.nix b/modules/stage-1/default.nix new file mode 100644 index 00000000..5a14584e --- /dev/null +++ b/modules/stage-1/default.nix @@ -0,0 +1,669 @@ +# This module builds the initial ramdisk, which contains an init +# script that performs the first stage of booting the system: it loads +# the modules necessary to mount the root file system, then calls the +# init in the root file system to start the second boot stage. + +{ config, lib, utils, pkgs, ... }: + +with lib; + +let + + udev = config.systemd.package; + + kernel-name = config.boot.kernelPackages.kernel.name or "kernel"; + + modulesTree = config.system.modulesTree.override { name = kernel-name + "-modules"; }; + firmware = config.hardware.firmware; + + + # Determine the set of modules that we need to mount the root FS. + modulesClosure = pkgs.makeModulesClosure { + rootModules = config.boot.initrd.availableKernelModules ++ config.boot.initrd.kernelModules; + kernel = modulesTree; + firmware = firmware; + allowMissing = false; + }; + + + # The initrd only has to mount `/` or any FS marked as necessary for + # booting (such as the FS containing `/nix/store`, or an FS needed for + # mounting `/`, like `/` on a loopback). 
+ fileSystems = filter utils.fsNeededForBoot config.system.build.fileSystems; + + # A utility for enumerating the shared-library dependencies of a program + findLibs = pkgs.buildPackages.writeShellScriptBin "find-libs" '' + set -euo pipefail + + declare -A seen + left=() + + patchelf="${pkgs.buildPackages.patchelf}/bin/patchelf" + + function add_needed { + rpath="$($patchelf --print-rpath $1)" + dir="$(dirname $1)" + for lib in $($patchelf --print-needed $1); do + left+=("$lib" "$rpath" "$dir") + done + } + + add_needed "$1" + + while [ ''${#left[@]} -ne 0 ]; do + next=''${left[0]} + rpath=''${left[1]} + ORIGIN=''${left[2]} + left=("''${left[@]:3}") + if [ -z ''${seen[$next]+x} ]; then + seen[$next]=1 + + # Ignore the dynamic linker which for some reason appears as a DT_NEEDED of glibc but isn't in glibc's RPATH. + case "$next" in + ld*.so.?) continue;; + esac + + IFS=: read -ra paths <<< $rpath + res= + for path in "''${paths[@]}"; do + path=$(eval "echo $path") + if [ -f "$path/$next" ]; then + res="$path/$next" + echo "$res" + add_needed "$res" + break + fi + done + if [ -z "$res" ]; then + echo "Couldn't satisfy dependency $next" >&2 + exit 1 + fi + fi + done + ''; + + # Some additional utilities needed in stage 1, like mount, lvm, fsck + # etc. We don't want to bring in all of those packages, so we just + # copy what we need. Instead of using statically linked binaries, + # we just copy what we need from Glibc and use patchelf to make it + # work. + extraUtils = pkgs.runCommandCC "extra-utils" + { nativeBuildInputs = [pkgs.buildPackages.nukeReferences]; + allowedReferences = [ "out" ]; # prevent accidents like glibc being included in the initrd + } + '' + set +o pipefail + + mkdir -p $out/bin $out/lib + ln -s $out/bin $out/sbin + + copy_bin_and_libs () { + [ -f "$out/bin/$(basename $1)" ] && rm "$out/bin/$(basename $1)" + cp -pdv $1 $out/bin + } + + # Copy BusyBox. 
+ for BIN in ${pkgs.busybox}/{s,}bin/*; do + copy_bin_and_libs $BIN + done + + # Copy some util-linux stuff. + copy_bin_and_libs ${pkgs.util-linux}/sbin/blkid + + # Copy dmsetup and lvm. + copy_bin_and_libs ${getBin pkgs.lvm2}/bin/dmsetup + copy_bin_and_libs ${getBin pkgs.lvm2}/bin/lvm + + # Add RAID mdadm tool. + copy_bin_and_libs ${pkgs.mdadm}/sbin/mdadm + copy_bin_and_libs ${pkgs.mdadm}/sbin/mdmon + + # Copy udev. + copy_bin_and_libs ${udev}/bin/udevadm + copy_bin_and_libs ${udev}/lib/systemd/systemd-sysctl + for BIN in ${udev}/lib/udev/*_id; do + copy_bin_and_libs $BIN + done + # systemd-udevd is only a symlink to udevadm these days + ln -sf udevadm $out/bin/systemd-udevd + + # Copy modprobe. + copy_bin_and_libs ${pkgs.kmod}/bin/kmod + ln -sf kmod $out/bin/modprobe + + # Copy resize2fs if any ext* filesystems are to be resized + ${optionalString (any (fs: fs.autoResize && (lib.hasPrefix "ext" fs.fsType)) fileSystems) '' + # We need resize2fs in the initrd. + copy_bin_and_libs ${pkgs.e2fsprogs}/sbin/resize2fs + ''} + + # Copy secrets if needed. + # + # TODO: move out to a separate script; see #85000. + ${optionalString (!config.boot.loader.supportsInitrdSecrets) + (concatStringsSep "\n" (mapAttrsToList (dest: source: + let source' = if source == null then dest else source; in + '' + # Quote the dirname result so destinations containing whitespace work, + # matching the quoted cp target below. + mkdir -p "$(dirname "$out/secrets/${dest}")" + # Some programs (e.g. ssh) don't like secrets to be + # symlinks, so we use `cp -L` here to match the + # behaviour when secrets are natively supported. + cp -Lr ${source'} "$out/secrets/${dest}" + '' + ) config.boot.initrd.secrets)) + } + + ${config.boot.initrd.extraUtilsCommands} + + # Copy ld manually since it isn't detected correctly + cp -pv ${pkgs.stdenv.cc.libc.out}/lib/ld*.so.? $out/lib + + # Copy all of the needed libraries + find $out/bin $out/lib -type f | while read BIN; do + echo "Copying libs for executable $BIN" + for LIB in $(${findLibs}/bin/find-libs $BIN); do + TGT="$out/lib/$(basename $LIB)" + if [ !
-f "$TGT" ]; then + SRC="$(readlink -e $LIB)" + cp -pdv "$SRC" "$TGT" + fi + done + done + + # Strip binaries further than normal. + chmod -R u+w $out + stripDirs "$STRIP" "lib bin" "-s" + + # Run patchelf to make the programs refer to the copied libraries. + find $out/bin $out/lib -type f | while read i; do + if ! test -L $i; then + nuke-refs -e $out $i + fi + done + + find $out/bin -type f | while read i; do + if ! test -L $i; then + echo "patching $i..." + patchelf --set-interpreter $out/lib/ld*.so.? --set-rpath $out/lib $i || true + fi + done + + if [ -z "${toString (pkgs.stdenv.hostPlatform != pkgs.stdenv.buildPlatform)}" ]; then + # Make sure that the patchelf'ed binaries still work. + echo "testing patched programs..." + $out/bin/ash -c 'echo hello world' | grep "hello world" + export LD_LIBRARY_PATH=$out/lib + $out/bin/mount --help 2>&1 | grep -q "BusyBox" + $out/bin/blkid -V 2>&1 | grep -q 'libblkid' + $out/bin/udevadm --version + $out/bin/dmsetup --version 2>&1 | tee -a log | grep -q "version:" + LVM_SYSTEM_DIR=$out $out/bin/lvm version 2>&1 | tee -a log | grep -q "LVM" + $out/bin/mdadm --version + + ${config.boot.initrd.extraUtilsCommandsTest} + fi + ''; # */ + + + # Networkd link files are used early by udev to set up interfaces early. + # This must be done in stage 1 to avoid race conditions between udev and + # network daemons. 
+ linkUnits = pkgs.runCommand "link-units" { + allowedReferences = [ extraUtils ]; + preferLocalBuild = true; + } ('' + mkdir -p $out + cp -v ${udev}/lib/systemd/network/*.link $out/ + '' + ( + let + links = filterAttrs (n: v: hasSuffix ".link" n) config.systemd.network.units; + files = mapAttrsToList (n: v: "${v.unit}/${n}") links; + in + concatMapStringsSep "\n" (file: "cp -v ${file} $out/") files + )); + + udevRules = pkgs.runCommand "udev-rules" { + allowedReferences = [ extraUtils ]; + preferLocalBuild = true; + } '' + mkdir -p $out + + echo 'ENV{LD_LIBRARY_PATH}="${extraUtils}/lib"' > $out/00-env.rules + + cp -v ${udev}/lib/udev/rules.d/60-cdrom_id.rules $out/ + cp -v ${udev}/lib/udev/rules.d/60-persistent-storage.rules $out/ + cp -v ${udev}/lib/udev/rules.d/75-net-description.rules $out/ + cp -v ${udev}/lib/udev/rules.d/80-drivers.rules $out/ + cp -v ${udev}/lib/udev/rules.d/80-net-setup-link.rules $out/ + cp -v ${pkgs.lvm2}/lib/udev/rules.d/*.rules $out/ + ${config.boot.initrd.extraUdevRulesCommands} + + for i in $out/*.rules; do + substituteInPlace $i \ + --replace ata_id ${extraUtils}/bin/ata_id \ + --replace scsi_id ${extraUtils}/bin/scsi_id \ + --replace cdrom_id ${extraUtils}/bin/cdrom_id \ + --replace ${pkgs.coreutils}/bin/basename ${extraUtils}/bin/basename \ + --replace ${pkgs.util-linux}/bin/blkid ${extraUtils}/bin/blkid \ + --replace ${getBin pkgs.lvm2}/bin ${extraUtils}/bin \ + --replace ${pkgs.mdadm}/sbin ${extraUtils}/sbin \ + --replace ${pkgs.bash}/bin/sh ${extraUtils}/bin/sh \ + --replace ${udev} ${extraUtils} + done + + # Work around a bug in QEMU, which doesn't implement the "READ + # DISC INFORMATION" SCSI command: + # https://bugzilla.redhat.com/show_bug.cgi?id=609049 + # As a result, `cdrom_id' doesn't print + # ID_CDROM_MEDIA_TRACK_COUNT_DATA, which in turn prevents the + # /dev/disk/by-label symlinks from being created. We need these + # in the NixOS installation CD, so use ID_CDROM_MEDIA in the + # corresponding udev rules for now. 
This was the behaviour in + # udev <= 154. See also + # http://www.spinics.net/lists/hotplug/msg03935.html + substituteInPlace $out/60-persistent-storage.rules \ + --replace ID_CDROM_MEDIA_TRACK_COUNT_DATA ID_CDROM_MEDIA + ''; # */ + + + # The init script of boot stage 1 (loading kernel modules for + # mounting the root FS). + bootStage1 = pkgs.substituteAll { + src = ./stage-1-init.sh; + + shell = "${extraUtils}/bin/ash"; + + isExecutable = true; + + postInstall = '' + echo checking syntax + # check both with bash + ${pkgs.buildPackages.bash}/bin/sh -n $target + # and with ash shell, just in case + ${pkgs.buildPackages.busybox}/bin/ash -n $target + ''; + + inherit linkUnits udevRules extraUtils modulesClosure; + + inherit (config.boot) resumeDevice; + + inherit (config.system.build) earlyMountScript; + + inherit (config.boot.initrd) checkJournalingFS verbose + preLVMCommands preDeviceCommands postDeviceCommands postMountCommands preFailCommands kernelModules; + + resumeDevices = map (sd: if sd ? device then sd.device else "/dev/disk/by-label/${sd.label}") + (filter (sd: hasPrefix "/dev/" sd.device && !sd.randomEncryption.enable + # Don't include zram devices + && !(hasPrefix "/dev/zram" sd.device) + ) config.swapDevices); + + fsInfo = + let f = fs: [ fs.mountPoint (if fs.device != null then fs.device else "/dev/disk/by-label/${fs.label}") fs.fsType (builtins.concatStringsSep "," fs.options) ]; + in pkgs.writeText "initrd-fsinfo" (concatStringsSep "\n" (concatMap f fileSystems)); + + setHostId = optionalString (config.networking.hostId != null) '' + hi="${config.networking.hostId}" + ${if pkgs.stdenv.isBigEndian then '' + echo -ne "\x''${hi:0:2}\x''${hi:2:2}\x''${hi:4:2}\x''${hi:6:2}" > /etc/hostid + '' else '' + echo -ne "\x''${hi:6:2}\x''${hi:4:2}\x''${hi:2:2}\x''${hi:0:2}" > /etc/hostid + ''} + ''; + }; + + + # The closure of the init script of boot stage 1 is what we put in + # the initial RAM disk. 
+ initialRamdisk = pkgs.makeInitrd { + name = "initrd-${kernel-name}"; + inherit (config.boot.initrd) compressor compressorArgs prepend; + + contents = + [ { object = bootStage1; + symlink = "/init"; + } + { object = pkgs.writeText "mdadm.conf" config.boot.initrd.mdadmConf; + symlink = "/etc/mdadm.conf"; + } + { object = pkgs.runCommand "initrd-kmod-blacklist-ubuntu" { + src = "${pkgs.kmod-blacklist-ubuntu}/modprobe.conf"; + preferLocalBuild = true; + } '' + target=$out + ${pkgs.buildPackages.perl}/bin/perl -0pe 's/## file: iwlwifi.conf(.+?)##/##/s;' $src > $out + ''; + symlink = "/etc/modprobe.d/ubuntu.conf"; + } + { object = pkgs.kmod-debian-aliases; + symlink = "/etc/modprobe.d/debian.conf"; + } + ]; + }; + + # Script to add secret files to the initrd at bootloader update time + initialRamdiskSecretAppender = + let + compressorExe = initialRamdisk.compressorExecutableFunction pkgs; + in pkgs.writeScriptBin "append-initrd-secrets" + '' + #!${pkgs.bash}/bin/bash -e + function usage { + echo "USAGE: $0 INITRD_FILE" >&2 + echo "Appends this configuration's secrets to INITRD_FILE" >&2 + } + + if [ $# -ne 1 ]; then + usage + exit 1 + fi + + if [ "$1"x = "--helpx" ]; then + usage + exit 0 + fi + + ${lib.optionalString (config.boot.initrd.secrets == {}) + "exit 0"} + + export PATH=${pkgs.coreutils}/bin:${pkgs.cpio}/bin:${pkgs.gzip}/bin:${pkgs.findutils}/bin + + function cleanup { + if [ -n "$tmp" -a -d "$tmp" ]; then + rm -fR "$tmp" + fi + } + trap cleanup EXIT + + # Stage secrets under $TMPDIR (falling back to /tmp): a bare mktemp + # template would be created in the caller's current directory, which + # may be read-only or shared. + tmp=$(mktemp -d "''${TMPDIR:-/tmp}/initrd-secrets.XXXXXXXXXX") + + ${lib.concatStringsSep "\n" (mapAttrsToList (dest: source: + let source' = if source == null then dest else toString source; in + '' + mkdir -p "$(dirname "$tmp/${dest}")" + cp -aL ${source'} "$tmp/${dest}" + '' + ) config.boot.initrd.secrets) + } + + (cd "$tmp" && find .
-print0 | sort -z | cpio --quiet -o -H newc -R +0:+0 --reproducible --null) | \ + ${compressorExe} ${lib.escapeShellArgs initialRamdisk.compressorArgs} >> "$1" + ''; + +in + +{ + disabledModules = [ "system/boot/stage-1.nix" ]; + + options = { + + boot.resumeDevice = mkOption { + type = types.str; + default = ""; + example = "/dev/sda3"; + description = '' + Device for manual resume attempt during boot. This should be used primarily + if you want to resume from file. If left empty, the swap partitions are used. + Specify here the device where the file resides. + You should also use boot.kernelParams to specify + resume_offset. + ''; + }; + + boot.initrd.enable = mkOption { + type = types.bool; + default = !config.boot.isContainer; + defaultText = "!config.boot.isContainer"; + description = '' + Whether to enable the NixOS initial RAM disk (initrd). This may be + needed to perform some initialisation tasks (like mounting + network/encrypted file systems) before continuing the boot process. + ''; + }; + + boot.initrd.prepend = mkOption { + default = [ ]; + type = types.listOf types.str; + description = '' + Other initrd files to prepend to the final initrd we are building. + ''; + }; + + boot.initrd.checkJournalingFS = mkOption { + default = true; + type = types.bool; + description = '' + Whether to run fsck on journaling filesystems such as ext3. + ''; + }; + + boot.initrd.mdadmConf = mkOption { + default = ""; + type = types.lines; + description = '' + Contents of /etc/mdadm.conf in stage 1. + ''; + }; + + boot.initrd.preLVMCommands = mkOption { + default = ""; + type = types.lines; + description = '' + Shell commands to be executed immediately before LVM discovery. + ''; + }; + + boot.initrd.preDeviceCommands = mkOption { + default = ""; + type = types.lines; + description = '' + Shell commands to be executed before udev is started to create + device nodes. 
+ ''; + }; + + boot.initrd.postDeviceCommands = mkOption { + default = ""; + type = types.lines; + description = '' + Shell commands to be executed immediately after stage 1 of the + boot has loaded kernel modules and created device nodes in + /dev. + ''; + }; + + boot.initrd.postMountCommands = mkOption { + default = ""; + type = types.lines; + description = '' + Shell commands to be executed immediately after the stage 1 + filesystems have been mounted. + ''; + }; + + boot.initrd.preFailCommands = mkOption { + default = ""; + type = types.lines; + description = '' + Shell commands to be executed before the failure prompt is shown. + ''; + }; + + boot.initrd.extraUtilsCommands = mkOption { + internal = true; + default = ""; + type = types.lines; + description = '' + Shell commands to be executed in the builder of the + extra-utils derivation. This can be used to provide + additional utilities in the initial ramdisk. + ''; + }; + + boot.initrd.extraUtilsCommandsTest = mkOption { + internal = true; + default = ""; + type = types.lines; + description = '' + Shell commands to be executed in the builder of the + extra-utils derivation after patchelf has done its + job. This can be used to test additional utilities + copied in extraUtilsCommands. + ''; + }; + + boot.initrd.extraUdevRulesCommands = mkOption { + internal = true; + default = ""; + type = types.lines; + description = '' + Shell commands to be executed in the builder of the + udev-rules derivation. This can be used to add + additional udev rules in the initial ramdisk. + ''; + }; + + boot.initrd.compressor = mkOption { + default = ( + if lib.versionAtLeast config.boot.kernelPackages.kernel.version "5.9" + then "zstd" + else "gzip" + ); + defaultText = "zstd if the kernel supports it (5.9+), gzip if not."; + type = types.unspecified; # We don't have a function type... + description = '' + The compressor to use on the initrd image. 
May be any of: + + + The name of one of the predefined compressors, see pkgs/build-support/kernel/initrd-compressor-meta.nix for the definitions. + A function which, given the nixpkgs package set, returns the path to a compressor tool, e.g. pkgs: "''${pkgs.pigz}/bin/pigz" + (not recommended, because it does not work when cross-compiling) the full path to a compressor tool, e.g. "''${pkgs.pigz}/bin/pigz" + + + The given program should read data from stdin and write it to stdout compressed. + ''; + example = "xz"; + }; + + boot.initrd.compressorArgs = mkOption { + default = null; + type = types.nullOr (types.listOf types.str); + description = "Arguments to pass to the compressor for the initrd image, or null to use the compressor's defaults."; + }; + + boot.initrd.secrets = mkOption + { default = {}; + type = types.attrsOf (types.nullOr types.path); + description = + '' + Secrets to append to the initrd. The attribute name is the + path the secret should have inside the initrd, the value + is the path it should be copied from (or null for the same + path inside and out). + ''; + example = literalExample + '' + { "/etc/dropbear/dropbear_rsa_host_key" = + ./secret-dropbear-key; + } + ''; + }; + + boot.initrd.supportedFilesystems = mkOption { + default = [ ]; + example = [ "btrfs" ]; + type = types.listOf types.str; + description = "Names of supported filesystem types in the initial ramdisk."; + }; + + boot.initrd.verbose = mkOption { + default = true; + type = types.bool; + description = + '' + Verbosity of the initrd. Please note that disabling verbosity removes + only the mandatory messages generated by the NixOS scripts. 
For a + completely silent boot, you might also want to set the two following + configuration options: + + + boot.consoleLogLevel = 0; + boot.kernelParams = [ "quiet" "udev.log_priority=3" ]; + + ''; + }; + + boot.loader.supportsInitrdSecrets = mkOption + { internal = true; + default = false; + type = types.bool; + description = + '' + Whether the bootloader setup runs append-initrd-secrets. + If not, any needed secrets must be copied into the initrd + and thus added to the store. + ''; + }; + + fileSystems = mkOption { + type = with lib.types; attrsOf (submodule { + options.neededForBoot = mkOption { + default = false; + type = types.bool; + description = '' + If set, this file system will be mounted in the initial ramdisk. + Note that the file system will always be mounted in the initial + ramdisk if its mount point is one of the following: + ${concatStringsSep ", " ( + forEach utils.pathsNeededForBoot (i: "${i}") + )}. + ''; + }; + }); + }; + + }; + + config = mkIf config.boot.initrd.enable { + assertions = [ + { assertion = any (fs: fs.mountPoint == "/") fileSystems; + message = "The ‘fileSystems’ option does not specify your root file system."; + } + { assertion = let inherit (config.boot) resumeDevice; in + resumeDevice == "" || builtins.substring 0 1 resumeDevice == "/"; + message = "boot.resumeDevice has to be an absolute path." + + " Old \"x:y\" style is no longer supported."; + } + # TODO: remove when #85000 is fixed + # The message must name the option as users set it: boot.initrd.secrets. + { assertion = !config.boot.loader.supportsInitrdSecrets -> + all (source: + builtins.isPath source || + (builtins.isString source && hasPrefix builtins.storeDir source)) + (attrValues config.boot.initrd.secrets); + message = '' + boot.initrd.secrets values must be unquoted paths when + using a bootloader that doesn't natively support initrd + secrets, e.g.: + + boot.initrd.secrets = { + "/etc/secret" = /path/to/secret; + }; + + Note that this will result in all secrets being stored + world-readable in the Nix store!
+ ''; + } + ]; + + system.build = + { inherit bootStage1 initialRamdisk initialRamdiskSecretAppender extraUtils; }; + + system.requiredKernelConfig = with config.lib.kernelConfig; [ + (isYes "TMPFS") + (isYes "BLK_DEV_INITRD") + ]; + + boot.initrd.supportedFilesystems = map (fs: fs.fsType) fileSystems; + + }; +} diff --git a/modules/stage-1/stage-1-init.sh b/modules/stage-1/stage-1-init.sh new file mode 100644 index 00000000..ddaf9858 --- /dev/null +++ b/modules/stage-1/stage-1-init.sh @@ -0,0 +1,638 @@ +#! @shell@ + +targetRoot=/mnt-root +console=tty1 +verbose="@verbose@" + +info() { + if [[ -n "$verbose" ]]; then + echo "$@" + fi +} + +extraUtils="@extraUtils@" +export LD_LIBRARY_PATH=@extraUtils@/lib +export PATH=@extraUtils@/bin +ln -s @extraUtils@/bin /bin + +# Copy the secrets to their needed location +if [ -d "@extraUtils@/secrets" ]; then + for secret in $(cd "@extraUtils@/secrets"; find . -type f); do + mkdir -p $(dirname "/$secret") + ln -s "@extraUtils@/secrets/$secret" "$secret" + done +fi + +# Stop LVM complaining about fd3 +export LVM_SUPPRESS_FD_WARNINGS=true + +fail() { + if [ -n "$panicOnFail" ]; then exit 1; fi + + @preFailCommands@ + + # If starting stage 2 failed, allow the user to repair the problem + # in an interactive shell. + cat </dev/$console 2>/dev/$console" + elif [ -n "$allowShell" -a "$reply" = i ]; then + echo "Starting interactive shell..." + setsid @shell@ -c "exec @shell@ < /dev/$console >/dev/$console 2>/dev/$console" || fail + elif [ "$reply" = r ]; then + echo "Rebooting..." + reboot -f + else + info "Continuing..." + fi +} + +trap 'fail' 0 + + +# Print a greeting. +info +info "<<< NixOS Stage 1 >>>" +info + +# Make several required directories. +mkdir -p /etc/udev +touch /etc/fstab # to shut up mount +ln -s /proc/mounts /etc/mtab # to shut up mke2fs +touch /etc/udev/hwdb.bin # to shut up udev +touch /etc/initrd-release + +# Function for waiting a device to appear. 
+waitDevice() { + local device="$1" + + # USB storage devices tend to appear with some delay. It would be + # great if we had a way to synchronously wait for them, but + # alas... So just wait for a few seconds for the device to + # appear. + if test ! -e $device; then + echo -n "waiting for device $device to appear..." + try=20 + while [ $try -gt 0 ]; do + sleep 1 + # also re-try lvm activation now that new block devices might have appeared + lvm vgchange -ay + # and tell udev to create nodes for the new LVs + udevadm trigger --action=add + if test -e $device; then break; fi + echo -n "." + try=$((try - 1)) + done + echo + [ $try -ne 0 ] + fi +} + +# Mount special file systems. +specialMount() { + local device="$1" + local mountPoint="$2" + local options="$3" + local fsType="$4" + + mkdir -m 0755 -p "$mountPoint" + mount -n -t "$fsType" -o "$options" "$device" "$mountPoint" +} +source @earlyMountScript@ + +# Log the script output to /dev/kmsg or /run/log/stage-1-init.log. +mkdir -p /tmp +mkfifo /tmp/stage-1-init.log.fifo +logOutFd=8 && logErrFd=9 +eval "exec $logOutFd>&1 $logErrFd>&2" +if test -w /dev/kmsg; then + tee -i < /tmp/stage-1-init.log.fifo /proc/self/fd/"$logOutFd" | while read -r line; do + if test -n "$line"; then + echo "<7>stage-1-init: [$(date)] $line" > /dev/kmsg + fi + done & +else + mkdir -p /run/log + tee -i < /tmp/stage-1-init.log.fifo /run/log/stage-1-init.log & +fi +exec > /tmp/stage-1-init.log.fifo 2>&1 + + +# Process the kernel command line. +export stage2Init=/init +for o in $(cat /proc/cmdline); do + case $o in + console=*) + set -- $(IFS==; echo $o) + params=$2 + set -- $(IFS=,; echo $params) + console=$1 + ;; + init=*) + set -- $(IFS==; echo $o) + stage2Init=$2 + ;; + boot.persistence=*) + set -- $(IFS==; echo $o) + persistence=$2 + ;; + boot.persistence.opt=*) + set -- $(IFS==; echo $o) + persistence_opt=$2 + ;; + boot.trace|debugtrace) + # Show each command. 
+ set -x + ;; + boot.shell_on_fail) + allowShell=1 + ;; + boot.debug1|debug1) # stop right away + allowShell=1 + fail + ;; + boot.debug1devices) # stop after loading modules and creating device nodes + allowShell=1 + debug1devices=1 + ;; + boot.debug1mounts) # stop after mounting file systems + allowShell=1 + debug1mounts=1 + ;; + boot.panic_on_fail|stage1panic=1) + panicOnFail=1 + ;; + root=*) + # If a root device is specified on the kernel command + # line, make it available through the symlink /dev/root. + # Recognise LABEL= and UUID= to support UNetbootin. + set -- $(IFS==; echo $o) + if [ $2 = "LABEL" ]; then + root="/dev/disk/by-label/$3" + elif [ $2 = "UUID" ]; then + root="/dev/disk/by-uuid/$3" + else + root=$2 + fi + ln -s "$root" /dev/root + ;; + copytoram) + copytoram=1 + ;; + findiso=*) + # if an iso name is supplied, try to find the device where + # the iso resides on + set -- $(IFS==; echo $o) + isoPath=$2 + ;; + esac +done + +# Set hostid before modules are loaded. +# This is needed by the spl/zfs modules. +@setHostId@ + +# Load the required kernel modules. +mkdir -p /lib +ln -s @modulesClosure@/lib/modules /lib/modules +ln -s @modulesClosure@/lib/firmware /lib/firmware +echo @extraUtils@/bin/modprobe > /proc/sys/kernel/modprobe +for i in @kernelModules@; do + info "loading module $(basename $i)..." + modprobe $i +done + + +# Create device nodes in /dev. +@preDeviceCommands@ +info "running udev..." +ln -sfn /proc/self/fd /dev/fd +ln -sfn /proc/self/fd/0 /dev/stdin +ln -sfn /proc/self/fd/1 /dev/stdout +ln -sfn /proc/self/fd/2 /dev/stderr +mkdir -p /etc/systemd +ln -sfn @linkUnits@ /etc/systemd/network +mkdir -p /etc/udev +ln -sfn @udevRules@ /etc/udev/rules.d +mkdir -p /dev/.mdadm +systemd-udevd --daemon +udevadm trigger --action=add +udevadm settle + + +# XXX: Use case usb->lvm will still fail, usb->luks->lvm is covered +@preLVMCommands@ + +info "starting device mapper and LVM..." 
+lvm vgchange -ay + +if test -n "$debug1devices"; then fail; fi + + +@postDeviceCommands@ + + +# Return true if the machine is on AC power, or if we can't determine +# whether it's on AC power. +onACPower() { + ! test -d "/proc/acpi/battery" || + ! ls /proc/acpi/battery/BAT[0-9]* > /dev/null 2>&1 || + ! cat /proc/acpi/battery/BAT*/state | grep "^charging state" | grep -q "discharg" +} + + +# Check the specified file system, if appropriate. +checkFS() { + local device="$1" + local fsType="$2" + + # Only check block devices. + if [ ! -b "$device" ]; then return 0; fi + + # Don't check ROM filesystems. + if [ "$fsType" = iso9660 -o "$fsType" = udf ]; then return 0; fi + + # Don't check resilient COWs as they validate the fs structures at mount time + if [ "$fsType" = btrfs -o "$fsType" = zfs -o "$fsType" = bcachefs ]; then return 0; fi + + # Skip fsck for nilfs2 - not needed by design and no fsck tool for this filesystem. + if [ "$fsType" = nilfs2 ]; then return 0; fi + + # Skip fsck for inherently readonly filesystems. + if [ "$fsType" = squashfs ]; then return 0; fi + + # If we couldn't figure out the FS type, then skip fsck. + if [ "$fsType" = auto ]; then + echo 'cannot check filesystem with type "auto"!' + return 0 + fi + + # Device might be already mounted manually + # e.g. NBD-device or the host filesystem of the file which contains encrypted root fs + if mount | grep -q "^$device on "; then + echo "skip checking already mounted $device" + return 0 + fi + + # Optionally, skip fsck on journaling filesystems. This option is + # a hack - it's mostly because e2fsck on ext3 takes much longer to + # recover the journal than the ext3 implementation in the kernel + # does (minutes versus seconds). + if test -z "@checkJournalingFS@" -a \ + \( "$fsType" = ext3 -o "$fsType" = ext4 -o "$fsType" = reiserfs \ + -o "$fsType" = xfs -o "$fsType" = jfs -o "$fsType" = f2fs \) + then + return 0 + fi + + # Don't run `fsck' if the machine is on battery power. !!! 
Is + # this a good idea? + if ! onACPower; then + echo "on battery power, so no \`fsck' will be performed on \`$device'" + return 0 + fi + + echo "checking $device..." + + fsckFlags= + if test "$fsType" != "btrfs"; then + fsckFlags="-V -a" + fi + fsck $fsckFlags "$device" + fsckResult=$? + + if test $(($fsckResult | 2)) = $fsckResult; then + echo "fsck finished, rebooting..." + sleep 3 + reboot -f + fi + + if test $(($fsckResult | 4)) = $fsckResult; then + echo "$device has unrepaired errors, please fix them manually." + fail + fi + + if test $fsckResult -ge 8; then + echo "fsck on $device failed." + fail + fi + + return 0 +} + + +# Function for mounting a file system. +mountFS() { + local device="$1" + local mountPoint="$2" + local options="$3" + local fsType="$4" + + if [ "$fsType" = auto ]; then + fsType=$(blkid -o value -s TYPE "$device") + if [ -z "$fsType" ]; then fsType=auto; fi + fi + + # Filter out x- options, which busybox doesn't do yet. + local optionsFiltered="$(IFS=,; for i in $options; do if [ "${i:0:2}" != "x-" ]; then echo -n $i,; fi; done)" + # Prefix (lower|upper|work)dir with /mnt-root (overlayfs) + local optionsPrefixed="$( echo "$optionsFiltered" | sed -E 's#\<(lowerdir|upperdir|workdir)=#\1=/mnt-root#g' )" + + echo "$device /mnt-root$mountPoint $fsType $optionsPrefixed" >> /etc/fstab + + checkFS "$device" "$fsType" + + # Optionally resize the filesystem. + case $options in + *x-nixos.autoresize*) + if [ "$fsType" = ext2 -o "$fsType" = ext3 -o "$fsType" = ext4 ]; then + modprobe "$fsType" + echo "resizing $device..." + e2fsck -fp "$device" + resize2fs "$device" + elif [ "$fsType" = f2fs ]; then + echo "resizing $device..." 
+ fsck.f2fs -fp "$device" + resize.f2fs "$device" + fi + ;; + esac + + # Create backing directories for overlayfs + if [ "$fsType" = overlay ]; then + for i in upper work; do + dir="$( echo "$optionsPrefixed" | grep -o "${i}dir=[^,]*" )" + mkdir -m 0700 -p "${dir##*=}" + done + fi + + info "mounting $device on $mountPoint..." + + mkdir -p "/mnt-root$mountPoint" + + # For ZFS and CIFS mounts, retry a few times before giving up. + # We do this for ZFS as a workaround for issue NixOS/nixpkgs#25383. + local n=0 + while true; do + mount "/mnt-root$mountPoint" && break + if [ \( "$fsType" != cifs -a "$fsType" != zfs \) -o "$n" -ge 10 ]; then fail; break; fi + echo "retrying..." + sleep 1 + n=$((n + 1)) + done + + [ "$mountPoint" == "/" ] && + [ -f "/mnt-root/etc/NIXOS_LUSTRATE" ] && + lustrateRoot "/mnt-root" + + true +} + +# Move every non-hidden top-level entry of the given root (except nix, boot +# and the staging directory) into old-root, then copy back the paths listed, +# one per line, in the moved etc/NIXOS_LUSTRATE file. +lustrateRoot () { + local root="$1" + + echo + echo -e "\e[1;33m<<< NixOS is now lustrating the root filesystem (cruft goes to /old-root) >>>\e[0m" + echo + + mkdir -m 0755 -p "$root/old-root.tmp" + + echo + echo "Moving impurities out of the way:" + for d in "$root"/* + do + [ "$d" == "$root/nix" ] && continue + [ "$d" == "$root/boot" ] && continue # Don't render the system unbootable + [ "$d" == "$root/old-root.tmp" ] && continue + + mv -v "$d" "$root/old-root.tmp" + done + + # Use .tmp to make sure subsequent invocations don't clash + mv -v "$root/old-root.tmp" "$root/old-root" + + mkdir -m 0755 -p "$root/etc" + touch "$root/etc/NIXOS" + + exec 4< "$root/old-root/etc/NIXOS_LUSTRATE" + + echo + echo "Restoring selected impurities:" + while read -u 4 keeper; do + dirname="$(dirname "$keeper")" + mkdir -m 0755 -p "$root/$dirname" + cp -av "$root/old-root/$keeper" "$root/$keeper" + done + + exec 4>&- +} + + + +if test -e /sys/power/resume -a -e /sys/power/disk; then + if test -n "@resumeDevice@" && waitDevice "@resumeDevice@"; then + resumeDev="@resumeDevice@" + resumeInfo="$(udevadm info -q property "$resumeDev" )" + else + for sd in @resumeDevices@;
do + # Try to detect resume device. According to Ubuntu bug: + # https://bugs.launchpad.net/ubuntu/+source/pm-utils/+bug/923326/comments/1 + # when there are multiple swap devices, we can't know where the hibernate + # image will reside. We can check all of them for swsuspend blkid. + if waitDevice "$sd"; then + resumeInfo="$(udevadm info -q property "$sd")" + if [ "$(echo "$resumeInfo" | sed -n 's/^ID_FS_TYPE=//p')" = "swsuspend" ]; then + resumeDev="$sd" + break + fi + fi + done + fi + if test -n "$resumeDev"; then + resumeMajor="$(echo "$resumeInfo" | sed -n 's/^MAJOR=//p')" + resumeMinor="$(echo "$resumeInfo" | sed -n 's/^MINOR=//p')" + echo "$resumeMajor:$resumeMinor" > /sys/power/resume 2> /dev/null || echo "failed to resume..." + fi +fi + +# If we have a path to an iso file, find the iso and link it to /dev/root +if [ -n "$isoPath" ]; then + mkdir -p /findiso + + for delay in 5 10; do + blkid | while read -r line; do + device=$(echo "$line" | sed 's/:.*//') + type=$(echo "$line" | sed 's/.*TYPE="\([^"]*\)".*/\1/') + + mount -t "$type" "$device" /findiso + if [ -e "/findiso$isoPath" ]; then + ln -sf "/findiso$isoPath" /dev/root + break 2 + else + umount /findiso + fi + done + + # The loop above runs in a pipeline subshell, so its "break 2" cannot + # leave this outer loop; stop retrying explicitly once the ISO is found. + if [ -e "/findiso$isoPath" ]; then + break + fi + + sleep "$delay" + done +fi + +# Try to find and mount the root device. +mkdir -p $targetRoot + +exec 3< @fsInfo@ + +while read -u 3 mountPoint; do + read -u 3 device + read -u 3 fsType + read -u 3 options + + # !!! Really quick hack to support bind mounts, i.e., where the + # "device" should be taken relative to /mnt-root, not /. Assume + # that every device that starts with / but doesn't start with /dev + # is a bind mount. + pseudoDevice= + case $device in + /dev/*) + ;; + //*) + # Don't touch SMB/CIFS paths. + pseudoDevice=1 + ;; + /*) + device=/mnt-root$device + ;; + *) + # Not an absolute path; assume that it's a pseudo-device + # like an NFS path (e.g. "server:/path"). + pseudoDevice=1 + ;; + esac + + if test -z "$pseudoDevice" && !
waitDevice "$device"; then + # If it doesn't appear, try to mount it anyway (and + # probably fail). This is a fallback for non-device "devices" + # that we don't properly recognise. + echo "Timed out waiting for device $device, trying to mount anyway." + fi + + # Wait once more for the udev queue to empty, just in case it's + # doing something with $device right now. + udevadm settle + + # If copytoram is enabled: skip mounting the ISO and copy its content to a tmpfs. + if [ -n "$copytoram" ] && [ "$device" = /dev/root ] && [ "$mountPoint" = /iso ]; then + fsType=$(blkid -o value -s TYPE "$device") + fsSize=$(blockdev --getsize64 "$device") + + mkdir -p /tmp-iso + mount -t "$fsType" /dev/root /tmp-iso + mountFS tmpfs /iso size="$fsSize" tmpfs + + cp -r /tmp-iso/* /mnt-root/iso/ + + umount /tmp-iso + rmdir /tmp-iso + continue + fi + + if [ "$mountPoint" = / ] && [ "$device" = tmpfs ] && [ ! -z "$persistence" ]; then + echo persistence... + waitDevice "$persistence" + echo enabling persistence... + mountFS "$persistence" "$mountPoint" "$persistence_opt" "auto" + continue + fi + + mountFS "$device" "$mountPoint" "$options" "$fsType" +done + +exec 3>&- + + +@postMountCommands@ + + +# Emit a udev rule for /dev/root to prevent systemd from complaining. +if [ -e /mnt-root/iso ]; then + eval $(udevadm info --export --export-prefix=ROOT_ --device-id-of-file=/mnt-root/iso) +else + eval $(udevadm info --export --export-prefix=ROOT_ --device-id-of-file=$targetRoot) +fi +if [ "$ROOT_MAJOR" -a "$ROOT_MINOR" -a "$ROOT_MAJOR" != 0 ]; then + mkdir -p /run/udev/rules.d + echo 'ACTION=="add|change", SUBSYSTEM=="block", ENV{MAJOR}=="'$ROOT_MAJOR'", ENV{MINOR}=="'$ROOT_MINOR'", SYMLINK+="root"' > /run/udev/rules.d/61-dev-root-link.rules +fi + + +# Stop udevd. +udevadm control --exit + +# Reset the logging file descriptors. +# Do this just before pkill, which will kill the tee process. 
exec 1>&$logOutFd 2>&$logErrFd
# The descriptor numbers live in variables, so the closing redirections
# have to be assembled as a string and evaluated.
eval "exec $logOutFd>&- $logErrFd>&-"

# Make sure no stage-1 process follows us into stage 2, with the exception
# of storage daemons such as unionfs-fuse.
#
# A storage daemon is recognised by a leading @ in its command line:
# https://www.freedesktop.org/wiki/Software/systemd/RootStorageDaemons/
for pid in $(pgrep -v -f '^@'); do
    # Kernel processes must be left alone (see #15226 and
    # http://stackoverflow.com/questions/12213445/identifying-kernel-threads);
    # they are skipped when /proc/<pid>/exe cannot be read as a link.
    # This shell itself is spared as well.
    if readlink "/proc/$pid/exe" &> /dev/null && ! [ $pid -eq $$ ]; then
        kill -9 "$pid"
    fi
done

if [ -n "$debug1mounts" ]; then
    fail
fi


# Put the original helper path back into /proc/sys/kernel/modprobe.
echo /sbin/modprobe > /proc/sys/kernel/modprobe


# Hand over to stage 2.  `switch_root' removes every file in the ramfs of
# the current root, and the init path must be valid inside the new root
# rather than outside of it.  When the path does not resolve from here,
# accept it anyway as long as it or one of its leading components is a
# symlink below $targetRoot.
if [ ! -e "$targetRoot/$stage2Init" ]; then
    stage2Check=${stage2Init}
    # Strip trailing path components until a symlink is hit or nothing
    # more can be stripped.
    until [ "$stage2Check" = "${stage2Check%/*}" ] || [ -L "$targetRoot/$stage2Check" ]; do
        stage2Check=${stage2Check%/*}
    done
    [ -L "$targetRoot/$stage2Check" ] || {
        echo "stage 2 init script ($targetRoot/$stage2Init) not found"
        fail
    }
fi

mkdir -m 0755 -p $targetRoot/proc $targetRoot/sys $targetRoot/dev $targetRoot/run

# Carry the API filesystems over into the new root.
for apiFs in proc sys dev run; do
    mount --move /$apiFs $targetRoot/$apiFs
done

exec env -i $(type -P switch_root) "$targetRoot" "$stage2Init"

fail # should never be reached
# (patch footer) -- cgit v1.2.3