{ pkgs, config, lib, ... }:

with lib;

let
  cfg = config.networking.namespaces;

  # Submodule describing a single entry of
  # `networking.namespaces.containers`.  `name` is the attribute name of
  # the entry and doubles as the default host name and default network
  # namespace of the container.
  containerOpts = { name, ... }: {
    options = {
      config = mkOption {
        description = ''
          A specification of the desired configuration of this
          container, as a NixOS module.
        '';
        # Custom option type: merging the definitions evaluates a complete
        # NixOS system (via nixpkgs' eval-config.nix) from them and yields
        # the resulting `config` attribute set.
        type = mkOptionType {
          name = "Toplevel NixOS config";
          merge = loc: defs: (import (pkgs.path + "/nixos/lib/eval-config.nix") {
            inherit (config.nixpkgs.localSystem) system;
            inherit pkgs;
            baseModules = import (pkgs.path + "/nixos/modules/module-list.nix");
            inherit (pkgs) lib;
            modules =
              let
                # Settings injected into every container ahead of the
                # user-supplied definitions.  Note that `config` on the
                # right-hand sides below is the *host* configuration.
                extraConfig = {
                  _file = "module at ${__curPos.file}:${toString __curPos.line}";
                  config = {
                    boot.isContainer = true;
                    networking.hostName = mkDefault name;
                    system.stateVersion = config.system.nixos.release; # No state
                  };
                };
              in [ extraConfig ] ++ (map (x: x.value) defs);
            # Use the container's own name so that option locations reported
            # by the nested evaluation point at the right container (this
            # used to be hard-coded to "upstream").
            prefix = [ "containers" name ];
          }).config;
        };
      };

      netns = mkOption {
        example = "upstream";
        type = types.str;
        description = "Name of network namespace to put the container in.";
      };
    };

    config = {
      # Unless overridden, the container joins the namespace named after it.
      netns = mkDefault name;
    };
  };

  # Build the systemd service that runs one container.
  #
  # containerName: attribute name of the container.
  # containerCfg:  evaluated `containerOpts` submodule for that container.
  #
  # Returns a name/value pair suitable for `mapAttrs'`, keyed
  # `netns-container@<containerName>`.
  mkContainerService = containerName: containerCfg: nameValuePair "netns-container@${containerName}" {
    # The container is ordered after, and bound to, the unit that provides
    # its network namespace.
    after = ["network.target" "systemd-udevd.service" "systemd-sysctl.service" "netns@${containerCfg.netns}.service"];
    bindsTo = ["netns@${containerCfg.netns}.service"];
    before = ["shutdown.target"];
    wants = ["network.target"];
    conflicts = ["shutdown.target"];

    path = with pkgs; [ iproute2 config.systemd.package ];

    serviceConfig = {
      SyslogIdentifier = "netns container ${containerName}";
      Type = "notify";

      # Exit status 133 is treated as success and forces a restart even
      # though Restart is otherwise disabled.
      RestartForceExitStatus = "133";
      SuccessExitStatus = "133";

      Restart = "no";

      DevicePolicy = "closed";

      RuntimeDirectory = ["netns-containers/${containerName}"];
    };
    unitConfig = {
      ConditionCapability = ["CAP_SYS_TTY_CONFIG" "CAP_NET_ADMIN" "CAP_NET_RAW" "CAP_SYS_ADMIN"];
    };

    script = let
      # Minimal init wrapper: replaces itself with the program given as its
      # first argument (the container system's init).
      containerInit = pkgs.writeScript "container-init" ''
          #!${pkgs.runtimeShell} -e
          exec "$1"
        '';
    in ''
      mkdir -p -m 0755 "''${RUNTIME_DIRECTORY}/etc" "''${RUNTIME_DIRECTORY}/var/lib"
      mkdir -p -m 0700 "''${RUNTIME_DIRECTORY}/var/lib/private" "''${RUNTIME_DIRECTORY}/root" /run/containers
      if ! [ -e "''${RUNTIME_DIRECTORY}/etc/os-release" ]; then
        touch "''${RUNTIME_DIRECTORY}/etc/os-release"
      fi
      if ! [ -e "''${RUNTIME_DIRECTORY}/etc/machine-id" ]; then
        touch "''${RUNTIME_DIRECTORY}/etc/machine-id"
      fi
      mkdir -p -m 0755 \
        "/nix/var/nix/profiles/per-container/${containerName}" \
        "/nix/var/nix/gcroots/per-container/${containerName}"
      # Forward every credential handed to this unit into the container.
      credsBind=()
      if [ -n "''${CREDENTIALS_DIRECTORY:-}" ]; then
        while IFS= read -r -d $'\0' credFile; do
          credsBind+=("--load-credential=$(basename "''${credFile}"):''${credFile}")
        done < <(find "''${CREDENTIALS_DIRECTORY}" -type f -print0)
      fi
      # Run systemd-nspawn with --notify-ready=yes: readiness is only
      # reported to the host once the init inside the container has
      # signalled that it is ready.
      exec ${config.systemd.package}/bin/systemd-nspawn \
        --keep-unit \
        -M "${containerName}" -D "''${RUNTIME_DIRECTORY}" \
        --notify-ready=yes \
        --bind-ro=/nix/store \
        --bind-ro=/nix/var/nix/db \
        --bind-ro=/nix/var/nix/daemon-socket \
        "''${credsBind[@]}" \
        --bind="/nix/var/nix/profiles/per-container/${containerName}:/nix/var/nix/profiles" \
        --bind="/nix/var/nix/gcroots/per-container/${containerName}:/nix/var/nix/gcroots" \
        --setenv PATH="$PATH" \
        --capability=CAP_SYS_TTY_CONFIG,CAP_NET_ADMIN,CAP_NET_RAW,CAP_SYS_ADMIN \
        --ephemeral \
        --network-namespace-path=/run/netns/${containerCfg.netns} \
        ${containerInit} "${containerCfg.config.system.build.toplevel}/init"
    '';
  };
in {
  # Option declarations for `networking.namespaces`.
  options = {
    networking.namespaces = {
      enable = mkEnableOption "netns@ service template";

      # One NixOS system per attribute; each is turned into a
      # `netns-container@<name>` service by mkContainerService.
      containers = mkOption {
        default = {};
        type = types.attrsOf (types.submodule containerOpts);
        description = ''
          A set of NixOS system configurations to be run as lightweight
          containers.  Each container appears as a service
          `netns-container@«name»`
          on the host system, allowing it to be started and stopped via
          {command}`systemctl`.
        '';
      };
    };
  };

  config = {
    # Containers bind to `netns@<name>.service`, so declaring any container
    # requires the template unit to be enabled.
    assertions = [
      { assertion = cfg.containers != {} -> cfg.enable; message = "netns containers require netns@ service template"; }
    ];

    systemd.services = {
      # Template unit: `netns@<name>.service` provides the named network
      # namespace `<name>`.
      "netns@" = mkIf cfg.enable {
        description = "%I network namespace";
        before = [ "network-pre.target" ];
        wants = [ "network-pre.target" ];
        path = with pkgs; [ iproute2 util-linux ];
        serviceConfig = {
          Type = "oneshot";
          RemainAfterExit = true;
          PrivateNetwork = true;
          # Register the name with `ip netns add`, then replace what it
          # mounted at /var/run/netns/<name> with a bind mount of this
          # process's own network namespace (the unit runs with
          # PrivateNetwork enabled).
          ExecStart = "${pkgs.writers.writeDash "netns-up" ''
            ip netns add "$1"
            umount /var/run/netns/"$1"
            mount --bind /proc/self/ns/net /var/run/netns/"$1"
          ''} %I";
          ExecStop = "${pkgs.iproute2}/bin/ip netns del %I";
        };
      };
    } // mapAttrs' mkContainerService cfg.containers; # one service per declared container
  };
}