Example Site Configurations

This is a collection of example SalvusFlow site configurations for a few real sites around the world. The configurations originate either from us or from users who shared them.

Please keep in mind that we cannot continuously test these, so they might be slightly out of date. Nonetheless, they can still serve as guidance for configuring your own sites.
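
All of the examples below follow the same basic pattern: a `[sites.<name>]` table with the scheduler type, rank limits, and paths, plus `ssh_settings` and `site_specific` sub-tables for the scheduler details. The following minimal sketch, distilled from the examples on this page, uses the hypothetical site name `my_cluster` and placeholder paths; adapt every value to your own machine and see the SalvusFlow documentation for the full set of options.

[sites.my_cluster]
    # Scheduler type, e.g. "slurm", "pbs", or "lsf" as in the examples below.
    site_type = "slurm"
    default_ranks = 12
    max_ranks = 120
    # Placeholder paths - point these at your Salvus binary and work directories.
    salvus_binary = "/path/to/Salvus/bin/salvus"
    run_directory = "/path/to/salvus_flow/run"
    tmp_directory = "/path/to/salvus_flow/tmp"
    [sites.my_cluster.ssh_settings]
        hostname = "cluster.example.org"
        username = "{USER}"
    [sites.my_cluster.site_specific]
        # Scheduler-specific settings, see the full examples below.
        tasks_per_node = 12
        partition = "normal"
        path_to_slurm_binaries = "/usr/bin"

Remember that a new site still has to be initialized by SalvusFlow (the `init-site` step referenced in the Gadi example below) before it can run simulations.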

Slurm Example Sites

Piz Daint, CSCS, Switzerland

https://www.cscs.ch/computers/piz-daint/

GPU Partition

[sites.piz_daint]
    site_type = "slurm"
    default_ranks = 12
    max_ranks = 10000
    salvus_binary = "/users/{USER}/Salvus/bin/salvus"
    run_directory = "/scratch/snx3000/{USER}/salvus_flow/run"
    tmp_directory = "/scratch/snx3000/{USER}/salvus_flow/tmp"
    # The following line enables GPU support on Piz Daint.
    # Note that CUDA kernels for adjoint simulations are not included in
    # the current release, so you need to set this value to `false` if you
    # want to compute gradients (see the note after this configuration).
    use_cuda_capable_gpus = true
    [[sites.piz_daint.environment_variable]]
        name = "CRAY_CUDA_MPS"
        value = "1"
    # The cray-mpich-abi module does not set the LD_LIBRARY_PATH
    # so it has to be done manually. In case this no longer works, you can find
    # the correct path by looking for the CRAY_LD_LIBRARY_PATH setting in
    # `module show cray-mpich-abi`.
    [[sites.piz_daint.environment_variable]]
        name = "LD_LIBRARY_PATH"
        value = "/opt/cray/pe/mpt/7.7.15/gni/mpich-gnu-abi/8.2/lib"
    [sites.piz_daint.ssh_settings]
        hostname = "daint.cscs.ch"
        username = "{USER}"
    [sites.piz_daint.site_specific]
        tasks_per_node = 12
        partition = "normal"
        debug_partition = "debug"
        path_to_slurm_binaries = "/usr/bin"
        # Nov 2019: This is a workaround for a bug in the Slurm version
        # CSCS deployed. They are aware of it and are working on a fix.
        omit_default_srun_arguments = true
        # These are account/project dependent!
        [[sites.piz_daint.site_specific.additional_sbatch_arguments]]
            name = "constraint"
            value = "gpu"
        [[sites.piz_daint.site_specific.additional_sbatch_arguments]]
            name = "account"
            value = "{ACCOUNT}"
        # You have to switch two modules. The first switch is just because the
        # default module is incompatible with the second module we want to
        # load.
        [[sites.piz_daint.site_specific.modules_to_switch]]
            old = "PrgEnv-cray"
            new = "PrgEnv-gnu"
        # Load an ABI compatible MPI module for Salvus to use.
        [[sites.piz_daint.site_specific.modules_to_switch]]
            old = "cray-mpich"
            new = "cray-mpich-abi"
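
As mentioned in the comment inside the configuration above, the CUDA kernels in the current release do not cover adjoint simulations. If you need to compute gradients on this site, the only required change is to turn the GPU kernels off and keep everything else as is:

    # Disable the CUDA kernels so that adjoint simulations (gradients) work.
    use_cuda_capable_gpus = false

Alternatively, use the multicore partition configuration below for CPU-only runs.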

Multicore Partition

[sites.piz_daint_mc]
    site_type = "slurm"
    default_ranks = 12
    max_ranks = 10000
    salvus_binary = "/users/{USER}/Salvus/bin/salvus"
    run_directory = "/scratch/snx3000/{USER}/salvus_flow/run"
    tmp_directory = "/scratch/snx3000/{USER}/salvus_flow/tmp"
    # The cray-mpich-abi module does not set the LD_LIBRARY_PATH
    # so it has to be done manually. In case this no longer works, you can find
    # the correct path by looking for the CRAY_LD_LIBRARY_PATH setting in
    # `module show cray-mpich-abi`.
    [[sites.piz_daint_mc.environment_variable]]
        name = "LD_LIBRARY_PATH"
        value = "/opt/cray/pe/mpt/7.7.15/gni/mpich-gnu-abi/8.2/lib"
    [sites.piz_daint_mc.ssh_settings]
        hostname = "daint.cscs.ch"
        username = "{USER}"
    [sites.piz_daint_mc.site_specific]
        tasks_per_node = 12
        partition = "normal"
        debug_partition = "debug"
        path_to_slurm_binaries = "/usr/bin"
        # Nov 2019: This is a workaround for a bug in the Slurm version
        # CSCS deployed. They are aware of it and are working on a fix.
        omit_default_srun_arguments = true
        # These are account/project dependent!
        [[sites.piz_daint_mc.site_specific.additional_sbatch_arguments]]
            name = "constraint"
            value = "mc"
        [[sites.piz_daint_mc.site_specific.additional_sbatch_arguments]]
            name = "account"
            value = "{ACCOUNT}"
        # You have to switch two modules. The first switch is just because the
        # default module is incompatible with the second module we want to
        # load.
        [[sites.piz_daint_mc.site_specific.modules_to_switch]]
            old = "PrgEnv-cray"
            new = "PrgEnv-gnu"
        # Load an ABI compatible MPI module for Salvus to use.
        [[sites.piz_daint_mc.site_specific.modules_to_switch]]
            old = "cray-mpich"
            new = "cray-mpich-abi"

Eejit, Geosciences, University of Utrecht, Netherlands

[sites.eejit]
    site_type = "slurm"
    default_ranks = 48
    max_ranks = 960
    salvus_binary = "/quanta1/home/{USER}/Salvus/bin/salvus"
    run_directory = "/scratch/{USER}/SalvusFlow/run"
    tmp_directory = "/scratch/{USER}/SalvusFlow/tmp"
    [sites.eejit.ssh_settings]
        hostname = "eejit.geo.uu.nl"
        username = "{USER}"
    [sites.eejit.site_specific]
        tasks_per_node = 48
        partition = "gpu"
        path_to_slurm_binaries = "/usr/bin"
        omit_default_srun_arguments = true
        replace_srun_with = "mpirun"
        modules_to_load = ["intel-mpi/64/2018.0.128"]
        [[sites.eejit.site_specific.additional_srun_arguments]]
            name = "n"
            value = "$SLURM_NTASKS"

PBS Example Sites

Gadi, NCI, Australia

https://nci.org.au/our-systems/hpc-systems

All values prefixed and suffixed with an underscore (e.g. `_USER_`, `_PROJECT_`) must be replaced with your actual user and project names.
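
For example (hypothetical names), for a user `jdoe` working in project `ab12`, the run directory line would read:

        # Hypothetical values: user "jdoe" in project "ab12".
        run_directory = "/scratch/ab12/jdoe/salvus_flow/run"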

[sites.gadi]
        site_type = "pbs"
        # This depends on the chosen Gadi queue.
        default_ranks = 48
        # Just a safety measure.
        max_ranks = 800
        # Adapt these to your folders!
        salvus_binary = "/home/_XXX_/_USER_/Salvus/bin/salvus"
        run_directory = "/scratch/_PROJECT_/_USER_/salvus_flow/run"
        tmp_directory = "/scratch/_PROJECT_/_USER_/salvus_flow/tmp"
        # The compute nodes on Gadi cannot access the internet, so license
        # tokens have to be used. Jobs will hang if you don't set this to
        # true.
        use_license_tokens = true
        # Make sure to set up key-based SSH authentication beforehand, as
        # described in the Salvus documentation.
        [sites.gadi.ssh_settings]
            hostname = "gadi.nci.org.au"
            username = "_USER_"
        # Point to the release folder of the chosen Intel MPI module, as that
        # is where the required ABI-compatible MPI libraries are located.
        [[sites.gadi.environment_variable]]
            name = "LD_LIBRARY_PATH"
            value = "/apps/intel-mpi/2019.6.166/intel64/lib/release"
        [sites.gadi.site_specific]
            tasks_per_node = 48
            # Gadi has 192 GB of memory per node, so it cannot hurt to request
            # a decent amount here. NCI starts charging more when the memory per
            # rank exceeds 4 GB, as it then considers it a memory-dominated request.
            memory_per_rank_in_mb = 2048
            # Normal queue for most things, express for init-site.
            queue = "normal"
            debug_queue = "express"
            compute_resources_template = "ncpus={RANKS}"
            path_to_pbs_binaries = "/opt/pbs/default/bin"
            # Use the mpirun from the intel-mpi module.
            replace_pbsrun_with = "mpirun"
            # Salvus requires an ABI compatible MPI.
            modules_to_load = ["intel-mpi/2019.6.166"]
            # Storage and project have to be explicitly passed. If you run Salvus
            # elsewhere, you might need to add other folders here.
            [[sites.gadi.site_specific.additional_qsub_arguments]]
                name = "l"
                value = "storage=scratch/_PROJECT_"
            [[sites.gadi.site_specific.additional_qsub_arguments]]
                name = "P"
                value = "_PROJECT_"

LSF Example Sites

Euler, ETHZ, Zurich, Switzerland

https://scicomp.ethz.ch/wiki/Euler

[sites.euler]
    # Euler uses IBM's LSF system.
    site_type = "lsf"
    # Node size.
    default_ranks = 24
    # Specify this depending on your allowance on Euler.
    max_ranks = 240
    # This is just the default place at which the downloader will put Salvus.
    salvus_binary = "/cluster/home/{USER}/Salvus/bin/salvus"
    # As always, make sure these directories are fine for you. You might have
    # access to more suitable folders on Euler, depending on your project, but
    # every Euler user has access to these.
    run_directory = "/cluster/home/{USER}/salvus_flow_run"
    tmp_directory = "/cluster/scratch/{USER}/salvus_flow_temp"
    # Compute nodes don't have internet access, so use license tokens.
    use_license_tokens = true
    [sites.euler.ssh_settings]
        hostname = "euler.ethz.ch"
        username = "{USER}"
    [sites.euler.site_specific]
        # First load the `new` module which gives access to a lot more modules.
        # Then load Intel's MPI which is an ABI compatible MPI that works with
        # Salvus.
        modules_to_load = ['new', 'impi']
        # `fullnode` is a special resource specifier on Euler which grants
        # exclusive access to a single node. If not given, jobs from other
        # users might run on the same node.
        #
        # This configuration only adds it for jobs with 18 or more ranks.
        [[sites.euler.site_specific.additional_bsub_argument]]
            name = "R"
            value = "fullnode"
            condition = "ranks >= 18"
        # Euler by default only grants 1024 MB of RAM per rank. This setting
        # doubles that.
        [[sites.euler.site_specific.additional_bsub_argument]]
            name = "R"
            value = "rusage[mem=2048]"

GLIC, GFZ, Potsdam, Germany

The German Research Centre for Geosciences (GFZ) maintains a high-performance compute cluster for its staff. This Salvus configuration has been used on it.

Please note that this example uses the Mondaic-provided MPI, so make sure to also download it when running the Mondaic downloader.

[sites.glic]
    site_type = "lsf"
    default_ranks = 12
    max_ranks = 120
    # Path to the downloaded binary.
    salvus_binary = "/home/{USER}/Salvus/bin/salvus"
    run_directory = "/home/{USER}/salvus_data/run"
    tmp_directory = "/home/{USER}/salvus_data/tmp"
    # The LSF binaries are not on the default PATH, so it is set here.
    # The actual location is likely to change in the future.
    [[sites.glic.environment_variable]]
        name = "PATH"
        value = "/lsf/10.1/linux3.10-glibc2.17-x86_64/bin:$PATH"
    # LSF requires a few environment variables that are not set when not
    # logging in via an interactive shell, so they are set here. Please note
    # that the actual locations might also change over time. To find the
    # current values, just log in to GLIC and query them.
    [[sites.glic.environment_variable]]
        name = "LSF_ENVDIR"
        value = "/lsf/conf"
    [[sites.glic.environment_variable]]
        name = "LSF_SERVERDIR"
        value = "/lsf/10.1/linux3.10-glibc2.17-x86_64/etc"
    # This example uses the Mondaic-provided MPI, so point to its libraries.
    [[sites.glic.environment_variable]]
        name = "LD_LIBRARY_PATH"
        value = "/home/{USER}/Salvus/lib"
    # SSH settings.
    [sites.glic.ssh_settings]
        hostname = "{HOSTNAME}"
        username = "{USER}"
    # LSF specific settings.
    [sites.glic.site_specific]
        # Use the Mondaic-provided MPI as it seems to work fine here.
        replace_mpirun_with = "/home/{USER}/Salvus/bin/mpirun"
        # Use your group's queue.
        queue = "{QUEUE}"
        # Load the latest Intel module and tell Salvus to load the module in
        # the job submission file rather than beforehand.
        modules_to_load = ["intel/intel-2019u3"]
        change_modules_in_batch_file = true