Description: Fix CVE-2018-10995
 Fix a security issue caused by the mishandling of user names (user_name
 fields) and group IDs (gid fields). This patch was adapted from the
 changes made in the upstream 17.02 branch.
Author: Gennaro Oliva <oliva.g@na.icar.cnr.it>
Bug-Debian: https://bugs.debian.org/900548
Origin: https://github.com/SchedMD/slurm/commit/df545955e4f119974c278bff0c47155257d5afc7
Last-Update: 2018-06-14

--- a/slurm/slurm_errno.h
+++ b/slurm/slurm_errno.h
@@ -191,6 +191,8 @@
 	ESLURM_BURST_BUFFER_WAIT =			2100,
 	ESLURM_PARTITION_DOWN,
 
+	ESLURM_GROUP_ID_MISSING =			2113,
+
 	/* switch specific error codes, specific values defined in plugin module */
 	ESLURM_SWITCH_MIN = 3000,
 	ESLURM_SWITCH_MAX = 3099,
--- a/src/api/step_launch.c
+++ b/src/api/step_launch.c
@@ -106,24 +106,7 @@
 static uid_t  slurm_uid;
 static bool   force_terminated_job = false;
 static int    task_exit_signal = 0;
-#ifdef HAVE_NATIVE_CRAY
-/* On a Cray we need to validate the gid
- * before the launch of the tasks.  Since a native
- * Cray really isn't a cluster but a distributed system this should
- * be ok.
- * This could be hacked by a user, but the only damage they
- * could really do is set SLURM_USER_NAME to be something
- * other than the actual name.  Running any getpwXXX commands
- * on a cray compute node is not scalable and could
- * potentially cause all sorts of issues and timeouts when
- * talking with LDAP or NIS when done on the compute node.  We
- * have not seen this issue on a regular cluster, so we do
- * the validating there instead when not on a Cray.
- */
-static bool   validate_gid = true;
-#else
 static bool   validate_gid = false;
-#endif
 static void _exec_prog(slurm_msg_t *msg);
 static int  _msg_thr_create(struct step_launch_state *sls, int num_nodes);
 static void _handle_msg(void *arg, slurm_msg_t *msg);
--- a/src/common/slurm_errno.c
+++ b/src/common/slurm_errno.c
@@ -329,6 +329,8 @@
 	  "Waiting for burst buffer"				},
 	{ ESLURM_PARTITION_DOWN,
 	  "Partition in DOWN state"				},
+	{ ESLURM_GROUP_ID_MISSING,
+	  "Invalid group id"					},
 
 	/* slurmd error codes */
 	{ ESLRUMD_PIPE_ERROR_ON_TASK_SPAWN,
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -7970,7 +7970,7 @@
 	}
 	if ( job_desc_msg->group_id == NO_VAL ) {
 		debug("_validate_job_desc: job failed to specify group");
-		job_desc_msg->group_id = 0;	/* uses user default */
+		return ESLURM_GROUP_ID_MISSING;
 	}
 	if (job_desc_msg->contiguous == (uint16_t) NO_VAL)
 		job_desc_msg->contiguous = 0;
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -75,6 +75,7 @@
 #include "src/common/slurm_protocol_interface.h"
 #include "src/common/slurm_topology.h"
 #include "src/common/switch.h"
+#include "src/common/uid.h"
 #include "src/common/xstring.h"
 #include "src/common/layouts_mgr.h"
 
@@ -973,6 +974,8 @@
 		READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK };
 	uid_t uid = g_slurm_auth_get_uid(msg->auth_cred,
 					 slurmctld_config.auth_info);
+	gid_t gid = g_slurm_auth_get_gid(msg->auth_cred,
+					 slurmctld_config.auth_info);
 	int immediate = job_desc_msg->immediate;
 	bool do_unlock = false;
 	bool job_waiting = false;
@@ -992,6 +995,18 @@
 		error("Security violation, RESOURCE_ALLOCATE from uid=%d",
 		      uid);
 	}
+	if ((gid != job_desc_msg->group_id) && (!validate_super_user(uid))) {
+		char *user_name = NULL;
+		/* check if it is a valid extended gid instead */
+		if (!slurm_valid_uid_gid(job_desc_msg->user_id,
+					 &job_desc_msg->group_id,
+					 &user_name, false, true)) {
+			error_code = ESLURM_GROUP_ID_MISSING;
+			error("Security violation, RESOURCE_ALLOCATE from uid=%u who is not in gid=%u",
+			      job_desc_msg->user_id, job_desc_msg->group_id);
+		}
+		xfree(user_name);
+	}
 	debug2("sched: Processing RPC: REQUEST_RESOURCE_ALLOCATION from uid=%d",
 	       uid);
 
@@ -3302,6 +3317,8 @@
 		NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
 	uid_t uid = g_slurm_auth_get_uid(msg->auth_cred,
 					 slurmctld_config.auth_info);
+	gid_t gid = g_slurm_auth_get_gid(msg->auth_cred,
+					 slurmctld_config.auth_info);
 	char *err_msg = NULL;
 	bool reject_job = false;
 
@@ -3318,6 +3335,18 @@
 		error_code = ESLURM_USER_ID_MISSING;
 		error("Security violation, SUBMIT_JOB from uid=%d", uid);
 	}
+	if ((gid != job_desc_msg->group_id) && (!validate_super_user(uid))) {
+		char *user_name = NULL;
+		/* check if it is a valid extended gid instead */
+		if (!slurm_valid_uid_gid(job_desc_msg->user_id,
+					 &job_desc_msg->group_id,
+					 &user_name, false, true)) {
+			error_code = ESLURM_GROUP_ID_MISSING;
+			error("Security violation, SUBMIT_JOB from uid=%u who is not in gid=%u",
+			      job_desc_msg->user_id, job_desc_msg->group_id);
+		}
+		xfree(user_name);
+	}
 	if ((job_desc_msg->alloc_node == NULL) ||
 	    (job_desc_msg->alloc_node[0] == '\0')) {
 		error_code = ESLURM_INVALID_NODE_NAME;
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -482,7 +482,6 @@
 	int len = 0;
 	Buf buffer = NULL;
 	slurm_msg_t msg;
-	uid_t uid = (uid_t)-1;
 	gid_t gid = (uid_t)-1;
 	gids_t *gids = NULL;
 
@@ -491,8 +490,6 @@
 	char *parent_alias = NULL;
 	char *user_name = NULL;
 	slurm_addr_t parent_addr = {0};
-	char pwd_buffer[PW_BUF_SIZE];
-	struct passwd pwd, *pwd_result;
 
 	slurm_msg_t_init(&msg);
 	/* send type over to slurmstepd */
@@ -617,18 +614,11 @@
 	switch(type) {
 	case LAUNCH_BATCH_JOB:
 		gid = (uid_t)((batch_job_launch_msg_t *)req)->gid;
-		uid = (uid_t)((batch_job_launch_msg_t *)req)->uid;
 		user_name = ((batch_job_launch_msg_t *)req)->user_name;
 		msg.msg_type = REQUEST_BATCH_JOB_LAUNCH;
 		break;
 	case LAUNCH_TASKS:
-		/*
-		 * The validity of req->uid was verified against the
-		 * auth credential in _rpc_launch_tasks().  req->gid
-		 * has NOT yet been checked!
-		 */
 		gid = (uid_t)((launch_tasks_request_msg_t *)req)->gid;
-		uid = (uid_t)((launch_tasks_request_msg_t *)req)->uid;
 		user_name = ((launch_tasks_request_msg_t *)req)->user_name;
 		msg.msg_type = REQUEST_LAUNCH_TASKS;
 		break;
@@ -655,44 +645,6 @@
 	free_buf(buffer);
 	buffer = NULL;
 
-#ifdef HAVE_NATIVE_CRAY
-	/* Try to avoid calling this on a system which is a native
-	 * cray.  getpwuid_r is slow on the compute nodes and this has
-	 * in theory been verified earlier.
-	 */
-	if (!user_name) {
-#endif
-		/* send cached group ids array for the relevant uid */
-		debug3("_send_slurmstepd_init: call to getpwuid_r");
-		if (slurm_getpwuid_r(uid, &pwd, pwd_buffer, PW_BUF_SIZE,
-				     &pwd_result) || (pwd_result == NULL)) {
-			error("%s: getpwuid_r: %m", __func__);
-			len = 0;
-			safe_write(fd, &len, sizeof(int));
-			errno = ESLURMD_UID_NOT_FOUND;
-			return errno;
-		}
-		debug3("%s: return from getpwuid_r", __func__);
-		if (gid != pwd_result->pw_gid) {
-			debug("%s: Changing gid from %d to %d",
-			      __func__, gid, pwd_result->pw_gid);
-		}
-		gid = pwd_result->pw_gid;
-		if (!user_name)
-			user_name = pwd_result->pw_name;
-#ifdef HAVE_NATIVE_CRAY
-	}
-#endif
-	if (!user_name) {
-		/* Sanity check since gids_cache_lookup will fail
-		 * with a NULL. */
-		error("%s: No user name for %d: %m", __func__, uid);
-		len = 0;
-		safe_write(fd, &len, sizeof(int));
-		errno = ESLURMD_UID_NOT_FOUND;
-		return errno;
-	}
-
 	if ((gids = _gids_cache_lookup(user_name, gid))) {
 		int i;
 		uint32_t tmp32;
@@ -1216,6 +1168,7 @@
 	uint16_t port;
 	char     host[MAXHOSTNAMELEN];
 	uid_t    req_uid;
+	gid_t    req_gid;
 	launch_tasks_request_msg_t *req = msg->data;
 	bool     super_user = false;
 #ifndef HAVE_FRONT_END
@@ -1231,6 +1184,7 @@
 	nodeid = nodelist_find(req->complete_nodelist, conf->node_name);
 #endif
 	req_uid = g_slurm_auth_get_uid(msg->auth_cred, conf->auth_info);
+	req_gid = g_slurm_auth_get_gid(msg->auth_cred, conf->auth_info);
 	memcpy(&req->orig_addr, &msg->orig_addr, sizeof(slurm_addr_t));
 
 	super_user = _slurm_authorized_user(req_uid);
@@ -1242,6 +1196,21 @@
 		goto done;
 	}
 
+	/* cannot trust it, remove and overwrite */
+	xfree(req->user_name);
+	req->user_name = uid_to_string(req->uid);
+
+	if (!super_user && (req_gid != req->gid)) {
+		if (!slurm_valid_uid_gid(req->uid, &req->gid, &req->user_name,
+					 false, true)) {
+			error("%s: user %u does not belong to group %u, "
+			      "rejecting job %u", __func__, req->uid,
+			      req->gid, req->job_id);
+			errnum = ESLURM_GROUP_ID_MISSING;
+			goto done;
+		}
+	}
+
 	slurm_get_ip_str(cli, &port, host, sizeof(host));
 	info("launch task %u.%u request from %u.%u@%s (port %hu)", req->job_id,
 	     req->job_step_id, req->uid, req->gid, host, port);
@@ -1850,6 +1819,7 @@
 	launch_req->tasks_to_launch	= xmalloc(sizeof(uint16_t)
 						  * req->nnodes);
 	launch_req->uid			= req->uid;
+	launch_req->user_name		= req->user_name;
 
 	for (i = 0; i < req->nnodes; i++) {
 		uint32_t *tmp32 = xmalloc(sizeof(uint32_t));
@@ -1903,6 +1873,9 @@
 		return;
 	}
 
+	if (!req->user_name)
+		req->user_name = uid_to_string(req->uid);
+
 	if (slurm_send_rc_msg(msg, rc) < 0) {
 		error("Error starting prolog: %m");
 	}
@@ -2026,6 +1999,9 @@
 
 	task_g_slurmd_batch_request(req->job_id, req);	/* determine task affinity */
 
+	if (!req->user_name)
+		req->user_name = uid_to_string(req->uid);
+
 	slurm_mutex_lock(&prolog_mutex);
 	first_job_run = !slurm_cred_jobid_cached(conf->vctx, req->job_id);
 
@@ -3804,6 +3780,9 @@
 	if ((rc != SLURM_SUCCESS) && !_slurm_authorized_user(key.uid))
 		return rc;
 
+	/* cannot trust, so discard */
+	xfree(req->user_name);
+
 #if 0
 	info("last_block=%u force=%u modes=%o",
 	     req->last_block, req->force, req->modes);
@@ -3947,6 +3926,10 @@
 	pid_t child;
 	file_bcast_info_t *file_info;
 
+	/* cannot trust, discard and overwrite */
+	xfree(req->user_name);
+	req->user_name = uid_to_string(key->uid);
+
 	if (!(gids = _gids_cache_lookup(req->user_name, key->gid))) {
 		error("sbcast: gids_cache_lookup for %s failed", req->user_name);
 		return SLURM_ERROR;
@@ -5570,7 +5553,6 @@
 _build_env(job_env_t *job_env)
 {
 	char **env = xmalloc(sizeof(char *));
-	bool user_name_set = 0;
 
 	env[0]  = NULL;
 	if (!valid_spank_job_env(job_env->spank_job_env,
@@ -5592,17 +5574,13 @@
 	setenvf(&env, "SLURM_JOB_ID", "%u", job_env->jobid);
 	setenvf(&env, "SLURM_JOB_UID",   "%u", job_env->uid);
 
-#ifndef HAVE_NATIVE_CRAY
-	/* uid_to_string on a cray is a heavy call, so try to avoid it */
-	if (!job_env->user_name) {
-		job_env->user_name = uid_to_string(job_env->uid);
-		user_name_set = 1;
+	if (job_env->user_name) {
+		setenvf(&env, "SLURM_JOB_USER", "%s", job_env->user_name);
+	} else {
+		char *user_name = uid_to_string(job_env->uid);
+		setenvf(&env, "SLURM_JOB_USER", "%s", user_name);
+		xfree(user_name);
 	}
-#endif
-
-	setenvf(&env, "SLURM_JOB_USER", "%s", job_env->user_name);
-	if (user_name_set)
-		xfree(job_env->user_name);
 
 	setenvf(&env, "SLURM_JOBID", "%u", job_env->jobid);
 	setenvf(&env, "SLURM_UID",   "%u", job_env->uid);
--- a/src/slurmd/slurmstepd/slurmstepd_job.c
+++ b/src/slurmd/slurmstepd/slurmstepd_job.c
@@ -70,12 +70,6 @@
 #include "src/slurmd/slurmstepd/multi_prog.h"
 #include "src/slurmd/slurmstepd/slurmstepd_job.h"
 
-#ifdef HAVE_NATIVE_CRAY
-static bool already_validated_uid = true;
-#else
-static bool already_validated_uid = false;
-#endif
-
 static char ** _array_copy(int n, char **src);
 static void _array_free(char ***array);
 static void _srun_info_destructor(void *arg);
@@ -245,10 +239,6 @@
 	xassert(msg->complete_nodelist != NULL);
 	debug3("entering stepd_step_rec_create");
 
-	if (!slurm_valid_uid_gid((uid_t)msg->uid, &(msg->gid),
-				 &(msg->user_name), already_validated_uid, 1))
-		return NULL;
-
 	if (acct_gather_check_acct_freq_task(msg->job_mem_lim, msg->acctg_freq))
 		return NULL;
 
@@ -445,10 +435,6 @@
 
 	debug3("entering batch_stepd_step_rec_create");
 
-	if (!slurm_valid_uid_gid((uid_t)msg->uid, &(msg->gid),
-				 &(msg->user_name), already_validated_uid, 1))
-		return NULL;
-
 	if (acct_gather_check_acct_freq_task(msg->job_mem, msg->acctg_freq))
 		return NULL;
 
