def setup(port=23344):
    """Setup distributed settings of slurm.

    Reads the process layout from the ``SLURM_*`` environment variables and
    builds a ``tcp://host:port`` init address pointing at the first node of
    the step's node list.

    Args:
        port (int, optional): The port of the primary server. It is
            incremented by 1 for as long as ``is_port_in_use`` reports the
            current value as in use on the primary host.

    Returns:
        tuple: ``(rank, local_rank, world_size, host_addr)`` where the first
        three are ints and ``host_addr`` is the init-method address string.
        ``(0, 0, 0, "")`` is returned when any of the required ``SLURM_*``
        environment variables is missing.

    Raises:
        ValueError: If a ``SLURM_*`` count is not an integer, or if the
            primary host cannot be derived from ``SLURM_STEP_NODELIST``.
    """
    # Keep the try body limited to the environment lookups so that a KeyError
    # raised anywhere else is not mistaken for "variables not set".
    try:
        rank = int(os.environ['SLURM_PROCID'])
        local_rank = int(os.environ['SLURM_LOCALID'])
        world_size = int(os.environ['SLURM_NTASKS'])
        node_list = os.environ['SLURM_STEP_NODELIST']
    except KeyError:
        # At least one required SLURM_* variable is absent.
        return 0, 0, 0, ""

    host = get_ip(node_list)
    if host is None:
        # get_ip could not parse the node list; fail with a clear message
        # instead of an opaque TypeError from the socket layer below.
        raise ValueError(
            'Cannot determine the primary host from SLURM_STEP_NODELIST=%r'
            % node_list)

    # Walk upwards from the requested port until one is not in use.
    while is_port_in_use(host, port):
        port += 1

    host_addr = 'tcp://' + host + ':' + str(port)
    return rank, local_rank, world_size, host_addr
def get_ip(node_list):
    """Get the name of the first node in a node list.

    Args:
        node_list (str): Name of the nodes. Either plain names or names with
            a bracketed index expression, optionally comma-separated, e.g.
            ``"node1"``, ``"node[01-03]"``, ``"node[1,3,5]"`` or
            ``"a1,b[2-3]"``.

    Returns:
        str or None: The first node in the nodes. ``None`` when the first
        entry contains a bracket expression that cannot be parsed.
    """
    # Only the first entry is of interest. Commas inside ``[...]`` separate
    # indices of the same entry, so split only on a comma at bracket depth 0.
    first = node_list
    depth = 0
    for idx, char in enumerate(node_list):
        if char == '[':
            depth += 1
        elif char == ']':
            depth -= 1
        elif char == ',' and depth <= 0:
            first = node_list[:idx]
            break

    # A plain name needs no expansion.
    if '[' not in first:
        return first

    # "<base>[<first index><rest of ranges>]" -> "<base><first index>"
    matched = re.search(r'([\w-]*)\[(\d*)[-+,+\d]*\]', first)
    if not matched:
        return None
    base, node = matched.groups()
    return base + node
def is_port_in_use(host, port):
    """Check whether the port is in use.

    Attempts a TCP (IPv4) connection to ``(host, port)`` and treats a
    successful connection as "in use".

    Args:
        host (str): Host address.
        port (int): Port to use.

    Returns:
        bool: ``True`` if the connection attempt succeeded, i.e. the port is
        in use on the host; ``False`` otherwise.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # connect_ex reports connect() failures as an error code rather than
        # raising; 0 means the connection was established.
        status = probe.connect_ex((host, port))
    finally:
        probe.close()
    return status == 0