Source code for pyspark.ml.deepspeed.deepspeed_distributor
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json
import os
import sys
import tempfile
from typing import (
    Union,
    Callable,
    List,
    Dict,
    Optional,
    Any,
)

from pyspark.ml.torch.distributor import TorchDistributor

class DeepspeedTorchDistributor(TorchDistributor):
    # Configuration key handed to the base class as ``_ssl_conf``.
    _DEEPSPEED_SSL_CONF = "deepspeed.spark.distributor.ignoreSsl"

    def __init__(
        self,
        numGpus: int = 1,
        nnodes: int = 1,
        localMode: bool = True,
        useGpu: bool = True,
        deepspeedConfig: Optional[Union[str, Dict[str, Any]]] = None,
    ):
        """
        This class is used to run deepspeed training workloads with spark clusters.
        The user has the option to specify the number of gpus per node
        and the number of nodes (the same as if running from terminal),
        as well as specify a deepspeed configuration file.

        Parameters
        ----------
        numGpus: int
            The number of GPUs to use per node (analogous to num_gpus in deepspeed command).
        nnodes: int
            The number of nodes that should be used for the run.
        localMode: bool
            Whether or not to run the training in a distributed fashion or just locally.
        useGpu: bool
            Boolean flag to determine whether to utilize gpus.
        deepspeedConfig: Union[Dict[str,Any], str] or None:
            The configuration file to be used for launching the deepspeed application.
            If it's a dictionary containing the parameters, then we will create the file.
            If None, deepspeed will fall back to default parameters.

        Examples
        --------
        Run Deepspeed training function on a single node

        >>> def train(learning_rate):
        ...     import deepspeed
        ...     # rest of training function
        ...     return model
        >>> distributor = DeepspeedTorchDistributor(
        ...     numGpus=4,
        ...     nnodes=1,
        ...     useGpu=True,
        ...     localMode=True,
        ...     deepspeedConfig="path/to/config.json")
        >>> output = distributor.run(train, 0.01)

        Run Deepspeed training function on multiple nodes

        >>> distributor = DeepspeedTorchDistributor(
        ...     numGpus=4,
        ...     nnodes=3,
        ...     useGpu=True,
        ...     localMode=False,
        ...     deepspeedConfig="path/to/config.json")
        >>> output = distributor.run(train, 0.01)
        """
        # The total process count is GPUs-per-node times the number of nodes.
        num_processes = numGpus * nnodes
        # NOTE(review): assigned before super().__init__() on purpose; presumably the
        # base class captures instance attributes into ``input_params`` (which
        # _create_torchrun_command later reads "deepspeed_config" from) -- confirm
        # against TorchDistributor before reordering.
        self.deepspeed_config = deepspeedConfig
        super().__init__(
            num_processes,
            localMode,
            useGpu,
            _ssl_conf=DeepspeedTorchDistributor._DEEPSPEED_SSL_CONF,
        )
        # Kept for backward compatibility; nothing in this class reads it.
        self.cleanup_deepspeed_conf = False

    @staticmethod
    def _get_deepspeed_config_path(
        deepspeed_config: Optional[Union[str, Dict[str, Any]]]
    ) -> str:
        """
        Resolve the deepspeed configuration to a path usable on the command line.

        Parameters
        ----------
        deepspeed_config: Union[Dict[str,Any], str] or None
            A path to an existing configuration file, a dictionary of parameters,
            or None.

        Returns
        -------
        str
            ``""`` when the config is None, the path of a newly written ``.json``
            temporary file when the config is a dictionary, otherwise the value
            that was passed in.
        """
        # Empty value means the deepspeed will fall back to default settings.
        if deepspeed_config is None:
            return ""
        if isinstance(deepspeed_config, dict):
            # delete=False: the file has to outlive this ``with`` block so the launched
            # process can read it. The caller that runs the command is responsible
            # for removing it afterwards.
            with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".json") as file:
                json.dump(deepspeed_config, file)
                return file.name
        return deepspeed_config

    @staticmethod
    def _create_torchrun_command(
        input_params: Dict[str, Any], train_path: str, *args: Any
    ) -> List[str]:
        """
        Build the ``torch.distributed.run`` command line for a training script.

        Parameters
        ----------
        input_params: Dict[str, Any]
            Must contain the keys "local_mode", "num_processes" and "deepspeed_config".
        train_path: str
            Path of the training script to launch.
        *args: Any
            Positional arguments for the training script; each is converted with ``str``.

        Returns
        -------
        List[str]
            The command as an argument list. When a config path is available the
            list ends with ``["--deepspeed_config", <path>]``.
        """
        local_mode = input_params["local_mode"]
        num_processes = input_params["num_processes"]
        deepspeed_config = input_params["deepspeed_config"]
        deepspeed_config_path = DeepspeedTorchDistributor._get_deepspeed_config_path(
            deepspeed_config
        )
        torchrun_args, processes_per_node = TorchDistributor._get_torchrun_args(
            local_mode, num_processes
        )
        args_string = list(map(str, args))
        command_to_run = [
            sys.executable,
            "-m",
            "torch.distributed.run",
            *torchrun_args,
            f"--nproc_per_node={processes_per_node}",
            train_path,
            *args_string,
            "--deepspeed",
        ]

        # Don't have the deepspeed_config argument if no path is provided or no parameters set
        if deepspeed_config_path == "":
            return command_to_run
        return command_to_run + ["--deepspeed_config", deepspeed_config_path]

    @staticmethod
    def _run_training_on_pytorch_file(
        input_params: Dict[str, Any], train_path: str, *args: Any, **kwargs: Any
    ) -> None:
        """
        Build the launch command for ``train_path`` and execute it.

        Parameters
        ----------
        input_params: Dict[str, Any]
            Parameters consumed by ``_create_torchrun_command``; an optional
            "log_streaming_client" entry is forwarded to ``_execute_command``.
        train_path: str
            Path of the training script to launch.
        *args: Any
            Positional arguments for the training script.
        **kwargs: Any
            Not supported; must be empty.

        Raises
        ------
        ValueError
            If any keyword arguments are supplied.
        """
        if kwargs:
            raise ValueError(
                "DeepspeedTorchDistributor with pytorch file doesn't support keyword arguments"
            )

        log_streaming_client = input_params.get("log_streaming_client", None)
        training_command = DeepspeedTorchDistributor._create_torchrun_command(
            input_params, train_path, *args
        )
        # A dictionary config was written to a temporary file (delete=False) while the
        # command was built; in that case its path is the last element of the command.
        # A user-supplied path is never treated as temporary and is never removed.
        temp_config_path = (
            training_command[-1] if isinstance(input_params["deepspeed_config"], dict) else None
        )
        try:
            DeepspeedTorchDistributor._execute_command(
                training_command, log_streaming_client=log_streaming_client
            )
        finally:
            # NOTE(review): assumes _execute_command returns only after the launched
            # process has exited, so the config file is no longer needed -- confirm
            # against TorchDistributor._execute_command.
            if temp_config_path is not None:
                try:
                    os.remove(temp_config_path)
                except OSError:
                    # Best-effort cleanup; never mask a training failure with it.
                    pass

    def run(self, train_object: Union[Callable, str], *args: Any, **kwargs: Any) -> Optional[Any]:
        """
        Run ``train_object`` through the distributor.

        Delegates to ``self._run`` with ``_run_training_on_pytorch_file`` as the
        routine used to launch a training file.

        Parameters
        ----------
        train_object: Union[Callable, str]
            Either a training function or the path of a training script.
        *args: Any
            Positional arguments forwarded to the training function or script.
        **kwargs: Any
            Keyword arguments forwarded to ``self._run``.

        Returns
        -------
        Optional[Any]
            Whatever ``self._run`` returns.
        """
        # If the "train_object" is a string, then we assume it's a filepath.
        # Otherwise, we assume it's a function.
        return self._run(
            train_object, DeepspeedTorchDistributor._run_training_on_pytorch_file, *args, **kwargs
        )