class BaseDataset(object):
    """The internal base dataset implementation.

    Subclasses provide the download / preprocess / convert steps; this base
    class stores the simulation settings and drives the sampling,
    filtering and train-test splitting helpers, skipping any step whose
    output folder is already populated.

    Args:
        root (str): The root directory where datasets are stored.
        dataset_name (str): The name of the dataset.
        fraction (float): The fraction of the raw data to use.
        split_type (str): The type of statistical simulation.
            `iid` means independent and identically distributed data.
            `niid` means non-independent and identically distributed data,
            for Femnist and Shakespeare.
            `dir` means using a Dirichlet process to simulate non-iid data,
            for CIFAR-10 and CIFAR-100 datasets.
            `class` means partitioning the dataset by label classes, for
            datasets like CIFAR-10 and CIFAR-100.
        user (bool): Whether to partition the users of the dataset into
            train-test groups. Only applicable to LEAF datasets.
            True means partitioning users into train-test groups.
            False means partitioning each user's samples into train-test
            groups. Must be falsy when ``split_type`` is `iid`.
        iid_user_fraction (float): The fraction of the number of clients
            used when the split_type is `iid`.
        train_test_split (float): The fraction of data for training; the
            rest is for testing. E.g., 0.9 means 90% of data are used for
            training and 10% are used for testing.
        minsample (int): The minimal number of samples in each client.
            It is applicable for LEAF datasets and dir simulation of
            CIFAR-10 and CIFAR-100.
        num_class (int): The number of classes in this dataset.
        num_of_client (int): The targeted number of clients to construct.
        class_per_client (int): The number of classes in each client.
            Only applicable when the split_type is `class`.
        setting_folder (str): Name of the sub-folder of ``root`` that holds
            the data generated for this particular setting.
        seed (int): Random seed, forwarded to the sampling helpers.
        **kwargs: Accepted for call-site compatibility and ignored here.

    Raises:
        AssertionError: If ``split_type`` is `iid` and ``user`` is truthy.
    """

    def __init__(self,
                 root,
                 dataset_name,
                 fraction,
                 split_type,
                 user,
                 iid_user_fraction,
                 train_test_split,
                 minsample,
                 num_class,
                 num_of_client,
                 class_per_client,
                 setting_folder,
                 seed=-1,
                 **kwargs):
        self.base_folder = root
        self.dataset_name = dataset_name
        self.fraction = fraction
        self.split_type = split_type  # iid, niid, dir or class
        self.user = user
        self.iid_user_fraction = iid_user_fraction
        self.train_test_split = train_test_split
        self.minsample = minsample
        self.num_class = num_class
        self.num_of_client = num_of_client
        self.class_per_client = class_per_client
        self.seed = seed

        # Raised explicitly (instead of via ``assert``) so the check is not
        # stripped under ``python -O``; the exception type is kept so
        # existing callers observe the same error.
        if split_type == "iid" and self.user:
            raise AssertionError(
                "user must be False when split_type is 'iid'")

        # Always define ``iid`` so sample_customized() cannot hit an
        # AttributeError for split types other than "iid"/"niid".
        # With "niid", ``user`` may be either True or False.
        self.iid = split_type == "iid"

        self.setting_folder = setting_folder
        self.data_folder = os.path.join(self.base_folder, self.setting_folder)

    # NOTE: the class derives from ``object`` (no ABCMeta metaclass), so
    # ``@abstractmethod`` alone does not block instantiation; the
    # NotImplementedError raised below is the effective guard.

    @abstractmethod
    def download_packaged_dataset_and_extract(self, filename):
        """Download and extract a packaged dataset; subclasses implement it."""
        raise NotImplementedError(
            "download_packaged_dataset_and_extract not implemented")

    @abstractmethod
    def download_raw_file_and_extract(self):
        """Download and extract the raw files; subclasses implement it."""
        raise NotImplementedError(
            "download_raw_file_and_extract not implemented")

    @abstractmethod
    def preprocess(self):
        """Preprocess the raw data; subclasses implement it."""
        raise NotImplementedError("preprocess not implemented")

    @abstractmethod
    def convert_data_to_json(self):
        """Convert the data to JSON; subclasses implement it."""
        raise NotImplementedError("convert_data_to_json not implemented")

    @staticmethod
    def get_setting_folder(dataset,
                           split_type,
                           num_of_client,
                           min_size,
                           class_per_client,
                           fraction,
                           iid_fraction,
                           user_str,
                           train_test_split,
                           alpha=None,
                           weights=None):
        """Build the underscore-joined folder name identifying a setting.

        For CIFAR10 / CIFAR100 the name is built from ``dataset``,
        ``split_type``, ``num_of_client``, ``min_size``,
        ``class_per_client``, ``alpha`` and a 1/0 flag telling whether
        ``weights`` is truthy. For every other dataset it is built from
        ``dataset``, ``split_type``, ``num_of_client``, ``min_size``,
        ``class_per_client``, ``fraction``, ``iid_fraction``, ``user_str``
        and ``train_test_split``.

        Returns:
            str: The setting folder name.
        """
        if dataset in (CIFAR10, CIFAR100):
            return "{}_{}_{}_{}_{}_{}_{}".format(
                dataset, split_type, num_of_client, min_size,
                class_per_client, alpha, 1 if weights else 0)
        return "{}_{}_{}_{}_{}_{}_{}_{}_{}".format(
            dataset, split_type, num_of_client, min_size, class_per_client,
            fraction, iid_fraction, user_str, train_test_split)

    @staticmethod
    def _ensure_dir(path):
        """Create ``path`` (and parents) if it is missing and return it."""
        # exist_ok avoids the check-then-create race of
        # ``if not os.path.exists(path): os.makedirs(path)``.
        os.makedirs(path, exist_ok=True)
        return path

    def setup(self):
        """Run download, preprocessing and JSON conversion, in that order."""
        self.download_raw_file_and_extract()
        self.preprocess()
        self.convert_data_to_json()

    def sample_customized(self):
        """Run ``sample`` unless the ``sampled_data`` folder has content."""
        meta_folder = self._ensure_dir(os.path.join(self.base_folder, "meta"))
        sample_folder = self._ensure_dir(
            os.path.join(self.data_folder, "sampled_data"))
        if not os.listdir(sample_folder):
            sample(self.base_folder, self.data_folder, meta_folder,
                   self.fraction, self.iid, self.iid_user_fraction, self.seed)

    def sample_extreme(self):
        """Run ``extreme`` unless the ``sampled_data`` folder has content."""
        meta_folder = self._ensure_dir(os.path.join(self.base_folder, "meta"))
        sample_folder = self._ensure_dir(
            os.path.join(self.data_folder, "sampled_data"))
        if not os.listdir(sample_folder):
            extreme(self.base_folder, self.data_folder, meta_folder,
                    self.fraction, self.num_class, self.num_of_client,
                    self.class_per_client, self.seed)

    def remove_unqualified_user(self):
        """Run ``remove`` unless the ``rem_user_data`` folder has content."""
        rm_folder = self._ensure_dir(
            os.path.join(self.data_folder, "rem_user_data"))
        if not os.listdir(rm_folder):
            remove(self.data_folder, self.dataset_name, self.minsample)

    def split_train_test_set(self):
        """Run ``split_train_test`` when both ``train`` and ``test`` are empty.

        The ``meta`` folder path is only passed through; unlike the
        sampling methods, it is not created here.
        """
        meta_folder = os.path.join(self.base_folder, "meta")
        train = self._ensure_dir(os.path.join(self.data_folder, "train"))
        test = self._ensure_dir(os.path.join(self.data_folder, "test"))
        if not os.listdir(train) and not os.listdir(test):
            split_train_test(self.data_folder, meta_folder, self.dataset_name,
                             self.user, self.train_test_split, self.seed)

    def sampling(self):
        """Sample, drop unqualified users, then split into train and test.

        `iid` and `niid` use :meth:`sample_customized`; `class` uses
        :meth:`sample_extreme`. Any other split type performs no sampling
        step here but still runs the removal and split steps.
        """
        if self.split_type in ("iid", "niid"):
            self.sample_customized()
        elif self.split_type == "class":
            self.sample_extreme()
        self.remove_unqualified_user()
        self.split_train_test_set()