一、数据预处理

```python
import pandas as pd

df = pd.read_csv('file_name.csv')
df = pd.read_excel('path', sheet_name='sheet1', header=0, names=['第一列', '第二列', '第三列'])
data = pd.read_csv('path', sep=',', header=0, names=['第一列', '第二列', '第三列'], encoding='utf-8')
data = pd.read_table('path', sep='\t', header=None, names=['第一列', '第二列', '第三列'])
```
 
数据清洗: 删除重复:

```python
df = df.drop_duplicates()  # 删除重复行
df = df.dropna()           # 删除含缺失值的行
df = df.dropna(axis=1)     # 删除含缺失值的列
```
 
填充丢失值:

```python
df = df.fillna(df.mean())
df = df.fillna(df.median())
df = df.fillna(df.mode().iloc[0])
df = df.fillna(method='ffill')
df = df.fillna(method='bfill')
```
 
数据转换: 归一化&标准化: 调整特征的数值范围[0,1],或者正态分布(均值0,标准差1)
```python
from sklearn.preprocessing import MinMaxScaler, StandardScaler

data = {'ages': [20, 30, 40, 50]}
df = pd.DataFrame(data)

min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

df['ages_normalized'] = min_max_scaler.fit_transform(df[['ages']])
df['ages_standardized'] = standard_scaler.fit_transform(df[['ages']])
```
 
独热编码: 分类变量→数字
```python
data = {'颜色': ['红色', '绿色', '蓝色', '红色', '绿色', '蓝色']}
df = pd.DataFrame(data)

df = pd.get_dummies(df, columns=['颜色'])
```
 
读取文件目录:

```python
import os
import numpy as np

base_path = "TIMI/TEST/"

with open("test.scp", 'wt', encoding='utf-8') as f:
    for root, dirs, files in os.walk(base_path):
        for file in files:
            file_name = os.path.join(root, file)

            if file_name.endswith(".WAV"):
                print(file_name)
                f.write("%s\n" % file_name)
```