|
|
@@ -0,0 +1,397 @@ |
|
|
|
#include <float.h> |
|
|
|
#include <stdio.h> |
|
|
|
#include <stdlib.h> |
|
|
|
#include <ctype.h> |
|
|
|
#include <string.h> |
|
|
|
|
|
|
|
void exit_with_help() |
|
|
|
{ |
|
|
|
printf( |
|
|
|
"Usage: svm-scale [options] data_filename\n" |
|
|
|
"options:\n" |
|
|
|
"-l lower : x scaling lower limit (default -1)\n" |
|
|
|
"-u upper : x scaling upper limit (default +1)\n" |
|
|
|
"-y y_lower y_upper : y scaling limits (default: no y scaling)\n" |
|
|
|
"-s save_filename : save scaling parameters to save_filename\n" |
|
|
|
"-r restore_filename : restore scaling parameters from restore_filename\n" |
|
|
|
); |
|
|
|
exit(1); |
|
|
|
} |
|
|
|
|
|
|
|
char *line = NULL; |
|
|
|
int max_line_len = 1024; |
|
|
|
double lower=-1.0,upper=1.0,y_lower,y_upper; |
|
|
|
int y_scaling = 0; |
|
|
|
double *feature_max; |
|
|
|
double *feature_min; |
|
|
|
double y_max = -DBL_MAX; |
|
|
|
double y_min = DBL_MAX; |
|
|
|
int max_index; |
|
|
|
int min_index; |
|
|
|
long int num_nonzeros = 0; |
|
|
|
long int new_num_nonzeros = 0; |
|
|
|
|
|
|
|
#define max(x,y) (((x)>(y))?(x):(y)) |
|
|
|
#define min(x,y) (((x)<(y))?(x):(y)) |
|
|
|
|
|
|
|
void output_target(double value); |
|
|
|
void output(int index, double value); |
|
|
|
char* readline(FILE *input); |
|
|
|
int clean_up(FILE *fp_restore, FILE *fp, const char *msg); |
|
|
|
|
|
|
|
int main(int argc,char **argv) |
|
|
|
{ |
|
|
|
int i,index; |
|
|
|
FILE *fp, *fp_restore = NULL; |
|
|
|
char *save_filename = NULL; |
|
|
|
char *restore_filename = NULL; |
|
|
|
|
|
|
|
for(i=1;i<argc;i++) |
|
|
|
{ |
|
|
|
if(argv[i][0] != '-') break; |
|
|
|
++i; |
|
|
|
switch(argv[i-1][1]) |
|
|
|
{ |
|
|
|
case 'l': lower = atof(argv[i]); break; |
|
|
|
case 'u': upper = atof(argv[i]); break; |
|
|
|
case 'y': |
|
|
|
y_lower = atof(argv[i]); |
|
|
|
++i; |
|
|
|
y_upper = atof(argv[i]); |
|
|
|
y_scaling = 1; |
|
|
|
break; |
|
|
|
case 's': save_filename = argv[i]; break; |
|
|
|
case 'r': restore_filename = argv[i]; break; |
|
|
|
default: |
|
|
|
fprintf(stderr,"unknown option\n"); |
|
|
|
exit_with_help(); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if(!(upper > lower) || (y_scaling && !(y_upper > y_lower))) |
|
|
|
{ |
|
|
|
fprintf(stderr,"inconsistent lower/upper specification\n"); |
|
|
|
exit(1); |
|
|
|
} |
|
|
|
|
|
|
|
if(restore_filename && save_filename) |
|
|
|
{ |
|
|
|
fprintf(stderr,"cannot use -r and -s simultaneously\n"); |
|
|
|
exit(1); |
|
|
|
} |
|
|
|
|
|
|
|
if(argc != i+1) |
|
|
|
exit_with_help(); |
|
|
|
|
|
|
|
fp=fopen(argv[i],"r"); |
|
|
|
|
|
|
|
if(fp==NULL) |
|
|
|
{ |
|
|
|
fprintf(stderr,"can't open file %s\n", argv[i]); |
|
|
|
exit(1); |
|
|
|
} |
|
|
|
|
|
|
|
line = (char *) malloc(max_line_len*sizeof(char)); |
|
|
|
|
|
|
|
#define SKIP_TARGET\ |
|
|
|
while(isspace(*p)) ++p;\ |
|
|
|
while(!isspace(*p)) ++p; |
|
|
|
|
|
|
|
#define SKIP_ELEMENT\ |
|
|
|
while(*p!=':') ++p;\ |
|
|
|
++p;\ |
|
|
|
while(isspace(*p)) ++p;\ |
|
|
|
while(*p && !isspace(*p)) ++p; |
|
|
|
|
|
|
|
/* assumption: min index of attributes is 1 */ |
|
|
|
/* pass 1: find out max index of attributes */ |
|
|
|
max_index = 0; |
|
|
|
min_index = 1; |
|
|
|
|
|
|
|
if(restore_filename) |
|
|
|
{ |
|
|
|
int idx, c; |
|
|
|
|
|
|
|
fp_restore = fopen(restore_filename,"r"); |
|
|
|
if(fp_restore==NULL) |
|
|
|
{ |
|
|
|
fprintf(stderr,"can't open file %s\n", restore_filename); |
|
|
|
exit(1); |
|
|
|
} |
|
|
|
|
|
|
|
c = fgetc(fp_restore); |
|
|
|
if(c == 'y') |
|
|
|
{ |
|
|
|
readline(fp_restore); |
|
|
|
readline(fp_restore); |
|
|
|
readline(fp_restore); |
|
|
|
} |
|
|
|
readline(fp_restore); |
|
|
|
readline(fp_restore); |
|
|
|
|
|
|
|
while(fscanf(fp_restore,"%d %*f %*f\n",&idx) == 1) |
|
|
|
max_index = max(idx,max_index); |
|
|
|
rewind(fp_restore); |
|
|
|
} |
|
|
|
|
|
|
|
while(readline(fp)!=NULL) |
|
|
|
{ |
|
|
|
char *p=line; |
|
|
|
|
|
|
|
SKIP_TARGET |
|
|
|
|
|
|
|
while(sscanf(p,"%d:%*f",&index)==1) |
|
|
|
{ |
|
|
|
max_index = max(max_index, index); |
|
|
|
min_index = min(min_index, index); |
|
|
|
SKIP_ELEMENT |
|
|
|
num_nonzeros++; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if(min_index < 1) |
|
|
|
fprintf(stderr, |
|
|
|
"WARNING: minimal feature index is %d, but indices should start from 1\n", min_index); |
|
|
|
|
|
|
|
rewind(fp); |
|
|
|
|
|
|
|
feature_max = (double *)malloc((max_index+1)* sizeof(double)); |
|
|
|
feature_min = (double *)malloc((max_index+1)* sizeof(double)); |
|
|
|
|
|
|
|
if(feature_max == NULL || feature_min == NULL) |
|
|
|
{ |
|
|
|
fprintf(stderr,"can't allocate enough memory\n"); |
|
|
|
exit(1); |
|
|
|
} |
|
|
|
|
|
|
|
for(i=0;i<=max_index;i++) |
|
|
|
{ |
|
|
|
feature_max[i]=-DBL_MAX; |
|
|
|
feature_min[i]=DBL_MAX; |
|
|
|
} |
|
|
|
|
|
|
|
/* pass 2: find out min/max value */ |
|
|
|
while(readline(fp)!=NULL) |
|
|
|
{ |
|
|
|
char *p=line; |
|
|
|
int next_index=1; |
|
|
|
double target; |
|
|
|
double value; |
|
|
|
|
|
|
|
if (sscanf(p,"%lf",&target) != 1) |
|
|
|
return clean_up(fp_restore, fp, "ERROR: failed to read labels\n"); |
|
|
|
y_max = max(y_max,target); |
|
|
|
y_min = min(y_min,target); |
|
|
|
|
|
|
|
SKIP_TARGET |
|
|
|
|
|
|
|
while(sscanf(p,"%d:%lf",&index,&value)==2) |
|
|
|
{ |
|
|
|
for(i=next_index;i<index;i++) |
|
|
|
{ |
|
|
|
feature_max[i]=max(feature_max[i],0); |
|
|
|
feature_min[i]=min(feature_min[i],0); |
|
|
|
} |
|
|
|
|
|
|
|
feature_max[index]=max(feature_max[index],value); |
|
|
|
feature_min[index]=min(feature_min[index],value); |
|
|
|
|
|
|
|
SKIP_ELEMENT |
|
|
|
next_index=index+1; |
|
|
|
} |
|
|
|
|
|
|
|
for(i=next_index;i<=max_index;i++) |
|
|
|
{ |
|
|
|
feature_max[i]=max(feature_max[i],0); |
|
|
|
feature_min[i]=min(feature_min[i],0); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
rewind(fp); |
|
|
|
|
|
|
|
/* pass 2.5: save/restore feature_min/feature_max */ |
|
|
|
|
|
|
|
if(restore_filename) |
|
|
|
{ |
|
|
|
/* fp_restore rewinded in finding max_index */ |
|
|
|
int idx, c; |
|
|
|
double fmin, fmax; |
|
|
|
int next_index = 1; |
|
|
|
|
|
|
|
if((c = fgetc(fp_restore)) == 'y') |
|
|
|
{ |
|
|
|
if(fscanf(fp_restore, "%lf %lf\n", &y_lower, &y_upper) != 2 || |
|
|
|
fscanf(fp_restore, "%lf %lf\n", &y_min, &y_max) != 2) |
|
|
|
return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n"); |
|
|
|
y_scaling = 1; |
|
|
|
} |
|
|
|
else |
|
|
|
ungetc(c, fp_restore); |
|
|
|
|
|
|
|
if (fgetc(fp_restore) == 'x') |
|
|
|
{ |
|
|
|
if(fscanf(fp_restore, "%lf %lf\n", &lower, &upper) != 2) |
|
|
|
return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n"); |
|
|
|
while(fscanf(fp_restore,"%d %lf %lf\n",&idx,&fmin,&fmax)==3) |
|
|
|
{ |
|
|
|
for(i = next_index;i<idx;i++) |
|
|
|
if(feature_min[i] != feature_max[i]) |
|
|
|
fprintf(stderr, |
|
|
|
"WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s.\n", |
|
|
|
i, argv[argc-1], restore_filename); |
|
|
|
|
|
|
|
feature_min[idx] = fmin; |
|
|
|
feature_max[idx] = fmax; |
|
|
|
|
|
|
|
next_index = idx + 1; |
|
|
|
} |
|
|
|
|
|
|
|
for(i=next_index;i<=max_index;i++) |
|
|
|
if(feature_min[i] != feature_max[i]) |
|
|
|
fprintf(stderr, |
|
|
|
"WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s.\n", |
|
|
|
i, argv[argc-1], restore_filename); |
|
|
|
} |
|
|
|
fclose(fp_restore); |
|
|
|
} |
|
|
|
|
|
|
|
if(save_filename) |
|
|
|
{ |
|
|
|
FILE *fp_save = fopen(save_filename,"w"); |
|
|
|
if(fp_save==NULL) |
|
|
|
{ |
|
|
|
fprintf(stderr,"can't open file %s\n", save_filename); |
|
|
|
exit(1); |
|
|
|
} |
|
|
|
if(y_scaling) |
|
|
|
{ |
|
|
|
fprintf(fp_save, "y\n"); |
|
|
|
fprintf(fp_save, "%.16g %.16g\n", y_lower, y_upper); |
|
|
|
fprintf(fp_save, "%.16g %.16g\n", y_min, y_max); |
|
|
|
} |
|
|
|
fprintf(fp_save, "x\n"); |
|
|
|
fprintf(fp_save, "%.16g %.16g\n", lower, upper); |
|
|
|
for(i=1;i<=max_index;i++) |
|
|
|
{ |
|
|
|
if(feature_min[i]!=feature_max[i]) |
|
|
|
fprintf(fp_save,"%d %.16g %.16g\n",i,feature_min[i],feature_max[i]); |
|
|
|
} |
|
|
|
|
|
|
|
if(min_index < 1) |
|
|
|
fprintf(stderr, |
|
|
|
"WARNING: scaling factors with indices smaller than 1 are not stored to the file %s.\n", save_filename); |
|
|
|
|
|
|
|
fclose(fp_save); |
|
|
|
} |
|
|
|
|
|
|
|
/* pass 3: scale */ |
|
|
|
while(readline(fp)!=NULL) |
|
|
|
{ |
|
|
|
char *p=line; |
|
|
|
int next_index=1; |
|
|
|
double target; |
|
|
|
double value; |
|
|
|
|
|
|
|
if (sscanf(p,"%lf",&target) != 1) |
|
|
|
return clean_up(NULL, fp, "ERROR: failed to read labels\n"); |
|
|
|
output_target(target); |
|
|
|
|
|
|
|
SKIP_TARGET |
|
|
|
|
|
|
|
while(sscanf(p,"%d:%lf",&index,&value)==2) |
|
|
|
{ |
|
|
|
for(i=next_index;i<index;i++) |
|
|
|
output(i,0); |
|
|
|
|
|
|
|
output(index,value); |
|
|
|
|
|
|
|
SKIP_ELEMENT |
|
|
|
next_index=index+1; |
|
|
|
} |
|
|
|
|
|
|
|
for(i=next_index;i<=max_index;i++) |
|
|
|
output(i,0); |
|
|
|
|
|
|
|
printf("\n"); |
|
|
|
} |
|
|
|
|
|
|
|
if (new_num_nonzeros > num_nonzeros) |
|
|
|
fprintf(stderr, |
|
|
|
"WARNING: original #nonzeros %ld\n" |
|
|
|
" > new #nonzeros %ld\n" |
|
|
|
"If feature values are non-negative and sparse, use -l 0 rather than the default -l -1\n", |
|
|
|
num_nonzeros, new_num_nonzeros); |
|
|
|
|
|
|
|
free(line); |
|
|
|
free(feature_max); |
|
|
|
free(feature_min); |
|
|
|
fclose(fp); |
|
|
|
return 0; |
|
|
|
} |
|
|
|
|
|
|
|
char* readline(FILE *input) |
|
|
|
{ |
|
|
|
int len; |
|
|
|
|
|
|
|
if(fgets(line,max_line_len,input) == NULL) |
|
|
|
return NULL; |
|
|
|
|
|
|
|
while(strrchr(line,'\n') == NULL) |
|
|
|
{ |
|
|
|
max_line_len *= 2; |
|
|
|
line = (char *) realloc(line, max_line_len); |
|
|
|
len = (int) strlen(line); |
|
|
|
if(fgets(line+len,max_line_len-len,input) == NULL) |
|
|
|
break; |
|
|
|
} |
|
|
|
return line; |
|
|
|
} |
|
|
|
|
|
|
|
void output_target(double value) |
|
|
|
{ |
|
|
|
if(y_scaling) |
|
|
|
{ |
|
|
|
if(value == y_min) |
|
|
|
value = y_lower; |
|
|
|
else if(value == y_max) |
|
|
|
value = y_upper; |
|
|
|
else value = y_lower + (y_upper-y_lower) * |
|
|
|
(value - y_min)/(y_max-y_min); |
|
|
|
} |
|
|
|
printf("%g ",value); |
|
|
|
} |
|
|
|
|
|
|
|
void output(int index, double value) |
|
|
|
{ |
|
|
|
/* skip single-valued attribute */ |
|
|
|
if(feature_max[index] == feature_min[index]) |
|
|
|
return; |
|
|
|
|
|
|
|
if(value == feature_min[index]) |
|
|
|
value = lower; |
|
|
|
else if(value == feature_max[index]) |
|
|
|
value = upper; |
|
|
|
else |
|
|
|
value = lower + (upper-lower) * |
|
|
|
(value-feature_min[index])/ |
|
|
|
(feature_max[index]-feature_min[index]); |
|
|
|
|
|
|
|
if(value != 0) |
|
|
|
{ |
|
|
|
printf("%d:%g ",index, value); |
|
|
|
new_num_nonzeros++; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
int clean_up(FILE *fp_restore, FILE *fp, const char* msg) |
|
|
|
{ |
|
|
|
fprintf(stderr, "%s", msg); |
|
|
|
free(line); |
|
|
|
free(feature_max); |
|
|
|
free(feature_min); |
|
|
|
fclose(fp); |
|
|
|
if (fp_restore) |
|
|
|
fclose(fp_restore); |
|
|
|
return -1; |
|
|
|
} |
|
|
|
|