|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397 |
- #include <float.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <ctype.h>
- #include <string.h>
-
/*
 * Write the command-line usage summary to standard output, then end the
 * process with exit status 1. This function does not return.
 */
void exit_with_help()
{
	static const char usage[] =
		"Usage: svm-scale [options] data_filename\n"
		"options:\n"
		"-l lower : x scaling lower limit (default -1)\n"
		"-u upper : x scaling upper limit (default +1)\n"
		"-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
		"-s save_filename : save scaling parameters to save_filename\n"
		"-r restore_filename : restore scaling parameters from restore_filename\n";

	/* The text contains no conversion specifiers, so it is written verbatim. */
	fputs(usage, stdout);
	exit(1);
}
-
/* Shared text buffer that readline() fills and enlarges as needed. */
char *line = NULL;
/* Current capacity of `line`, in bytes. */
int max_line_len = 1024;

/* Output range for feature values (set by -l / -u or a restore file). */
double lower = -1.0;
double upper = 1.0;

/* Output range for labels; only used while y_scaling is non-zero. */
double y_lower;
double y_upper;
int y_scaling = 0;

/* Per-feature extremes, indexed by feature index; allocated in main(). */
double *feature_max;
double *feature_min;

/* Extremes of the label column, seeded so that any real label replaces them. */
double y_max = -DBL_MAX;
double y_min = DBL_MAX;

/* Largest and smallest feature index seen. */
int max_index;
int min_index;

/* Count of elements in the input, and count of elements actually printed. */
long int num_nonzeros = 0;
long int new_num_nonzeros = 0;

/* NOTE: both macros evaluate their arguments more than once. */
#define max(x,y) (((x) > (y)) ? (x) : (y))
#define min(x,y) (((x) < (y)) ? (x) : (y))

/* Helpers defined after main(). */
void output_target(double value);
void output(int index, double value);
char *readline(FILE *input);
int clean_up(FILE *fp_restore, FILE *fp, const char *msg);
-
- int main(int argc,char **argv)
- {
- int i,index;
- FILE *fp, *fp_restore = NULL;
- char *save_filename = NULL;
- char *restore_filename = NULL;
-
- for(i=1;i<argc;i++)
- {
- if(argv[i][0] != '-') break;
- ++i;
- switch(argv[i-1][1])
- {
- case 'l': lower = atof(argv[i]); break;
- case 'u': upper = atof(argv[i]); break;
- case 'y':
- y_lower = atof(argv[i]);
- ++i;
- y_upper = atof(argv[i]);
- y_scaling = 1;
- break;
- case 's': save_filename = argv[i]; break;
- case 'r': restore_filename = argv[i]; break;
- default:
- fprintf(stderr,"unknown option\n");
- exit_with_help();
- }
- }
-
- if(!(upper > lower) || (y_scaling && !(y_upper > y_lower)))
- {
- fprintf(stderr,"inconsistent lower/upper specification\n");
- exit(1);
- }
-
- if(restore_filename && save_filename)
- {
- fprintf(stderr,"cannot use -r and -s simultaneously\n");
- exit(1);
- }
-
- if(argc != i+1)
- exit_with_help();
-
- fp=fopen(argv[i],"r");
-
- if(fp==NULL)
- {
- fprintf(stderr,"can't open file %s\n", argv[i]);
- exit(1);
- }
-
- line = (char *) malloc(max_line_len*sizeof(char));
-
- #define SKIP_TARGET\
- while(isspace(*p)) ++p;\
- while(!isspace(*p)) ++p;
-
- #define SKIP_ELEMENT\
- while(*p!=':') ++p;\
- ++p;\
- while(isspace(*p)) ++p;\
- while(*p && !isspace(*p)) ++p;
-
- /* assumption: min index of attributes is 1 */
- /* pass 1: find out max index of attributes */
- max_index = 0;
- min_index = 1;
-
- if(restore_filename)
- {
- int idx, c;
-
- fp_restore = fopen(restore_filename,"r");
- if(fp_restore==NULL)
- {
- fprintf(stderr,"can't open file %s\n", restore_filename);
- exit(1);
- }
-
- c = fgetc(fp_restore);
- if(c == 'y')
- {
- readline(fp_restore);
- readline(fp_restore);
- readline(fp_restore);
- }
- readline(fp_restore);
- readline(fp_restore);
-
- while(fscanf(fp_restore,"%d %*f %*f\n",&idx) == 1)
- max_index = max(idx,max_index);
- rewind(fp_restore);
- }
-
- while(readline(fp)!=NULL)
- {
- char *p=line;
-
- SKIP_TARGET
-
- while(sscanf(p,"%d:%*f",&index)==1)
- {
- max_index = max(max_index, index);
- min_index = min(min_index, index);
- SKIP_ELEMENT
- num_nonzeros++;
- }
- }
-
- if(min_index < 1)
- fprintf(stderr,
- "WARNING: minimal feature index is %d, but indices should start from 1\n", min_index);
-
- rewind(fp);
-
- feature_max = (double *)malloc((max_index+1)* sizeof(double));
- feature_min = (double *)malloc((max_index+1)* sizeof(double));
-
- if(feature_max == NULL || feature_min == NULL)
- {
- fprintf(stderr,"can't allocate enough memory\n");
- exit(1);
- }
-
- for(i=0;i<=max_index;i++)
- {
- feature_max[i]=-DBL_MAX;
- feature_min[i]=DBL_MAX;
- }
-
- /* pass 2: find out min/max value */
- while(readline(fp)!=NULL)
- {
- char *p=line;
- int next_index=1;
- double target;
- double value;
-
- if (sscanf(p,"%lf",&target) != 1)
- return clean_up(fp_restore, fp, "ERROR: failed to read labels\n");
- y_max = max(y_max,target);
- y_min = min(y_min,target);
-
- SKIP_TARGET
-
- while(sscanf(p,"%d:%lf",&index,&value)==2)
- {
- for(i=next_index;i<index;i++)
- {
- feature_max[i]=max(feature_max[i],0);
- feature_min[i]=min(feature_min[i],0);
- }
-
- feature_max[index]=max(feature_max[index],value);
- feature_min[index]=min(feature_min[index],value);
-
- SKIP_ELEMENT
- next_index=index+1;
- }
-
- for(i=next_index;i<=max_index;i++)
- {
- feature_max[i]=max(feature_max[i],0);
- feature_min[i]=min(feature_min[i],0);
- }
- }
-
- rewind(fp);
-
- /* pass 2.5: save/restore feature_min/feature_max */
-
- if(restore_filename)
- {
- /* fp_restore rewinded in finding max_index */
- int idx, c;
- double fmin, fmax;
- int next_index = 1;
-
- if((c = fgetc(fp_restore)) == 'y')
- {
- if(fscanf(fp_restore, "%lf %lf\n", &y_lower, &y_upper) != 2 ||
- fscanf(fp_restore, "%lf %lf\n", &y_min, &y_max) != 2)
- return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
- y_scaling = 1;
- }
- else
- ungetc(c, fp_restore);
-
- if (fgetc(fp_restore) == 'x')
- {
- if(fscanf(fp_restore, "%lf %lf\n", &lower, &upper) != 2)
- return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
- while(fscanf(fp_restore,"%d %lf %lf\n",&idx,&fmin,&fmax)==3)
- {
- for(i = next_index;i<idx;i++)
- if(feature_min[i] != feature_max[i])
- fprintf(stderr,
- "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s.\n",
- i, argv[argc-1], restore_filename);
-
- feature_min[idx] = fmin;
- feature_max[idx] = fmax;
-
- next_index = idx + 1;
- }
-
- for(i=next_index;i<=max_index;i++)
- if(feature_min[i] != feature_max[i])
- fprintf(stderr,
- "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s.\n",
- i, argv[argc-1], restore_filename);
- }
- fclose(fp_restore);
- }
-
- if(save_filename)
- {
- FILE *fp_save = fopen(save_filename,"w");
- if(fp_save==NULL)
- {
- fprintf(stderr,"can't open file %s\n", save_filename);
- exit(1);
- }
- if(y_scaling)
- {
- fprintf(fp_save, "y\n");
- fprintf(fp_save, "%.16g %.16g\n", y_lower, y_upper);
- fprintf(fp_save, "%.16g %.16g\n", y_min, y_max);
- }
- fprintf(fp_save, "x\n");
- fprintf(fp_save, "%.16g %.16g\n", lower, upper);
- for(i=1;i<=max_index;i++)
- {
- if(feature_min[i]!=feature_max[i])
- fprintf(fp_save,"%d %.16g %.16g\n",i,feature_min[i],feature_max[i]);
- }
-
- if(min_index < 1)
- fprintf(stderr,
- "WARNING: scaling factors with indices smaller than 1 are not stored to the file %s.\n", save_filename);
-
- fclose(fp_save);
- }
-
- /* pass 3: scale */
- while(readline(fp)!=NULL)
- {
- char *p=line;
- int next_index=1;
- double target;
- double value;
-
- if (sscanf(p,"%lf",&target) != 1)
- return clean_up(NULL, fp, "ERROR: failed to read labels\n");
- output_target(target);
-
- SKIP_TARGET
-
- while(sscanf(p,"%d:%lf",&index,&value)==2)
- {
- for(i=next_index;i<index;i++)
- output(i,0);
-
- output(index,value);
-
- SKIP_ELEMENT
- next_index=index+1;
- }
-
- for(i=next_index;i<=max_index;i++)
- output(i,0);
-
- printf("\n");
- }
-
- if (new_num_nonzeros > num_nonzeros)
- fprintf(stderr,
- "WARNING: original #nonzeros %ld\n"
- " > new #nonzeros %ld\n"
- "If feature values are non-negative and sparse, use -l 0 rather than the default -l -1\n",
- num_nonzeros, new_num_nonzeros);
-
- free(line);
- free(feature_max);
- free(feature_min);
- fclose(fp);
- return 0;
- }
-
- char* readline(FILE *input)
- {
- int len;
-
- if(fgets(line,max_line_len,input) == NULL)
- return NULL;
-
- while(strrchr(line,'\n') == NULL)
- {
- max_line_len *= 2;
- line = (char *) realloc(line, max_line_len);
- len = (int) strlen(line);
- if(fgets(line+len,max_line_len-len,input) == NULL)
- break;
- }
- return line;
- }
-
- void output_target(double value)
- {
- if(y_scaling)
- {
- if(value == y_min)
- value = y_lower;
- else if(value == y_max)
- value = y_upper;
- else value = y_lower + (y_upper-y_lower) *
- (value - y_min)/(y_max-y_min);
- }
- printf("%g ",value);
- }
-
- void output(int index, double value)
- {
- /* skip single-valued attribute */
- if(feature_max[index] == feature_min[index])
- return;
-
- if(value == feature_min[index])
- value = lower;
- else if(value == feature_max[index])
- value = upper;
- else
- value = lower + (upper-lower) *
- (value-feature_min[index])/
- (feature_max[index]-feature_min[index]);
-
- if(value != 0)
- {
- printf("%d:%g ",index, value);
- new_num_nonzeros++;
- }
- }
-
- int clean_up(FILE *fp_restore, FILE *fp, const char* msg)
- {
- fprintf(stderr, "%s", msg);
- free(line);
- free(feature_max);
- free(feature_min);
- fclose(fp);
- if (fp_restore)
- fclose(fp_restore);
- return -1;
- }
-
|