OpenI
/
graphkit-learn

#!/usr/bin/env python
__all__ = ['find_parameters']

import os, sys, traceback, getpass, time, re
from threading import Thread
from subprocess import *

if sys.version_info[0] < 3:
	from Queue import Queue
else:
	from queue import Queue

telnet_workers = []
ssh_workers = []
nr_local_worker = 1

class GridOption:
	def __init__(self, dataset_pathname, options):
		dirname = os.path.dirname(__file__)
		if sys.platform != 'win32':
			self.svmtrain_pathname = os.path.join(dirname, '../svm-train')
			self.gnuplot_pathname = '/usr/bin/gnuplot'
		else:
			# example for windows
			self.svmtrain_pathname = os.path.join(dirname, r'..\windows\svm-train.exe')
			# svmtrain_pathname = r'c:\Program Files\libsvm\windows\svm-train.exe'
			self.gnuplot_pathname = r'c:\tmp\gnuplot\binary\pgnuplot.exe'
		self.fold = 5
		self.c_begin, self.c_end, self.c_step = -5,  15,  2
		self.g_begin, self.g_end, self.g_step =  3, -15, -2
		self.grid_with_c, self.grid_with_g = True, True
		self.dataset_pathname = dataset_pathname
		self.dataset_title = os.path.split(dataset_pathname)[1]
		self.out_pathname = '{0}.out'.format(self.dataset_title)
		self.png_pathname = '{0}.png'.format(self.dataset_title)
		self.pass_through_string = ' '
		self.resume_pathname = None
		self.parse_options(options)

	def parse_options(self, options):
		if type(options) == str:
			options = options.split()
		i = 0
		pass_through_options = []
		
		while i < len(options):
			if options[i] == '-log2c':
				i = i + 1
				if options[i] == 'null':
					self.grid_with_c = False
				else:
					self.c_begin, self.c_end, self.c_step = map(float,options[i].split(','))
			elif options[i] == '-log2g':
				i = i + 1
				if options[i] == 'null':
					self.grid_with_g = False
				else:
					self.g_begin, self.g_end, self.g_step = map(float,options[i].split(','))
			elif options[i] == '-v':
				i = i + 1
				self.fold = options[i]
			elif options[i] in ('-c','-g'):
				raise ValueError('Use -log2c and -log2g.')
			elif options[i] == '-svmtrain':
				i = i + 1
				self.svmtrain_pathname = options[i]
			elif options[i] == '-gnuplot':
				i = i + 1
				if options[i] == 'null':
					self.gnuplot_pathname = None
				else:	
					self.gnuplot_pathname = options[i]
			elif options[i] == '-out':
				i = i + 1
				if options[i] == 'null':
					self.out_pathname = None
				else:
					self.out_pathname = options[i]
			elif options[i] == '-png':
				i = i + 1
				self.png_pathname = options[i]
			elif options[i] == '-resume':
				if i == (len(options)-1) or options[i+1].startswith('-'):
					self.resume_pathname = self.dataset_title + '.out'
				else:
					i = i + 1
					self.resume_pathname = options[i]
			else:
				pass_through_options.append(options[i])
			i = i + 1

		self.pass_through_string = ' '.join(pass_through_options)
		if not os.path.exists(self.svmtrain_pathname):
			raise IOError('svm-train executable not found')
		if not os.path.exists(self.dataset_pathname):
			raise IOError('dataset not found')
		if self.resume_pathname and not os.path.exists(self.resume_pathname):
			raise IOError('file for resumption not found')
		if not self.grid_with_c and not self.grid_with_g:
			raise ValueError('-log2c and -log2g should not be null simultaneously')
		if self.gnuplot_pathname and not os.path.exists(self.gnuplot_pathname):
			sys.stderr.write('gnuplot executable not found\n')
			self.gnuplot_pathname = None

def redraw(db,best_param,gnuplot,options,tofile=False):
	if len(db) == 0: return
	begin_level = round(max(x[2] for x in db)) - 3
	step_size = 0.5

	best_log2c,best_log2g,best_rate = best_param

	# if newly obtained c, g, or cv values are the same,
	# then stop redrawing the contour.
	if all(x[0] == db[0][0]  for x in db): return
	if all(x[1] == db[0][1]  for x in db): return
	if all(x[2] == db[0][2]  for x in db): return

	if tofile:
		gnuplot.write(b"set term png transparent small linewidth 2 medium enhanced\n")
		gnuplot.write("set output \"{0}\"\n".format(options.png_pathname.replace('\\','\\\\')).encode())
		#gnuplot.write(b"set term postscript color solid\n")
		#gnuplot.write("set output \"{0}.ps\"\n".format(options.dataset_title).encode().encode())
	elif sys.platform == 'win32':
		gnuplot.write(b"set term windows\n")
	else:
		gnuplot.write( b"set term x11\n")
	gnuplot.write(b"set xlabel \"log2(C)\"\n")
	gnuplot.write(b"set ylabel \"log2(gamma)\"\n")
	gnuplot.write("set xrange [{0}:{1}]\n".format(options.c_begin,options.c_end).encode())
	gnuplot.write("set yrange [{0}:{1}]\n".format(options.g_begin,options.g_end).encode())
	gnuplot.write(b"set contour\n")
	gnuplot.write("set cntrparam levels incremental {0},{1},100\n".format(begin_level,step_size).encode())
	gnuplot.write(b"unset surface\n")
	gnuplot.write(b"unset ztics\n")
	gnuplot.write(b"set view 0,0\n")
	gnuplot.write("set title \"{0}\"\n".format(options.dataset_title).encode())
	gnuplot.write(b"unset label\n")
	gnuplot.write("set label \"Best log2(C) = {0}  log2(gamma) = {1}  accuracy = {2}%\" \
				  at screen 0.5,0.85 center\n". \
				  format(best_log2c, best_log2g, best_rate).encode())
	gnuplot.write("set label \"C = {0}  gamma = {1}\""
				  " at screen 0.5,0.8 center\n".format(2**best_log2c, 2**best_log2g).encode())
	gnuplot.write(b"set key at screen 0.9,0.9\n")
	gnuplot.write(b"splot \"-\" with lines\n")
	
	db.sort(key = lambda x:(x[0], -x[1]))

	prevc = db[0][0]
	for line in db:
		if prevc != line[0]:
			gnuplot.write(b"\n")
			prevc = line[0]
		gnuplot.write("{0[0]} {0[1]} {0[2]}\n".format(line).encode())
	gnuplot.write(b"e\n")
	gnuplot.write(b"\n") # force gnuplot back to prompt when term set failure
	gnuplot.flush()


def calculate_jobs(options):
	
	def range_f(begin,end,step):
		# like range, but works on non-integer too
		seq = []
		while True:
			if step > 0 and begin > end: break
			if step < 0 and begin < end: break
			seq.append(begin)
			begin = begin + step
		return seq
	
	def permute_sequence(seq):
		n = len(seq)
		if n <= 1: return seq
	
		mid = int(n/2)
		left = permute_sequence(seq[:mid])
		right = permute_sequence(seq[mid+1:])
	
		ret = [seq[mid]]
		while left or right:
			if left: ret.append(left.pop(0))
			if right: ret.append(right.pop(0))
			
		return ret	

	
	c_seq = permute_sequence(range_f(options.c_begin,options.c_end,options.c_step))
	g_seq = permute_sequence(range_f(options.g_begin,options.g_end,options.g_step))

	if not options.grid_with_c:
		c_seq = [None]
	if not options.grid_with_g:
		g_seq = [None] 
	
	nr_c = float(len(c_seq))
	nr_g = float(len(g_seq))
	i, j = 0, 0
	jobs = []

	while i < nr_c or j < nr_g:
		if i/nr_c < j/nr_g:
			# increase C resolution
			line = []
			for k in range(0,j):
				line.append((c_seq[i],g_seq[k]))
			i = i + 1
			jobs.append(line)
		else:
			# increase g resolution
			line = []
			for k in range(0,i):
				line.append((c_seq[k],g_seq[j]))
			j = j + 1
			jobs.append(line)

	resumed_jobs = {}
	
	if options.resume_pathname is None:
		return jobs, resumed_jobs

	for line in open(options.resume_pathname, 'r'):
		line = line.strip()
		rst = re.findall(r'rate=([0-9.]+)',line)
		if not rst: 
			continue
		rate = float(rst[0])

		c, g = None, None 
		rst = re.findall(r'log2c=([0-9.-]+)',line)
		if rst: 
			c = float(rst[0])
		rst = re.findall(r'log2g=([0-9.-]+)',line)
		if rst: 
			g = float(rst[0])

		resumed_jobs[(c,g)] = rate

	return jobs, resumed_jobs

	
class WorkerStopToken:  # used to notify the worker to stop or if a worker is dead
	pass

class Worker(Thread):
	def __init__(self,name,job_queue,result_queue,options):
		Thread.__init__(self)
		self.name = name
		self.job_queue = job_queue
		self.result_queue = result_queue
		self.options = options
		
	def run(self):
		while True:
			(cexp,gexp) = self.job_queue.get()
			if cexp is WorkerStopToken:
				self.job_queue.put((cexp,gexp))
				# print('worker {0} stop.'.format(self.name))
				break
			try:
				c, g = None, None
				if cexp != None:
					c = 2.0**cexp
				if gexp != None:
					g = 2.0**gexp
				rate = self.run_one(c,g)
				if rate is None: raise RuntimeError('get no rate')
			except:
				# we failed, let others do that and we just quit
			
				traceback.print_exception(sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2])
				
				self.job_queue.put((cexp,gexp))
				sys.stderr.write('worker {0} quit.\n'.format(self.name))
				break
			else:
				self.result_queue.put((self.name,cexp,gexp,rate))

	def get_cmd(self,c,g):
		options=self.options
		cmdline = '"' + options.svmtrain_pathname + '"'
		if options.grid_with_c: 
			cmdline += ' -c {0} '.format(c)
		if options.grid_with_g: 
			cmdline += ' -g {0} '.format(g)
		cmdline += ' -v {0} {1} {2} '.format\
			(options.fold,options.pass_through_string,options.dataset_pathname)
		return cmdline
		
class LocalWorker(Worker):
	def run_one(self,c,g):
		cmdline = self.get_cmd(c,g)
		result = Popen(cmdline,shell=True,stdout=PIPE,stderr=PIPE,stdin=PIPE).stdout
		for line in result.readlines():
			if str(line).find('Cross') != -1:
				return float(line.split()[-1][0:-1])

class SSHWorker(Worker):
	def __init__(self,name,job_queue,result_queue,host,options):
		Worker.__init__(self,name,job_queue,result_queue,options)
		self.host = host
		self.cwd = os.getcwd()
	def run_one(self,c,g):
		cmdline = 'ssh -x -t -t {0} "cd {1}; {2}"'.format\
			(self.host,self.cwd,self.get_cmd(c,g))
		result = Popen(cmdline,shell=True,stdout=PIPE,stderr=PIPE,stdin=PIPE).stdout
		for line in result.readlines():
			if str(line).find('Cross') != -1:
				return float(line.split()[-1][0:-1])

class TelnetWorker(Worker):
	def __init__(self,name,job_queue,result_queue,host,username,password,options):
		Worker.__init__(self,name,job_queue,result_queue,options)
		self.host = host
		self.username = username
		self.password = password		
	def run(self):
		import telnetlib
		self.tn = tn = telnetlib.Telnet(self.host)
		tn.read_until('login: ')
		tn.write(self.username + '\n')
		tn.read_until('Password: ')
		tn.write(self.password + '\n')

		# XXX: how to know whether login is successful?
		tn.read_until(self.username)
		# 
		print('login ok', self.host)
		tn.write('cd '+os.getcwd()+'\n')
		Worker.run(self)
		tn.write('exit\n')			   
	def run_one(self,c,g):
		cmdline = self.get_cmd(c,g)
		result = self.tn.write(cmdline+'\n')
		(idx,matchm,output) = self.tn.expect(['Cross.*\n'])
		for line in output.split('\n'):
			if str(line).find('Cross') != -1:
				return float(line.split()[-1][0:-1])
			
def find_parameters(dataset_pathname, options=''):
	
	def update_param(c,g,rate,best_c,best_g,best_rate,worker,resumed):
		if (rate > best_rate) or (rate==best_rate and g==best_g and c<best_c):
			best_rate,best_c,best_g = rate,c,g
		stdout_str = '[{0}] {1} {2} (best '.format\
			(worker,' '.join(str(x) for x in [c,g] if x is not None),rate)
		output_str = ''
		if c != None:
			stdout_str += 'c={0}, '.format(2.0**best_c)
			output_str += 'log2c={0} '.format(c)
		if g != None:
			stdout_str += 'g={0}, '.format(2.0**best_g)
			output_str += 'log2g={0} '.format(g)
		stdout_str += 'rate={0})'.format(best_rate)
		print(stdout_str)
		if options.out_pathname and not resumed:
			output_str += 'rate={0}\n'.format(rate)
			result_file.write(output_str)
			result_file.flush()
		
		return best_c,best_g,best_rate
		
	options = GridOption(dataset_pathname, options);

	if options.gnuplot_pathname:
		gnuplot = Popen(options.gnuplot_pathname,stdin = PIPE,stdout=PIPE,stderr=PIPE).stdin
	else:
		gnuplot = None
		
	# put jobs in queue

	jobs,resumed_jobs = calculate_jobs(options)
	job_queue = Queue(0)
	result_queue = Queue(0)

	for (c,g) in resumed_jobs:
		result_queue.put(('resumed',c,g,resumed_jobs[(c,g)]))

	for line in jobs:
		for (c,g) in line:
			if (c,g) not in resumed_jobs:
				job_queue.put((c,g))

	# hack the queue to become a stack --
	# this is important when some thread
	# failed and re-put a job. It we still
	# use FIFO, the job will be put
	# into the end of the queue, and the graph
	# will only be updated in the end
 
	job_queue._put = job_queue.queue.appendleft

	# fire telnet workers

	if telnet_workers:
		nr_telnet_worker = len(telnet_workers)
		username = getpass.getuser()
		password = getpass.getpass()
		for host in telnet_workers:
			worker = TelnetWorker(host,job_queue,result_queue,
					 host,username,password,options)
			worker.start()

	# fire ssh workers

	if ssh_workers:
		for host in ssh_workers:
			worker = SSHWorker(host,job_queue,result_queue,host,options)
			worker.start()

	# fire local workers

	for i in range(nr_local_worker):
		worker = LocalWorker('local',job_queue,result_queue,options)
		worker.start()

	# gather results

	done_jobs = {}

	if options.out_pathname:
		if options.resume_pathname:
			result_file = open(options.out_pathname, 'a')
		else:
			result_file = open(options.out_pathname, 'w')


	db = []
	best_rate = -1
	best_c,best_g = None,None  

	for (c,g) in resumed_jobs:
		rate = resumed_jobs[(c,g)]
		best_c,best_g,best_rate = update_param(c,g,rate,best_c,best_g,best_rate,'resumed',True)

	for line in jobs:
		for (c,g) in line:
			while (c,g) not in done_jobs:
				(worker,c1,g1,rate1) = result_queue.get()
				done_jobs[(c1,g1)] = rate1
				if (c1,g1) not in resumed_jobs:
					best_c,best_g,best_rate = update_param(c1,g1,rate1,best_c,best_g,best_rate,worker,False)
			db.append((c,g,done_jobs[(c,g)]))
		if gnuplot and options.grid_with_c and options.grid_with_g:
			redraw(db,[best_c, best_g, best_rate],gnuplot,options)
			redraw(db,[best_c, best_g, best_rate],gnuplot,options,True)


	if options.out_pathname:
		result_file.close()
	job_queue.put((WorkerStopToken,None))
	best_param, best_cg  = {}, []
	if best_c != None:
		best_param['c'] = 2.0**best_c
		best_cg += [2.0**best_c]
	if best_g != None:
		best_param['g'] = 2.0**best_g
		best_cg += [2.0**best_g]
	print('{0} {1}'.format(' '.join(map(str,best_cg)), best_rate))

	return best_rate, best_param


if __name__ == '__main__':

	def exit_with_help():
		print("""\
Usage: grid.py [grid_options] [svm_options] dataset

grid_options :
-log2c {begin,end,step | "null"} : set the range of c (default -5,15,2)
    begin,end,step -- c_range = 2^{begin,...,begin+k*step,...,end}
    "null"         -- do not grid with c
-log2g {begin,end,step | "null"} : set the range of g (default 3,-15,-2)
    begin,end,step -- g_range = 2^{begin,...,begin+k*step,...,end}
    "null"         -- do not grid with g
-v n : n-fold cross validation (default 5)
-svmtrain pathname : set svm executable path and name
-gnuplot {pathname | "null"} :
    pathname -- set gnuplot executable path and name
    "null"   -- do not plot 
-out {pathname | "null"} : (default dataset.out)
    pathname -- set output file path and name
    "null"   -- do not output file
-png pathname : set graphic output file path and name (default dataset.png)
-resume [pathname] : resume the grid task using an existing output file (default pathname is dataset.out)
    This is experimental. Try this option only if some parameters have been checked for the SAME data.

svm_options : additional options for svm-train""")
		sys.exit(1)
	
	if len(sys.argv) < 2:
		exit_with_help()
	dataset_pathname = sys.argv[-1]
	options = sys.argv[1:-1]
	try:
		find_parameters(dataset_pathname, options)
	except (IOError,ValueError) as e:
		sys.stderr.write(str(e) + '\n')
		sys.stderr.write('Try "grid.py" for more information.\n')
		sys.exit(1)