Fix performance issue for huge repositories

Problem: during the initial statistics generation (i.e. when no cache is
available yet), gitstats reads every commit and every file in the repository
sequentially in a single process. For huge repositories (100 000+ files) this
can take a very long time.

Solution: execute all of these read commands in a pool of 24 worker processes
(via multiprocessing.Pool) instead of one by one.
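For illustration, a minimal sketch of the fan-out pattern this patch
introduces. The names here are illustrative stand-ins: run_git is a simplified
substitute for gitstats' getpipeoutput helper, and count_files_at_rev mirrors
the getnumoffilesfromrev worker added below.

    # Sketch only: fan per-revision reads out across a process pool.
    import subprocess
    from multiprocessing import Pool

    def run_git(args):
        # simplified stand-in for gitstats' getpipeoutput pipeline helper
        return subprocess.check_output(['git'] + args).decode()

    def count_files_at_rev(time_rev):
        time, rev = time_rev
        # one 'git ls-tree' per revision; each call runs in its own worker
        names = run_git(['ls-tree', '-r', '--name-only', rev]).splitlines()
        return (int(time), rev, len(names))

    if __name__ == '__main__':
        revs = [('1355664139', 'HEAD')]  # placeholder (timestamp, rev) pairs
        for time, rev, count in Pool(processes=24).map(count_files_at_rev, revs):
            print('%d %d' % (time, count))

Because the workers are separate processes, each git subprocess runs
concurrently, so the per-commit latencies overlap instead of accumulating.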

Signed-off-by: Heikki Hokkanen <hoxu@users.sf.net>
commit 8647c75d48
parent 31ff072d23
Author: Andrey Devyatkin
Date: 2012-12-16 14:22:19 +01:00
Committer: Heikki Hokkanen


@@ -14,6 +14,12 @@ import sys
 import time
 import zlib
 
+if sys.version_info < (2, 6):
+    print >> sys.stderr, "Python 2.6 or higher is required for gitstats"
+    sys.exit(1)
+
+from multiprocessing import Pool
+
 os.environ['LC_ALL'] = 'C'
 
 GNUPLOT_COMMON = 'set terminal png transparent size 640,240\nset size 1.0,1.0\n'
@@ -104,6 +110,20 @@ def getgitversion():
 def getgnuplotversion():
     return getpipeoutput(['%s --version' % gnuplot_cmd]).split('\n')[0]
 
+def getnumoffilesfromrev(time_rev):
+    """
+    Get number of files changed in commit
+    """
+    time, rev = time_rev
+    return (int(time), rev, int(getpipeoutput(['git ls-tree -r --name-only "%s"' % rev, 'wc -l']).split('\n')[0]))
+
+def getnumoflinesinblob(ext_blob):
+    """
+    Get number of lines in blob
+    """
+    ext, blob_id = ext_blob
+    return (ext, blob_id, int(getpipeoutput(['git cat-file blob %s' % blob_id, 'wc -l']).split()[0]))
+
 class DataCollector:
     """Manages data collection from a revision control repository."""
     def __init__(self):
@@ -408,14 +428,34 @@ class GitDataCollector(DataCollector):
             # timezone
             self.commits_by_timezone[timezone] = self.commits_by_timezone.get(timezone, 0) + 1
 
-        # TODO Optimize this, it's the worst bottleneck
         # outputs "<stamp> <files>" for each revision
         revlines = getpipeoutput(['git rev-list --pretty=format:"%%at %%T" %s' % getcommitrange('HEAD'), 'grep -v ^commit']).strip().split('\n')
         lines = []
+        revs_to_read = []
+        time_rev_count = []
+        # Look up each rev in the cache and take its info from there if found;
+        # if not found, append the rev to the list of revs to read from the repo.
         for revline in revlines:
             time, rev = revline.split(' ')
-            linecount = self.getFilesInCommit(rev)
-            lines.append('%d %d' % (int(time), linecount))
+            # If the cache is empty, add (time, rev) to the list of new revs;
+            # otherwise try to read the needed info from the cache.
+            if 'files_in_tree' not in self.cache.keys():
+                revs_to_read.append((time, rev))
+                continue
+            if rev in self.cache['files_in_tree'].keys():
+                lines.append('%d %d' % (int(time), self.cache['files_in_tree'][rev]))
+            else:
+                revs_to_read.append((time, rev))
+
+        # Read the remaining revisions from the repo in parallel
+        time_rev_count = Pool(processes=24).map(getnumoffilesfromrev, revs_to_read)
+
+        # Update the cache with the new revisions and append them to the overall list
+        for (time, rev, count) in time_rev_count:
+            if 'files_in_tree' not in self.cache:
+                self.cache['files_in_tree'] = {}
+            self.cache['files_in_tree'][rev] = count
+            lines.append('%d %d' % (int(time), count))
 
         self.total_commits += len(lines)
         for line in lines:
@@ -430,6 +470,7 @@ class GitDataCollector(DataCollector):
         # extensions and size of files
         lines = getpipeoutput(['git ls-tree -r -l -z %s' % getcommitrange('HEAD', end_only = True)]).split('\000')
+        blobs_to_read = []
 
         for line in lines:
             if len(line) == 0:
                 continue
@@ -437,7 +478,7 @@ class GitDataCollector(DataCollector):
             if parts[0] == '160000' and parts[3] == '-':
                 # skip submodules
                 continue
-            sha1 = parts[2]
+            blob_id = parts[2]
             size = int(parts[3])
             fullpath = parts[4]
 
@@ -451,15 +492,28 @@ class GitDataCollector(DataCollector):
             ext = filename[(filename.rfind('.') + 1):]
             if len(ext) > conf['max_ext_length']:
                 ext = ''
 
             if ext not in self.extensions:
                 self.extensions[ext] = {'files': 0, 'lines': 0}
 
             self.extensions[ext]['files'] += 1
-            try:
-                self.extensions[ext]['lines'] += self.getLinesInBlob(sha1)
-            except:
-                print 'Warning: Could not count lines for file "%s"' % line
+            # If the cache is empty, add (ext, blob_id) to the list of new blobs;
+            # otherwise try to read the needed info from the cache.
+            if 'lines_in_blob' not in self.cache.keys():
+                blobs_to_read.append((ext, blob_id))
+                continue
+            if blob_id in self.cache['lines_in_blob'].keys():
+                self.extensions[ext]['lines'] += self.cache['lines_in_blob'][blob_id]
+            else:
+                blobs_to_read.append((ext, blob_id))
+        # Get the line count for new blobs that weren't found in the cache
+        ext_blob_linecount = Pool(processes=24).map(getnumoflinesinblob, blobs_to_read)
+        # Update the cache and record the number of lines per extension
+        for (ext, blob_id, linecount) in ext_blob_linecount:
+            if 'lines_in_blob' not in self.cache:
+                self.cache['lines_in_blob'] = {}
+            self.cache['lines_in_blob'][blob_id] = linecount
+            self.extensions[ext]['lines'] += self.cache['lines_in_blob'][blob_id]
 
         # line statistics
         # outputs:
@@ -619,33 +673,12 @@ class GitDataCollector(DataCollector):
     def getDomains(self):
        return self.domains.keys()
 
-    def getFilesInCommit(self, rev):
-        try:
-            res = self.cache['files_in_tree'][rev]
-        except:
-            res = int(getpipeoutput(['git ls-tree -r --name-only "%s"' % rev, 'wc -l']).split('\n')[0])
-            if 'files_in_tree' not in self.cache:
-                self.cache['files_in_tree'] = {}
-            self.cache['files_in_tree'][rev] = res
-        return res
-
     def getFirstCommitDate(self):
         return datetime.datetime.fromtimestamp(self.first_commit_stamp)
 
     def getLastCommitDate(self):
         return datetime.datetime.fromtimestamp(self.last_commit_stamp)
 
-    def getLinesInBlob(self, sha1):
-        try:
-            res = self.cache['lines_in_blob'][sha1]
-        except:
-            res = int(getpipeoutput(['git cat-file blob %s' % sha1, 'wc -l']).split()[0])
-            if 'lines_in_blob' not in self.cache:
-                self.cache['lines_in_blob'] = {}
-            self.cache['lines_in_blob'][sha1] = res
-        return res
-
     def getTags(self):
         lines = getpipeoutput(['git show-ref --tags', 'cut -d/ -f3'])
         return lines.split('\n')