Do not pass backticks from author names into gnuplot command file

Fix minor documentation issues
* author.txt was renamed to AUTHOR
* use git shortlog instead of git-shortlog because the latter is not necessarily in PATH

Signed-off-by: Heikki Hokkanen <hoxu@users.sf.net>
2013-11-04 08:47:28 +00:00 · 2013-09-29 10:20:52 +03:00 · 2013-07-29 17:42:36 +03:00 · 2013-07-29 17:39:13 +03:00 · 2013-07-26 22:25:00 +03:00
3 changed files with 79 additions and 37 deletions
--- a/doc/AUTHOR
+++ b/doc/AUTHOR
@@ -2,7 +2,7 @@ Author can be reached by sending e-mail to <hoxu@users.sf.net>.
 Include "gitstats" in the subject or prepare to battle the spam filters.

 See the following command for list of authors who have contributed:
-  $ git-shortlog HEAD
+  $ git shortlog HEAD

 Also thanks to the following people:
 Alexander Botero-Lowry
--- a/doc/gitstats.pod
+++ b/doc/gitstats.pod
@@ -53,6 +53,10 @@ How many domains to show in domains by commits.

 Maximum file extension length.

+=item processes
+
+Number of concurrent processes to use when extracting git repository data.
+
 =item project_name

 Project name to show on the generated pages. Default is to use basename of the repository directory.
--- a/110
+++ b/110
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright (c) 2007-2012 Heikki Hokkanen <hoxu@users.sf.net> & others (see doc/author.txt)
+# Copyright (c) 2007-2013 Heikki Hokkanen <hoxu@users.sf.net> & others (see doc/AUTHOR)
 # GPLv2 / GPLv3
 import datetime
 import getopt
@@ -14,6 +14,12 @@ import sys
 import time
 import zlib

+if sys.version_info < (2, 6):
+       print >> sys.stderr, "Python 2.6 or higher is required for gitstats"
+       sys.exit(1)
+
+from multiprocessing import Pool
+
 os.environ['LC_ALL'] = 'C'

 GNUPLOT_COMMON = 'set terminal png transparent size 640,240\nset size 1.0,1.0\n'
@@ -40,7 +46,8 @@ conf = {
 	'commit_end': 'HEAD',
 	'linear_linestats': 1,
 	'project_name': '',
-	'merge_authors': {}
+	'merge_authors': {},
+	'processes': 8,
 }

 def getpipeoutput(cmds, quiet = False):
@@ -104,6 +111,20 @@ def getgitversion():
 def getgnuplotversion():
 	return getpipeoutput(['%s --version' % gnuplot_cmd]).split('\n')[0]

+def getnumoffilesfromrev(time_rev):
+	"""
+	Get number of files in the tree of the given revision
+	"""
+	time, rev = time_rev
+	return (int(time), rev, int(getpipeoutput(['git ls-tree -r --name-only "%s"' % rev, 'wc -l']).split('\n')[0]))
+
+def getnumoflinesinblob(ext_blob):
+	"""
+	Get number of lines in blob
+	"""
+	ext, blob_id = ext_blob
+	return (ext, blob_id, int(getpipeoutput(['git cat-file blob %s' % blob_id, 'wc -l']).split()[0]))
+
 class DataCollector:
 	"""Manages data collection from a revision control repository."""
 	def __init__(self):
@@ -408,14 +429,34 @@ class GitDataCollector(DataCollector):
 			# timezone
 			self.commits_by_timezone[timezone] = self.commits_by_timezone.get(timezone, 0) + 1

-		# TODO Optimize this, it's the worst bottleneck
 		# outputs "<stamp> <files>" for each revision
 		revlines = getpipeoutput(['git rev-list --pretty=format:"%%at %%T" %s' % getcommitrange('HEAD'), 'grep -v ^commit']).strip().split('\n')
 		lines = []
+		revs_to_read = []
+		time_rev_count = []
+		#Look up rev in cache and take info from cache if found
+		#If not, append rev to the list of revs to read from repo
 		for revline in revlines:
 			time, rev = revline.split(' ')
-			linecount = self.getFilesInCommit(rev)
-			lines.append('%d %d' % (int(time), linecount))
+			#If the cache is empty, add time and rev to the list of new revs;
+			#otherwise try to read needed info from cache
+			if 'files_in_tree' not in self.cache.keys():
+				revs_to_read.append((time,rev))
+				continue
+			if rev in self.cache['files_in_tree'].keys():
+				lines.append('%d %d' % (int(time), self.cache['files_in_tree'][rev]))
+			else:
+				revs_to_read.append((time,rev))
+
+		#Read revisions from repo
+		time_rev_count = Pool(processes=conf['processes']).map(getnumoffilesfromrev, revs_to_read)
+
+		#Update cache with new revisions and append them to the general list
+		for (time, rev, count) in time_rev_count:
+			if 'files_in_tree' not in self.cache:
+				self.cache['files_in_tree'] = {}
+			self.cache['files_in_tree'][rev] = count
+			lines.append('%d %d' % (int(time), count))

 		self.total_commits += len(lines)
 		for line in lines:
@@ -430,6 +471,7 @@ class GitDataCollector(DataCollector):

 		# extensions and size of files
 		lines = getpipeoutput(['git ls-tree -r -l -z %s' % getcommitrange('HEAD', end_only = True)]).split('\000')
+		blobs_to_read = []
 		for line in lines:
 			if len(line) == 0:
 				continue
@@ -437,7 +479,7 @@ class GitDataCollector(DataCollector):
 			if parts[0] == '160000' and parts[3] == '-':
 				# skip submodules
 				continue
-			sha1 = parts[2]
+			blob_id = parts[2]
 			size = int(parts[3])
 			fullpath = parts[4]

@@ -451,15 +493,28 @@ class GitDataCollector(DataCollector):
 				ext = filename[(filename.rfind('.') + 1):]
 			if len(ext) > conf['max_ext_length']:
 				ext = ''
-
 			if ext not in self.extensions:
 				self.extensions[ext] = {'files': 0, 'lines': 0}
-
 			self.extensions[ext]['files'] += 1
-			try:
-				self.extensions[ext]['lines'] += self.getLinesInBlob(sha1)
-			except:
-				print 'Warning: Could not count lines for file "%s"' % line
+			#If the cache is empty, add ext and blob id to the list of new blobs;
+			#otherwise try to read needed info from cache
+			if 'lines_in_blob' not in self.cache.keys():
+				blobs_to_read.append((ext,blob_id))
+				continue
+			if blob_id in self.cache['lines_in_blob'].keys():
+				self.extensions[ext]['lines'] += self.cache['lines_in_blob'][blob_id]
+			else:
+				blobs_to_read.append((ext,blob_id))
+
+		#Get line counts for new blobs that weren't found in cache
+		ext_blob_linecount = Pool(processes=24).map(getnumoflinesinblob, blobs_to_read)
+
+		#Update cache and record the number of lines per extension
+		for (ext, blob_id, linecount) in ext_blob_linecount:
+			if 'lines_in_blob' not in self.cache:
+				self.cache['lines_in_blob'] = {}
+			self.cache['lines_in_blob'][blob_id] = linecount
+			self.extensions[ext]['lines'] += self.cache['lines_in_blob'][blob_id]

 		# line statistics
 		# outputs:
@@ -619,33 +674,12 @@ class GitDataCollector(DataCollector):
 	def getDomains(self):
 		return self.domains.keys()
 	
-	def getFilesInCommit(self, rev):
-		try:
-			res = self.cache['files_in_tree'][rev]
-		except:
-			res = int(getpipeoutput(['git ls-tree -r --name-only "%s"' % rev, 'wc -l']).split('\n')[0])
-			if 'files_in_tree' not in self.cache:
-				self.cache['files_in_tree'] = {}
-			self.cache['files_in_tree'][rev] = res
-
-		return res
-
 	def getFirstCommitDate(self):
 		return datetime.datetime.fromtimestamp(self.first_commit_stamp)
 	
 	def getLastCommitDate(self):
 		return datetime.datetime.fromtimestamp(self.last_commit_stamp)
 	
-	def getLinesInBlob(self, sha1):
-		try:
-			res = self.cache['lines_in_blob'][sha1]
-		except:
-			res = int(getpipeoutput(['git cat-file blob %s' % sha1, 'wc -l']).split()[0])
-			if 'lines_in_blob' not in self.cache:
-				self.cache['lines_in_blob'] = {}
-			self.cache['lines_in_blob'][sha1] = res
-		return res
-
 	def getTags(self):
 		lines = getpipeoutput(['git show-ref --tags', 'cut -d/ -f3'])
 		return lines.split('\n')
@@ -1086,7 +1120,7 @@ class HTMLReportCreator(ReportCreator):
 		f.write('</dl>\n')

 		f.write(html_header(2, 'Lines of Code'))
-		f.write('<img src="lines_of_code.png" />')
+                f.write('<img src="lines_of_code.png" />')

 		fg = open(path + '/lines_of_code.dat', 'w')
 		for stamp in sorted(data.changes_by_date.keys()):
@@ -1280,7 +1314,9 @@ plot """
 		plots = []
 		for a in self.authors_to_plot:
 			i = i + 1
-			plots.append("""'lines_of_code_by_author.dat' using 1:%d title "%s" w lines""" % (i, a.replace("\"", "\\\"")))
+			a = a.replace("\"", "\\\"")
+                        a = a.replace('`', '')
+			plots.append("""'lines_of_code_by_author.dat' using 1:%d title "%s" w lines""" % (i, a))
 		f.write(", ".join(plots))
 		f.write('\n')

@@ -1307,7 +1343,9 @@ plot """
 		plots = []
 		for a in self.authors_to_plot:
 			i = i + 1
-			plots.append("""'commits_by_author.dat' using 1:%d title "%s" w lines""" % (i, a.replace("\"", "\\\"")))
+			a = a.replace("\"", "\\\"")
+                        a = a.replace('`', '')
+			plots.append("""'commits_by_author.dat' using 1:%d title "%s" w lines""" % (i, a))
 		f.write(", ".join(plots))
 		f.write('\n')
Author	SHA1	Message	Date
Ciaran Gultnieks	fb0b4bae3c	Do not pass backticks from author names into gnuplot command file	2013-11-04 08:47:28 +00:00
Alexander Strasser	a664c2eb6b	Fix minor documentation issues * author.txt was renamed to AUTHOR * use git shortlog instead of git-shortlog because the latter is not necessarily in PATH Signed-off-by: Heikki Hokkanen <hoxu@users.sf.net>	2013-09-29 10:20:52 +03:00
Heikki Hokkanen	c447e55a7a	Make number of processes configurable. Default to 8.	2013-07-29 17:42:36 +03:00
Andrey Devyatkin	8647c75d48	Fix performance issue for huge repositories Problem: gitstats will read every commit and every file in repository in one thread during initial statistics generation (i.e. no cache available). It may take much time in case of huge repositories (100 000+ files) Solution: Execute all read commands in 24 threads instead of one Signed-off-by: Heikki Hokkanen <hoxu@users.sf.net>	2013-07-29 17:39:13 +03:00
Heikki Hokkanen	31ff072d23	Bump copyright year.	2013-07-26 22:25:00 +03:00