Compare commits

...

5 Commits

Author SHA1 Message Date
Ciaran Gultnieks
fb0b4bae3c  Do not pass backticks from author names into gnuplot command file
2013-11-04 08:47:28 +00:00

Alexander Strasser
a664c2eb6b  Fix minor documentation issues

    * author.txt was renamed to AUTHOR
    * use git shortlog instead of git-shortlog
      because the latter is not necessarily in PATH

    Signed-off-by: Heikki Hokkanen <hoxu@users.sf.net>

2013-09-29 10:20:52 +03:00

Heikki Hokkanen
c447e55a7a  Make number of processes configurable. Default to 8.
2013-07-29 17:42:36 +03:00

Andrey Devyatkin
8647c75d48  Fix performance issue for huge repositories

    Problem: gitstats reads every commit and every file in the repository in one
    thread during initial statistics generation (i.e. no cache available). This
    may take a long time for huge repositories (100 000+ files).
    Solution: execute all read commands in 24 threads instead of one.

    Signed-off-by: Heikki Hokkanen <hoxu@users.sf.net>

2013-07-29 17:39:13 +03:00

Heikki Hokkanen
31ff072d23  Bump copyright year.
2013-07-26 22:25:00 +03:00
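Taken together, 8647c75d48 and c447e55a7a replace the single-threaded per-revision reads with a multiprocessing.Pool whose size comes from the new 'processes' setting (default 8). A minimal standalone sketch of that pattern, assuming plain subprocess calls instead of gitstats' getpipeoutput and leaving out the cache handling; the names count_files_in_rev and count_files are illustrative, not from the diff:

    import subprocess
    from multiprocessing import Pool

    def count_files_in_rev(rev):
        # One cheap git call per revision, the same job getnumoffilesfromrev does below.
        out = subprocess.check_output(['git', 'ls-tree', '-r', '--name-only', rev])
        return (rev, len(out.splitlines()))

    def count_files(revs, processes=8):  # default mirrors conf['processes'] = 8
        # The worker is a module-level function so Pool can pickle it -- the same
        # reason the diff moves getnumoffilesfromrev/getnumoflinesinblob out of the
        # DataCollector class.
        pool = Pool(processes=processes)
        try:
            return dict(pool.map(count_files_in_rev, revs))
        finally:
            pool.close()
            pool.join()

    if __name__ == '__main__':
        revs = subprocess.check_output(['git', 'rev-list', 'HEAD']).decode().split()
        print(count_files(revs))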
3 changed files with 79 additions and 37 deletions


@@ -2,7 +2,7 @@ Author can be reached by sending e-mail to <hoxu@users.sf.net>.
 Include "gitstats" in the subject or prepare to battle the spam filters.
 See the following command for list of authors who have contributed:
-$ git-shortlog HEAD
+$ git shortlog HEAD
 Also thanks to the following people:
 Alexander Botero-Lowry


@@ -53,6 +53,10 @@ How many domains to show in domains by commits.
 Maximum file extension length.
+=item processes
+
+Number of concurrent processes to use when extracting git repository data.
+
 =item project_name
 Project name to show on the generated pages. Default is to use basename of the repository directory.

gitstats

@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright (c) 2007-2012 Heikki Hokkanen <hoxu@users.sf.net> & others (see doc/author.txt)
+# Copyright (c) 2007-2013 Heikki Hokkanen <hoxu@users.sf.net> & others (see doc/AUTHOR)
 # GPLv2 / GPLv3
 import datetime
 import getopt
@@ -14,6 +14,12 @@ import sys
 import time
 import zlib
 
+if sys.version_info < (2, 6):
+    print >> sys.stderr, "Python 2.6 or higher is required for gitstats"
+    sys.exit(1)
+
+from multiprocessing import Pool
+
 os.environ['LC_ALL'] = 'C'
 
 GNUPLOT_COMMON = 'set terminal png transparent size 640,240\nset size 1.0,1.0\n'
@@ -40,7 +46,8 @@ conf = {
     'commit_end': 'HEAD',
     'linear_linestats': 1,
     'project_name': '',
-    'merge_authors': {}
+    'merge_authors': {},
+    'processes': 8,
 }
 
 def getpipeoutput(cmds, quiet = False):
@@ -104,6 +111,20 @@ def getgitversion():
 def getgnuplotversion():
     return getpipeoutput(['%s --version' % gnuplot_cmd]).split('\n')[0]
 
+def getnumoffilesfromrev(time_rev):
+    """
+    Get number of files changed in commit
+    """
+    time, rev = time_rev
+    return (int(time), rev, int(getpipeoutput(['git ls-tree -r --name-only "%s"' % rev, 'wc -l']).split('\n')[0]))
+
+def getnumoflinesinblob(ext_blob):
+    """
+    Get number of lines in blob
+    """
+    ext, blob_id = ext_blob
+    return (ext, blob_id, int(getpipeoutput(['git cat-file blob %s' % blob_id, 'wc -l']).split()[0]))
+
 class DataCollector:
     """Manages data collection from a revision control repository."""
     def __init__(self):
@@ -408,14 +429,34 @@ class GitDataCollector(DataCollector):
             # timezone
             self.commits_by_timezone[timezone] = self.commits_by_timezone.get(timezone, 0) + 1
 
-        # TODO Optimize this, it's the worst bottleneck
         # outputs "<stamp> <files>" for each revision
         revlines = getpipeoutput(['git rev-list --pretty=format:"%%at %%T" %s' % getcommitrange('HEAD'), 'grep -v ^commit']).strip().split('\n')
         lines = []
+        revs_to_read = []
+        time_rev_count = []
+        #Look up rev in cache and take info from cache if found
+        #If not append rev to list of rev to read from repo
         for revline in revlines:
             time, rev = revline.split(' ')
-            linecount = self.getFilesInCommit(rev)
-            lines.append('%d %d' % (int(time), linecount))
+            #if cache empty then add time and rev to list of new rev's
+            #otherwise try to read needed info from cache
+            if 'files_in_tree' not in self.cache.keys():
+                revs_to_read.append((time,rev))
+                continue
+            if rev in self.cache['files_in_tree'].keys():
+                lines.append('%d %d' % (int(time), self.cache['files_in_tree'][rev]))
+            else:
+                revs_to_read.append((time,rev))
+
+        #Read revisions from repo
+        time_rev_count = Pool(processes=conf['processes']).map(getnumoffilesfromrev, revs_to_read)
+
+        #Update cache with new revisions and append then to general list
+        for (time, rev, count) in time_rev_count:
+            if 'files_in_tree' not in self.cache:
+                self.cache['files_in_tree'] = {}
+            self.cache['files_in_tree'][rev] = count
+            lines.append('%d %d' % (int(time), count))
 
         self.total_commits += len(lines)
         for line in lines:
@@ -430,6 +471,7 @@ class GitDataCollector(DataCollector):
 
         # extensions and size of files
         lines = getpipeoutput(['git ls-tree -r -l -z %s' % getcommitrange('HEAD', end_only = True)]).split('\000')
+        blobs_to_read = []
         for line in lines:
             if len(line) == 0:
                 continue
@@ -437,7 +479,7 @@
             if parts[0] == '160000' and parts[3] == '-':
                 # skip submodules
                 continue
-            sha1 = parts[2]
+            blob_id = parts[2]
             size = int(parts[3])
             fullpath = parts[4]
@@ -451,15 +493,28 @@
             ext = filename[(filename.rfind('.') + 1):]
             if len(ext) > conf['max_ext_length']:
                 ext = ''
 
             if ext not in self.extensions:
                 self.extensions[ext] = {'files': 0, 'lines': 0}
 
             self.extensions[ext]['files'] += 1
-            try:
-                self.extensions[ext]['lines'] += self.getLinesInBlob(sha1)
-            except:
-                print 'Warning: Could not count lines for file "%s"' % line
+            #if cache empty then add ext and blob id to list of new blob's
+            #otherwise try to read needed info from cache
+            if 'lines_in_blob' not in self.cache.keys():
+                blobs_to_read.append((ext,blob_id))
+                continue
+            if blob_id in self.cache['lines_in_blob'].keys():
+                self.extensions[ext]['lines'] += self.cache['lines_in_blob'][blob_id]
+            else:
+                blobs_to_read.append((ext,blob_id))
+        #Get info abount line count for new blob's that wasn't found in cache
+        ext_blob_linecount = Pool(processes=24).map(getnumoflinesinblob, blobs_to_read)
+        #Update cache and write down info about number of number of lines
+        for (ext, blob_id, linecount) in ext_blob_linecount:
+            if 'lines_in_blob' not in self.cache:
+                self.cache['lines_in_blob'] = {}
+            self.cache['lines_in_blob'][blob_id] = linecount
+            self.extensions[ext]['lines'] += self.cache['lines_in_blob'][blob_id]
 
         # line statistics
         # outputs:
@@ -619,33 +674,12 @@ class GitDataCollector(DataCollector):
     def getDomains(self):
         return self.domains.keys()
 
-    def getFilesInCommit(self, rev):
-        try:
-            res = self.cache['files_in_tree'][rev]
-        except:
-            res = int(getpipeoutput(['git ls-tree -r --name-only "%s"' % rev, 'wc -l']).split('\n')[0])
-            if 'files_in_tree' not in self.cache:
-                self.cache['files_in_tree'] = {}
-            self.cache['files_in_tree'][rev] = res
-        return res
-
     def getFirstCommitDate(self):
         return datetime.datetime.fromtimestamp(self.first_commit_stamp)
 
     def getLastCommitDate(self):
         return datetime.datetime.fromtimestamp(self.last_commit_stamp)
 
-    def getLinesInBlob(self, sha1):
-        try:
-            res = self.cache['lines_in_blob'][sha1]
-        except:
-            res = int(getpipeoutput(['git cat-file blob %s' % sha1, 'wc -l']).split()[0])
-            if 'lines_in_blob' not in self.cache:
-                self.cache['lines_in_blob'] = {}
-            self.cache['lines_in_blob'][sha1] = res
-        return res
-
     def getTags(self):
         lines = getpipeoutput(['git show-ref --tags', 'cut -d/ -f3'])
         return lines.split('\n')
@@ -1086,7 +1120,7 @@ class HTMLReportCreator(ReportCreator):
         f.write('</dl>\n')
         f.write(html_header(2, 'Lines of Code'))
         f.write('<img src="lines_of_code.png" />')
         fg = open(path + '/lines_of_code.dat', 'w')
         for stamp in sorted(data.changes_by_date.keys()):
@@ -1280,7 +1314,9 @@ plot """
         plots = []
         for a in self.authors_to_plot:
             i = i + 1
-            plots.append("""'lines_of_code_by_author.dat' using 1:%d title "%s" w lines""" % (i, a.replace("\"", "\\\"")))
+            a = a.replace("\"", "\\\"")
+            a = a.replace('`', '')
+            plots.append("""'lines_of_code_by_author.dat' using 1:%d title "%s" w lines""" % (i, a))
         f.write(", ".join(plots))
         f.write('\n')
@@ -1307,7 +1343,9 @@ plot """
         plots = []
         for a in self.authors_to_plot:
             i = i + 1
-            plots.append("""'commits_by_author.dat' using 1:%d title "%s" w lines""" % (i, a.replace("\"", "\\\"")))
+            a = a.replace("\"", "\\\"")
+            a = a.replace('`', '')
+            plots.append("""'commits_by_author.dat' using 1:%d title "%s" w lines""" % (i, a))
         f.write(", ".join(plots))
         f.write('\n')
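
The last two hunks are the fb0b4bae3c change: author names are interpolated into double-quoted gnuplot title strings, and gnuplot performs backquote substitution (backquoted text is run as a shell command), so a crafted author name could execute arbitrary commands when the generated plot file is fed to gnuplot. The fix escapes embedded double quotes and strips backticks before the name reaches the plot line. The same sanitisation as a standalone helper; sanitize_gnuplot_title is an illustrative name, not part of gitstats:

    def sanitize_gnuplot_title(name):
        # Escape double quotes so the name cannot break out of the title string,
        # and drop backticks so gnuplot has nothing to substitute and run.
        name = name.replace('"', '\\"')
        return name.replace('`', '')

    # A hostile author name is rendered harmless before it is written out:
    plot_line = """'commits_by_author.dat' using 1:2 title "%s" w lines""" % \
        sanitize_gnuplot_title('Mallory `rm -rf ~`')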