Analysing SVN Commits Across Committer Groups

Here is an example of using the SVNKit API to crawl an SVN repository and pick up the commit sizes. It uses a very simple (and incorrect) heuristic for estimating the number of lines changed per commit – it just takes the absolute value of the difference between the number of lines added and the number of lines removed in each commit (plus one, so the estimate is never zero).

The code below will produce a comma-separated values file containing the revision number, author, commit time, and line change count estimate.

Loading the resulting file into R allows us to apply some analysis. We can plot the total number of commits per committer:

Commits By Author

Commits By Author

Or look at the total number of lines committed on each commit:


And look at some summary stats (again, per author):

$user1
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.0 1.0 5.0 439.3 45.5 45100.0

$user2
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.0 3.0 26.0 294.9 105.5 62700.0

$user3
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 1.00 1.00 46.64 5.00 22300.00

$user4
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.0 5.5 51.0 225.5 166.0 1882.0

$user5
Min. 1st Qu. Median Mean 3rd Qu. Max.
39.0 108.0 267.0 231.4 298.0 445.0

$user6
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.0 2.0 7.0 181.3 41.0 21170.0

$user7
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.0 5.0 34.5 164.8 136.0 3066.0

You can see from the entries for the first couple of authors above that the mean is skewed by some very large commits – making the median a much more robust measure of average lines per commit.

package com.researchkitchen.svn;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.regex.Pattern;

import org.tmatesoft.svn.core.SVNException;
import org.tmatesoft.svn.core.SVNLogEntry;
import org.tmatesoft.svn.core.SVNURL;
import org.tmatesoft.svn.core.auth.ISVNAuthenticationManager;
import org.tmatesoft.svn.core.internal.io.svn.SVNRepositoryFactoryImpl;
import org.tmatesoft.svn.core.io.SVNRepository;
import org.tmatesoft.svn.core.io.SVNRepositoryFactory;
import org.tmatesoft.svn.core.wc.SVNClientManager;
import org.tmatesoft.svn.core.wc.SVNDiffClient;
import org.tmatesoft.svn.core.wc.SVNRevision;
import org.tmatesoft.svn.core.wc.SVNWCUtil;

public class SVNClient {

  @SuppressWarnings("unchecked")
   public static void main(String[] args) throws IOException {
     final String url = "svn://myserver/myproject/trunk";
     final String name = "rory";
     final String pass = "password";
     BufferedWriter writer = new BufferedWriter(new FileWriter(new File("svn-stats.dat")));
     SimpleDateFormat formatter = new SimpleDateFormat("dd/M/yyyy HH:mm:ss");

    try {
       SVNRepositoryFactoryImpl.setup();
       SVNURL svnUrl = SVNURL.parseURIDecoded(url);
       ISVNAuthenticationManager authManager = SVNWCUtil.createDefaultAuthenticationManager(name, pass);
       SVNRepository repo = SVNRepositoryFactory.create(svnUrl);
       repo.setAuthenticationManager(authManager);

      // Create a diff client
       SVNClientManager clientManager = SVNClientManager.newInstance();
       SVNDiffClient diffClient = clientManager.getDiffClient();

      writer.write("Revision,Author,Date,LinesChanged\n");

      // Get svn log for entire repo history
       long currentRev = repo.getLatestRevision();
       ArrayList<SVNLogEntry> entries = new ArrayList<SVNLogEntry>(repo.log(new String[] {""}, null, 1, currentRev, true, true));

      // Diff all subsequent revisions
       for (int i = 1; i < entries.size(); ++i) {
         int changedThisCommit = 0;
         SVNLogEntry current = entries.get(i);
         SVNLogEntry prev = entries.get(i-1);

        System.out.println("Revision " + current.getRevision()
             + " committed by " + current.getAuthor());

        ByteArrayOutputStream io = new ByteArrayOutputStream();
         System.out.println("Diff between " + current.getRevision() + "=>" + prev.getRevision() + ":");
         diffClient.doDiff(svnUrl, SVNRevision.HEAD, SVNRevision.create(prev.getRevision()),
             SVNRevision.create(current.getRevision()), true, false,io);

        // Very basic (and probably wrong) changed lines metric
         // see http://en.wikipedia.org/wiki/Diff#Unified_format
         BufferedReader br = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(io.toByteArray())));
         String line = null;
         while((line = br.readLine()) != null) {
           if (line.matches("^\\+([^\\+]).*"))
             changedThisCommit++;
           else if (line.matches("^\\-([^\\-]).*"))
             changedThisCommit--;
         }
         changedThisCommit = (changedThisCommit < 0 ? -changedThisCommit : changedThisCommit) + 1;
         System.out.println("Lines changed this commit:" + changedThisCommit);
         br.close();

        writer.write(current.getRevision() + "," + current.getAuthor() + ","
             + formatter.format(current.getDate()) + "," + changedThisCommit + "\n");      
       }
       writer.close();
     } catch (SVNException se) {
       se.printStackTrace();
     } catch (FileNotFoundException e) {
       e.printStackTrace();
     }
   }
}