Debian: rewrite @file; faster, lighter, and more flexible (now uses the Debian website instead of a local copy)

2011-03-03 16:41:40 +01:00 · 2011-03-03 16:41:40 +01:00 · 5674e2a04f
parent cc6106e38e
commit 5674e2a04f
2 changed files with 74 additions and 214 deletions
--- a/Debian/plugin.py
+++ b/Debian/plugin.py
@ -29,11 +29,9 @@

 import os
 import re
-import gzip
 import time
-import popen2
+import urllib
 import fnmatch
-import threading

 import BeautifulSoup

@ -46,201 +44,76 @@ import supybot.ircutils as ircutils
 import supybot.callbacks as callbacks
 from supybot.utils.iter import all, imap, ifilter

-class PeriodicFileDownloader(object):
-    """A class to periodically download a file/files.
-
-    A class-level dictionary 'periodicFiles' maps names of files to
-    three-tuples of
-    (url, seconds between downloads, function to run with downloaded file).
-
-    'url' should be in some form that urllib2.urlopen can handle (do note that
-    urllib2.urlopen handles file:// links perfectly well.)
-
-    'seconds between downloads' is the number of seconds between downloads,
-    obviously.  An important point to remember, however, is that it is only
-    engaged when a command is run.  I.e., if you say you want the file
-    downloaded every day, but no commands that use it are run in a week, the
-    next time such a command is run, it'll be using a week-old file.  If you
-    don't want such behavior, you'll have to give an error mess age to the user
-    and tell him to call you back in the morning.
-
-    'function to run with downloaded file' is a function that will be passed
-    a string *filename* of the downloaded file.  This will be some random
-    filename probably generated via some mktemp-type-thing.  You can do what
-    you want with this; you may want to build a database, take some stats,
-    or simply rename the file.  You can pass None as your function and the
-    file with automatically be renamed to match the filename you have it listed
-    under.  It'll be in conf.supybot.directories.data, of course.
-
-    Aside from that dictionary, simply use self.getFile(filename) in any method
-    that makes use of a periodically downloaded file, and you'll be set.
-    """
-    periodicFiles = None
-    def __init__(self, *args, **kwargs):
-        if self.periodicFiles is None:
-            raise ValueError, 'You must provide files to download'
-        self.lastDownloaded = {}
-        self.downloadedCounter = {}
-        for filename in self.periodicFiles:
-            if self.periodicFiles[filename][-1] is None:
-                fullname = os.path.join(conf.supybot.directories.data(),
-                                        filename)
-                if os.path.exists(fullname):
-                    self.lastDownloaded[filename] = os.stat(fullname).st_ctime
-                else:
-                    self.lastDownloaded[filename] = 0
-            else:
-                self.lastDownloaded[filename] = 0
-            self.currentlyDownloading = set()
-            self.downloadedCounter[filename] = 0
-            self.getFile(filename)
-        super(PeriodicFileDownloader, self).__init__(*args, **kwargs)
-
-    def _downloadFile(self, filename, url, f):
-        self.currentlyDownloading.add(filename)
-        try:
-            try:
-                infd = utils.web.getUrlFd(url)
-            except IOError, e:
-                self.log.warning('Error downloading %s: %s', url, e)
-                return
-            except utils.web.Error, e:
-                self.log.warning('Error downloading %s: %s', url, e)
-                return
-            confDir = conf.supybot.directories.data()
-            newFilename = os.path.join(confDir, utils.file.mktemp())
-            outfd = file(newFilename, 'wb')
-            start = time.time()
-            s = infd.read(4096)
-            while s:
-                outfd.write(s)
-                s = infd.read(4096)
-            infd.close()
-            outfd.close()
-            self.log.info('Downloaded %s in %s seconds',
-                          filename, time.time()-start)
-            self.downloadedCounter[filename] += 1
-            self.lastDownloaded[filename] = time.time()
-            if f is None:
-                toFilename = os.path.join(confDir, filename)
-                if os.name == 'nt':
-                    # Windows, grrr...
-                    if os.path.exists(toFilename):
-                        os.remove(toFilename)
-                os.rename(newFilename, toFilename)
-            else:
-                start = time.time()
-                f(newFilename)
-                total = time.time() - start
-                self.log.info('Function ran on %s in %s seconds',
-                              filename, total)
-        finally:
-            self.currentlyDownloading.remove(filename)
-
-    def getFile(self, filename):
-        if world.documenting:
-            return
-        (url, timeLimit, f) = self.periodicFiles[filename]
-        if time.time() - self.lastDownloaded[filename] > timeLimit and \
-           filename not in self.currentlyDownloading:
-            self.log.info('Beginning download of %s', url)
-            args = (filename, url, f)
-            name = '%s #%s' % (filename, self.downloadedCounter[filename])
-            t = threading.Thread(target=self._downloadFile, name=name,
-                                 args=(filename, url, f))
-            t.setDaemon(True)
-            t.start()
-            world.threadsSpawned += 1
-
-
-class Debian(callbacks.Plugin, PeriodicFileDownloader):
+class Debian(callbacks.Plugin):
    threaded = True
-    periodicFiles = {
-        # This file is only updated once a week, so there's no sense in
-        # downloading a new one every day.
-        'Contents-i386.gz': ('ftp://ftp.us.debian.org/'
-                             'debian/dists/unstable/Contents-i386.gz',
-                             604800, None)
-        }

-    def __init__(self, irc):
-        callbacks.Plugin.__init__(self, irc)
-        PeriodicFileDownloader.__init__(self)
+    _debreflags = re.DOTALL | re.MULTILINE
+    _deblistreFileExact = re.compile(r'<a href="/[^/>]+/[^/>]+">([^<]+)</a>',
+                                     _debreflags)
+    def file(self, irc, msg, args, optlist, filename):
+        """[--exact]
+        [--mode {path,filename,exactfilename}]
+        [--branch {oldstable,stable,testing,unstable,experimental}] \
+        [--section {main,contrib,non-free}] <file name>

-    contents = conf.supybot.directories.data.dirize('Contents-i386.gz')
-    def file(self, irc, msg, args, optlist, glob):
-        """[--{regexp,exact} <value>] [<glob>]
-
-        Returns packages in Debian that includes files matching <glob>. If
-        --regexp is given, returns packages that include files matching the
-        given regexp.  If --exact is given, returns packages that include files
-        matching exactly the string given.
-        """
-        self.getFile('Contents-i386.gz')
-        # Make sure it's anchored, make sure it doesn't have a leading slash
-        # (the filenames don't have leading slashes, and people may not know
-        # that).
-        if not optlist and not glob:
-            raise callbacks.ArgumentError
-        if optlist and glob:
-            irc.error('You must specify either a glob or a regexp/exact '
-                      'search, but not both.', Raise=True)
-        for (option, arg) in optlist:
-            if option == 'exact':
-                regexp = arg.lstrip('/')
-            elif option == 'regexp':
-                regexp = arg
-        if glob:
-            regexp = fnmatch.translate(glob.lstrip('/'))
-            regexp = regexp.rstrip('$')
-            regexp = "%s.* " % regexp
+        Returns the package(s) containing the <file name>.
+        --mode defaults to path, and defines how to search.
+        --branch defaults to stable, and defines in what branch to search."""
+        url = 'http://packages.debian.org/search?searchon=contents' + \
+              '&keywords=%(keywords)s&mode=%(mode)s&suite=%(suite)s' + \
+              '&arch=%(arch)s'
+        args = {'keywords': None, 'mode': 'path', 'suite': 'stable',
+                'arch': 'any'}
+        exact = ('exact', True) in optlist
+        for (key, value) in optlist:
+            if key == 'branch':
+                args['suite'] = value
+            elif key == 'arch':
+                args['arch'] = value
+            elif key == 'mode':
+                args['mode'] = value
+        responses = []
+        if '*' in filename:
+            irc.error('Wildcard characters can not be specified.', Raise=True)
+        args['keywords'] = urllib.quote(filename, '')
+        url %= args
        try:
-            re_obj = re.compile(regexp, re.I)
-        except re.error, e:
-            irc.error(format('Error in regexp: %s', e), Raise=True)
-        if self.registryValue('pythonZgrep'):
-            fd = gzip.open(self.contents)
-            r = imap(lambda tup: tup[0],
-                     ifilter(lambda tup: tup[0],
-                             imap(lambda line:(re_obj.search(line), line),fd)))
+            html = utils.web.getUrl(url)
+        except utils.web.Error, e:
+            irc.error(format('I couldn\'t reach the search page (%s).', e),
+                      Raise=True)
+        if 'is down at the moment' in html:
+            irc.error('Packages.debian.org is down at the moment.  '
+                      'Please try again later.', Raise=True)
+        step = 0
+        pkgs = []
+        for line in html.split('\n'):
+            if '<span class="keyword">' in line:
+                step += 1
+            elif step == 1 or (step >= 1 and not exact):
+                pkgs.extend(self._deblistreFileExact.findall(line))
+        if pkgs == []:
+            irc.reply(format('No filename found for %s (%s)',
+                      utils.web.urlunquote(filename), args['suite']))
        else:
-            try:
-                (r, w) = popen2.popen4(['zgrep', '-e', regexp, self.contents])
-                w.close()
-            except TypeError:
-                # We're on Windows.
-                irc.error('This command won\'t work on this platform.  '
-                          'If you think it should (i.e., you know that you '
-                          'have a zgrep binary somewhere) then file a bug '
-                          'about it at http://supybot.sf.net/ .', Raise=True)
-        packages = set()  # Make packages unique
-        try:
-            for line in r:
-                if len(packages) > 100:
-                    irc.error('More than 100 packages matched, '
-                              'please narrow your search.', Raise=True)
-                try:
-                    if hasattr(line, 'group'): # we're actually using
-                        line = line.group(0)   # pythonZgrep  :(
-                    (filename, pkg_list) = line.split()
-                    if filename == 'FILE':
-                        # This is the last line before the actual files.
-                        continue
-                except ValueError: # Unpack list of wrong size.
-                    continue       # We've not gotten to the files yet.
-                packages.update(pkg_list.split(','))
-        finally:
-            if hasattr(r, 'close'):
-                r.close()
-        if len(packages) == 0:
-            irc.reply('I found no packages with that file.')
-        else:
-            irc.reply(format('%L', sorted(packages)))
-    file = wrap(file, [getopts({'regexp':'regexpMatcher','exact':'something'}),
-                       additional('glob')])
+            irc.reply(format('%i matches found: %s (%s)',
+                          len(pkgs), '; '.join(pkgs), args['suite']))
+    file = wrap(file, [getopts({'exact': '',
+                                'branch': ('literal', ('oldstable',
+                                                       'stable',
+                                                       'testing',
+                                                       'unstable',
+                                                       'experimental')),
+                                'mode': ('literal', ('path',
+                                                     'exactfilename',
+                                                     'filename')),
+                                'arch': ('literal', ('main',
+                                                     'contrib',
+                                                     'non-free'))}),
+                                'text'])

    _debreflags = re.DOTALL | re.IGNORECASE
-    _deblistre = re.compile(r'<h3>Package ([^<]+)</h3>(.*?)</ul>', _debreflags)
+    _deblistreVersion = re.compile(r'<h3>Package ([^<]+)</h3>(.*?)</ul>', _debreflags)
    def version(self, irc, msg, args, optlist, package):
        """[--exact] \
        [--searchon {names,all,sourcenames}]
@ -279,7 +152,7 @@ class Debian(callbacks.Plugin, PeriodicFileDownloader):
        if 'is down at the moment' in html:
            irc.error('Packages.debian.org is down at the moment.  '
                      'Please try again later.', Raise=True)
-        pkgs = self._deblistre.findall(html)
+        pkgs = self._deblistreVersion.findall(html)
        if not pkgs:
            irc.reply(format('No package found for %s (%s)',
                      utils.web.urlunquote(package), args['suite']))
--- a/Debian/test.py
+++ b/Debian/test.py
@ -39,24 +39,6 @@ class DebianTestCase(PluginTestCase):
    cleanDataDir = False
    fileDownloaded = False
    if network:
-        def setUp(self, nick='test'):
-            PluginTestCase.setUp(self)
-            try:
-                datadir = conf.supybot.directories.data
-                if os.path.exists(datadir.dirize('Contents-i386.gz')):
-                    pass
-                else:
-                    print
-                    print "Downloading files, this may take awhile."
-                    filename = datadir.dirize('Contents-i386.gz')
-                    while not os.path.exists(filename):
-                        time.sleep(1)
-                    print "Download complete."
-                    print "Starting test ..."
-                    self.fileDownloaded = True
-            except KeyboardInterrupt:
-                pass
-
        def testDebBugNoHtml(self):
            self.assertNotRegexp('debian bug 287792', r'\<em\>')

@ -77,10 +59,15 @@ class DebianTestCase(PluginTestCase):
                              r'^No package.*')

        def testDebfile(self):
-            self.assertHelp('file')
-            if not self.fileDownloaded:
-                pass
-            self.assertRegexp('file --exact bin/gaim', r'net/gaim')
+            self.assertHelp('debian file')
+            self.assertRegexp('debian file oigrgrgregg',
+                              r'^No filename.*\(stable\)')
+            self.assertRegexp('debian file --branch unstable alkdjfad',
+                r'^No filename.*\(unstable\)')
+            self.assertRegexp('debian file --exact --branch stable /bin/sh',
+                              r'2 matches found:.*bash.*dash.*\(stable')
+            self.assertRegexp('debian file --branch stable /bin/sh',
+                              r'3 matches found:.*bash.*dash.*klibc-utils')

        def testDebincoming(self):
            self.assertNotError('incoming')