Supybot-plugins/Wikipedia/plugin.py

###
# Copyright (c) 2010, quantumlemur
# Copyright (c) 2011, Valentin Lorentz
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions, and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions, and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#   * Neither the name of the author of this software nor the name of
#     contributors to this software may be used to endorse or promote products
#     derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

###


import re
import string
import urllib
import StringIO
import lxml.html
from lxml import etree
import supybot.utils as utils
from supybot.commands import *
import supybot.plugins as plugins
import supybot.ircutils as ircutils
import supybot.callbacks as callbacks

# plugins.wikipedia.snippetStyle in ['sentence','paragraph','none']


class Wikipedia(callbacks.Plugin):
    """Looks up articles on the English Wikipedia.
    Use the "wiki" command with a search term to get the first paragraph of
    the matching article."""
    threaded = True

    def _fetch(self, irc, addr):
        # Download `addr` and return `(raw_html, parsed_tree)`.
        # On a download failure the user is told about it through `irc.reply`
        # and None is returned, so the caller only has to bail out.
        try:
            article = utils.web.getUrl(addr)
        except Exception:
            irc.reply('Hmm, something went wrong fetching the page. '
                      'I\'m highlighting quantumlemur so he can take a look.')
            return None
        return article, lxml.html.document_fromstring(article)

    # Flow of the command:
    #   1. query Special:Search with the user's text;
    #   2. follow a "Did you mean..." suggestion if the page offers one;
    #   3. if we landed on a list of search results, follow the first result,
    #      otherwise report whether the article was reached via a redirect;
    #   4. reply with a disambiguation list, a "year page" notice, or the
    #      first paragraph of the article followed by its address in bold.
    # The docstring below is kept short and in "<args>\n\ndescription" form;
    # NOTE(review): presumably Supybot shows it as the command help -- confirm.
    def wiki(self, irc, msg, args, search):
        """<search term>

        Returns the first paragraph of a Wikipedia article"""
        reply = ''
        # first, we get the page
        addr = ('http://en.wikipedia.org/wiki/Special:Search?search=%s' %
                urllib.quote_plus(search))
        fetched = self._fetch(irc, addr)
        if fetched is None:
            return
        article, tree = fetched
        # check if it gives a "Did you mean..." redirect
        didyoumean = tree.xpath('//div[@class="searchdidyoumean"]/a'
                                '[@title="Special:Search"]')
        if didyoumean:
            redirect = didyoumean[0].text_content().strip()
            reply += ('I didn\'t find anything for "%s". Did you mean "%s"? ' %
                      (search, redirect))
            addr = 'http://en.wikipedia.org%s' % didyoumean[0].get('href')
            fetched = self._fetch(irc, addr)
            if fetched is None:
                return
            article, tree = fetched
            search = redirect
        # check if it's a page of search results (rather than an article), and
        # if so, retrieve the first result
        searchresults = tree.xpath('//div[@class="searchresults"]/ul/li/a')
        if searchresults:
            redirect = searchresults[0].text_content().strip()
            # The whole message must be parenthesised before applying `%`:
            # `%` binds tighter than `+`, so formatting only the second half
            # with a two-element tuple raises TypeError.
            reply += ('I didn\'t find anything for "%s", but here\'s the '
                      'result for "%s": ' % (search, redirect))
            addr = 'http://en.wikipedia.org%s' % searchresults[0].get('href')
            fetched = self._fetch(irc, addr)
            if fetched is None:
                return
            article, tree = fetched
            search = redirect
        # otherwise, simply return the title and whether it redirected
        elif re.search(r'\(Redirected from <a href=[^>]*>([^<]*)'
                       r'</a>\)', article):
            redirect = tree.xpath('//div[@id="contentSub"]/a')[0]
            redirect = redirect.text_content().strip()
            title = tree.xpath('//*[@class="firstHeading"]')
            title = title[0].text_content().strip()
            reply += '"%s" (Redirect from "%s"): ' % (title, redirect)
        # extract the address we got it from; if the page carries no
        # "Retrieved from" footer, keep the last address we requested instead
        # of crashing on a missing match
        retrieved = re.search('Retrieved from "<a href="([^"]*)">', article)
        if retrieved:
            addr = retrieved.group(1)
        # check if it's a disambiguation page
        if tree.xpath('//table[@id="disambigbox"]'):
            # only the first five candidate links are listed
            candidates = tree.xpath('//div[@id="bodyContent"]/ul/li/a')[:5]
            candidates = [item.text_content() for item in candidates]
            r = utils.str.commaAndify(candidates)
            reply += ('%s is a disambiguation page. Possible results are: %s' %
                      (addr, r))
        # or just as bad, a page listing events in that year
        elif re.search(r'This article is about the year [\d]*\. '
                       r'For the [a-zA-Z ]* [\d]*, see', article):
            reply += ('"%s" is a page full of events that happened in that '
                      'year.  If you were looking for information about the '
                      'number itself, try searching for "%s_(number)", but '
                      'don\'t expect anything useful...') % (search, search)
        else:
            paragraphs = tree.xpath("//div[@id='bodyContent']/p[1]")
            if paragraphs:
                p = paragraphs[0].text_content().strip()
                p = p.encode('utf-8')
                # and finally, return what we've got
                reply += '%s %s' % (p, ircutils.bold(addr))
            else:
                # no leading paragraph to quote: still point at the page
                reply += ('I couldn\'t find a summary paragraph. %s' %
                          ircutils.bold(addr))
        irc.reply(reply)
    wiki = wrap(wiki, ['text'])


# Module-level alias for the plugin class defined in this file.
# NOTE(review): presumably the name Supybot's plugin loader looks up -- confirm.
Class = Wikipedia


# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
Wikipedia: import from quantumlemur repository. 2011-03-01 07:00:46 -08:00			`###`
			`# Copyright (c) 2010, quantumlemur`
Wikipedia: clean plugin and add test case. 2011-03-01 07:39:35 -08:00			`# Copyright (c) 2011, Valentin Lorentz`
Wikipedia: import from quantumlemur repository. 2011-03-01 07:00:46 -08:00			`# All rights reserved.`
			`#`
			`# Redistribution and use in source and binary forms, with or without`
			`# modification, are permitted provided that the following conditions are met:`
			`#`
			`# * Redistributions of source code must retain the above copyright notice,`
			`# this list of conditions, and the following disclaimer.`
			`# * Redistributions in binary form must reproduce the above copyright notice,`
			`# this list of conditions, and the following disclaimer in the`
			`# documentation and/or other materials provided with the distribution.`
			`# * Neither the name of the author of this software nor the name of`
			`# contributors to this software may be used to endorse or promote products`
			`# derived from this software without specific prior written consent.`
			`#`
			`# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"`
			`# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE`
			`# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE`
			`# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR`
			`# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF`
			`# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS`
			`# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN`
			`# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)`
			`# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE`
			`# POSSIBILITY OF SUCH DAMAGE.`

			`###`


			`import re`
			`import string`
			`import urllib`
			`import StringIO`
			`import lxml.html`
			`from lxml import etree`
			`import supybot.utils as utils`
			`from supybot.commands import *`
			`import supybot.plugins as plugins`
			`import supybot.ircutils as ircutils`
			`import supybot.callbacks as callbacks`

			`# plugins.wikipedia.snippetStyle in ['sentence','paragraph','none']`


			`class Wikipedia(callbacks.Plugin):`
			`"""Add the help for "@plugin help Wikipedia" here`
			`This should describe *how* to use this plugin."""`
			`threaded = True`


			`def wiki(self, irc, msg, args, search):`
			`"""<search term>`

			`Returns the first paragraph of a Wikipedia article"""`
Wikipedia: clean plugin and add test case. 2011-03-01 07:39:35 -08:00			`reply = ''`
			`# first, we get the page`
			`addr = 'http://en.wikipedia.org/wiki/Special:Search?search=%s' % \`
			`urllib.quote_plus(search)`
Wikipedia: import from quantumlemur repository. 2011-03-01 07:00:46 -08:00			`try:`
			`article = utils.web.getUrl(addr)`
			`except:`
Wikipedia: clean plugin and add test case. 2011-03-01 07:39:35 -08:00			`irc.reply('Hmm, something went wrong fetching the page. '`
			`'I\'m highlighting quantumlemur so he can take a look.')`
Wikipedia: import from quantumlemur repository. 2011-03-01 07:00:46 -08:00			`return`
Wikipedia: clean plugin and add test case. 2011-03-01 07:39:35 -08:00			`# parse the page`
Wikipedia: import from quantumlemur repository. 2011-03-01 07:00:46 -08:00			`tree = lxml.html.document_fromstring(article)`
Wikipedia: clean plugin and add test case. 2011-03-01 07:39:35 -08:00			`# check if it gives a "Did you mean..." redirect`
			`didyoumean = tree.xpath('//div[@class="searchdidyoumean"]/a'`
			`'[@title="Special:Search"]')`
Wikipedia: import from quantumlemur repository. 2011-03-01 07:00:46 -08:00			`if didyoumean:`
			`redirect = didyoumean[0].text_content().strip()`
Wikipedia: clean plugin and add test case. 2011-03-01 07:39:35 -08:00			`reply += ('I didn\'t find anything for "%s". Did you mean "%s"? ' %`
			`(search, redirect))`
Wikipedia: import from quantumlemur repository. 2011-03-01 07:00:46 -08:00			`addr = 'http://en.wikipedia.org%s' % didyoumean[0].get('href')`
			`article = utils.web.getUrl(addr)`
			`tree = lxml.html.document_fromstring(article)`
			`search = redirect`
Wikipedia: clean plugin and add test case. 2011-03-01 07:39:35 -08:00			`# check if it's a page of search results (rather than an article), and`
			`# if so, retrieve the first result`
Wikipedia: import from quantumlemur repository. 2011-03-01 07:00:46 -08:00			`searchresults = tree.xpath('//div[@class="searchresults"]/ul/li/a')`
			`if searchresults:`
			`redirect = searchresults[0].text_content().strip()`
Wikipedia: clean plugin and add test case. 2011-03-01 07:39:35 -08:00			`reply += 'I didn\'t find anything for "%s", but here\'s the ' + \`
			`'result for "%s": ' % (search, redirect)`
Wikipedia: import from quantumlemur repository. 2011-03-01 07:00:46 -08:00			`addr = 'http://en.wikipedia.org%s' % searchresults[0].get('href')`
			`article = utils.web.getUrl(addr)`
			`tree = lxml.html.document_fromstring(article)`
			`search = redirect`
Wikipedia: clean plugin and add test case. 2011-03-01 07:39:35 -08:00			`# otherwise, simply return the title and whether it redirected`
Wikipedia: import from quantumlemur repository. 2011-03-01 07:00:46 -08:00			`else:`
Wikipedia: clean plugin and add test case. 2011-03-01 07:39:35 -08:00			`redirect = re.search('\(Redirected from <a href=[^>]*>([^<]*)'`
			`'</a>\)', article)`
Wikipedia: import from quantumlemur repository. 2011-03-01 07:00:46 -08:00			`if redirect:`
Wikipedia: clean plugin and add test case. 2011-03-01 07:39:35 -08:00			`redirect = tree.xpath('//div[@id="contentSub"]/a')[0]`
			`redirect = redirect.text_content().strip()`
Wikipedia: import from quantumlemur repository. 2011-03-01 07:00:46 -08:00			`title = tree.xpath('//*[@class="firstHeading"]')`
			`title = title[0].text_content().strip()`
Wikipedia: clean plugin and add test case. 2011-03-01 07:39:35 -08:00			`reply += '"%s" (Redirect from "%s"): ' % (title, redirect)`
			`# extract the address we got it from`
Wikipedia: import from quantumlemur repository. 2011-03-01 07:00:46 -08:00			`addr = re.search('Retrieved from "<a href="([^"]*)">', article)`
			`addr = addr.group(1)`
Wikipedia: clean plugin and add test case. 2011-03-01 07:39:35 -08:00			`# check if it's a disambiguation page`
Wikipedia: import from quantumlemur repository. 2011-03-01 07:00:46 -08:00			`disambig = tree.xpath('//table[@id="disambigbox"]')`
			`if disambig:`
			`disambig = tree.xpath('//div[@id="bodyContent"]/ul/li/a')`
			`disambig = disambig[:5]`
			`disambig = [item.text_content() for item in disambig]`
			`r = utils.str.commaAndify(disambig)`
Wikipedia: clean plugin and add test case. 2011-03-01 07:39:35 -08:00			`reply += '%s is a disambiguation page. Possible results are: %s' %\`
			`(addr, r)`
			`# or just as bad, a page listing events in that year`
			`elif re.search('This article is about the year [\d]*\. '`
			`'For the [a-zA-Z ]* [\d]*, see', article):`
			`reply += ('"%s" is a page full of events that happened in that '`
			`'year. If you were looking for information about the '`
			`'number itself, try searching for "%s_(number)", but '`
			`'don\'t expect anything useful...') % (search, search)`
Wikipedia: import from quantumlemur repository. 2011-03-01 07:00:46 -08:00			`else:`
Wikipedia: clean plugin and add test case. 2011-03-01 07:39:35 -08:00			`##### etree!`
Wikipedia: import from quantumlemur repository. 2011-03-01 07:00:46 -08:00			`p = tree.xpath("//div[@id='bodyContent']/p[1]")[0]`
			`p = p.text_content()`
			`p = p.strip()`
			`p = p.encode('utf-8')`
Wikipedia: clean plugin and add test case. 2011-03-01 07:39:35 -08:00			`# and finally, return what we've got`
			`reply += '%s %s' % (p, ircutils.bold(addr))`
			`irc.reply(reply)`
Wikipedia: import from quantumlemur repository. 2011-03-01 07:00:46 -08:00			`wiki = wrap(wiki, ['text'])`



			`Class = Wikipedia`


			`# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:`