109 lines
2.8 KiB
Python
109 lines
2.8 KiB
Python
|
import urllib
|
||
|
import urlparse
|
||
|
import re
|
||
|
|
||
|
# this script will read in cards from a text file, search the magiccards.info site, and
|
||
|
# add urls for card images and info if they don't exist
|
||
|
|
||
|
#####################################################
|
||
|
# variables to be changed
|
||
|
|
||
|
# directory that holds the card data files
path = "../resources/magic/data/"

# card files to read from
ifile1 = open(path + "cards - Copy.txt", "r")
ifile2 = open(path + "cards2 - Copy.txt", "r")

# card files to write to (must not be the same files as the inputs,
# because opening with "w" truncates them immediately)
ofile1 = open(path + "cards.txt", "w")
ofile2 = open(path + "cards2.txt", "w")
|
||
|
|
||
|
|
||
|
#####################################################
|
||
|
|
||
|
def url_fix(s, charset='utf-8'):
    """Percent-encode the unsafe characters in a URL's path and query.

    The URL is split into its five components.  The path is quoted with
    ``/`` and ``%`` left untouched, and the query string is quoted with
    spaces turned into ``+`` and ``:``, ``&`` and ``=`` left untouched.
    Scheme, host and fragment are passed through unchanged.

    Runs on both Python 2 and Python 3; the URL helpers are imported
    locally because they live in different modules on the two versions.

    :param s: the URL to fix (text or byte string).
    :param charset: encoding used for non-ASCII characters; characters
        that cannot be converted are dropped.
    :return: the fixed URL as a native ``str``.
    """
    try:
        from urllib.parse import quote, quote_plus, urlsplit, urlunsplit
        is_py3 = True
    except ImportError:
        from urllib import quote, quote_plus
        from urlparse import urlsplit, urlunsplit
        is_py3 = False

    if is_py3:
        # Work on text throughout so urlunsplit() never sees a mix of
        # bytes and str components.
        if isinstance(s, bytes):
            s = s.decode(charset, 'ignore')
        quote_options = {'encoding': charset, 'errors': 'ignore'}
    else:
        # Python 2's quote() cannot handle non-ASCII unicode objects, so
        # they are encoded to a byte string first.
        if isinstance(s, unicode):  # noqa: F821 - only reached on Python 2
            s = s.encode(charset, 'ignore')
        quote_options = {}

    scheme, netloc, path, qs, anchor = urlsplit(s)
    path = quote(path, '/%', **quote_options)
    qs = quote_plus(qs, ':&=', **quote_options)
    return urlunsplit((scheme, netloc, path, qs, anchor))
|
||
|
|
||
|
def write_out_card(ofile, name, prevCardStr, foundImageUrl, foundInfoUrl):
    """Write one card record to *ofile*, adding any missing URL lines.

    The record is written as a ``>name`` line, then (only when missing) a
    ``url=`` and/or ``image=`` line scraped from a magiccards.info search
    for the card, then the card's already-known property lines.

    No network request is made when both URLs are already present.

    :param ofile: writable file-like object that receives the record.
    :param name: card name, without the leading ``>``.
    :param prevCardStr: the card's existing property lines, written
        verbatim after any generated URL lines.
    :param foundImageUrl: true if the card already has an ``image=`` line.
    :param foundInfoUrl: true if the card already has a ``url=`` line.
    """
    ofile.write(">" + name + "\n")  # name

    if not (foundImageUrl and foundInfoUrl):
        # urlopen lives in different modules on Python 2 and Python 3.
        try:
            from urllib import urlopen
        except ImportError:
            from urllib.request import urlopen

        queryUrl = url_fix("http://magiccards.info/query?q=" + name + "&v=card&s=cname")

        response = urlopen(queryUrl)
        try:
            html = response.read()
        finally:
            # always release the connection, even if the read fails
            response.close()
        if not isinstance(html, str):
            # Python 3 returns bytes; the patterns below are text.
            # NOTE(review): assumes the page is UTF-8 - confirm.
            html = html.decode("utf-8", "ignore")

        # card info url
        if not foundInfoUrl:
            print("generating info url for " + name)

            # The name is escaped so that regex metacharacters in it
            # (parentheses, "+", "?", ".") are matched literally.
            pattern = "<a href=\"([^\"]*)\">" + re.escape(name) + "</a>"
            match = re.search(pattern, html, re.IGNORECASE)

            if match:
                ofile.write("url=http://magiccards.info" + match.group(1) + "\n")
            else:
                print("Unable to get info url for " + name)

        # card image url (first jpg image on the result page)
        if not foundImageUrl:
            print("generating image url for " + name)

            pattern = "img src=\"([^\"]*jpg)\""
            match = re.search(pattern, html, re.IGNORECASE)

            if match:
                ofile.write("image=" + match.group(1) + "\n")
            else:
                print("Unable to get image url for " + name)

    # write out rest of info to file
    ofile.write(prevCardStr)
|
||
|
|
||
|
|
||
|
def generateURLs(ifile, ofile):
    """Copy card records from *ifile* to *ofile*, filling in missing URLs.

    A line starting with ``>`` begins a new card; the rest of that line
    is the card name.  Every other line is a property of the current
    card.  Each card is passed to ``write_out_card`` together with flags
    saying whether a ``url=`` or ``image=`` property was seen for it.

    Property lines that appear before the first card, and cards with an
    empty name, are not written out.

    :param ifile: readable, line-iterable file object with the card data.
    :param ofile: writable file object that receives the updated records.
    """
    name = ""
    propertyLines = []
    foundImageUrl = False
    foundInfoUrl = False

    for line in ifile:
        if line.startswith(">"):
            # new card: flush the one collected so far
            if name:
                write_out_card(ofile, name, "".join(propertyLines),
                               foundImageUrl, foundInfoUrl)

            # Strip only the line terminator, so that a final line
            # without a trailing newline keeps its last character.
            name = line[1:].rstrip("\r\n")
            propertyLines = []
            foundImageUrl = False
            foundInfoUrl = False
        else:
            # card property
            propertyLines.append(line)

            if "url=" in line:
                foundInfoUrl = True

            if "image=" in line:
                foundImageUrl = True

    # end of input: flush the last card
    if name:
        write_out_card(ofile, name, "".join(propertyLines),
                       foundImageUrl, foundInfoUrl)
|
||
|
|
||
|
# Process both card files.  The files are closed explicitly afterwards so
# that the output is flushed to disk even if a lookup fails part-way.
try:
    generateURLs(ifile1, ofile1)
    generateURLs(ifile2, ofile2)
finally:
    for _cardFile in (ifile1, ofile1, ifile2, ofile2):
        _cardFile.close()
|