From 6ea7955cefa4abfb6ad1426a8cbf29306aa4b05a Mon Sep 17 00:00:00 2001 From: wait321 Date: Sun, 9 Oct 2011 18:51:35 -0700 Subject: [PATCH] added script to search cards.txt and cards2.txt for missing urls and add them. the script queries magiccards.info --- scripts/generateurls.py | 109 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 scripts/generateurls.py diff --git a/scripts/generateurls.py b/scripts/generateurls.py new file mode 100644 index 0000000000..b081439da2 --- /dev/null +++ b/scripts/generateurls.py @@ -0,0 +1,109 @@ +import urllib +import urlparse +import re + +# this script will read in cards from a text file, search the magiccards.info site, and +# add urls for card images and info if they don't exist + +##################################################### +# variables to be changed + +path = "../resources/magic/data/" + +# input card files +ifile1 = open(path + "cards - Copy.txt", "r"); +ifile2 = open(path + "cards2 - Copy.txt", "r"); + +# output card files (these should be different from input files) +ofile1 = open(path + "cards.txt", "w"); +ofile2 = open(path + "cards2.txt", "w"); + + +##################################################### + +def url_fix(s, charset='utf-8'): + if isinstance(s, unicode): + s = s.encode(charset, 'ignore') + scheme, netloc, path, qs, anchor = urlparse.urlsplit(s) + path = urllib.quote(path, '/%') + qs = urllib.quote_plus(qs, ':&=') + return urlparse.urlunsplit((scheme, netloc, path, qs, anchor)) + +def write_out_card(ofile, name, prevCardStr, foundImageUrl, foundInfoUrl): + # write out info for previous card + + ofile.write(">" + name + "\n") # name + + # generate urls + if((not foundImageUrl) or (not foundInfoUrl)): + queryUrl = url_fix("http://magiccards.info/query?q=" + name + "&v=card&s=cname") + + html = urllib.urlopen(queryUrl).read() + + # card info url + if (not foundInfoUrl): + print "generating info url for " + name + + pattern = "" + name + "" + m = re.findall(pattern, html, re.IGNORECASE) + + if (len(m) > 0): + ofile.write("url=http://magiccards.info" + m[0] + "\n") + else: + print "Unable to get info url for " + name + + # card image url + if (not foundImageUrl): + print "generating image url for " + name + + pattern = "img src=\"([^\"]*jpg)\"" + m = re.findall(pattern, html, re.IGNORECASE) + + if (len(m) > 0): + ofile.write("image=" + m[0] + "\n") + else: + print "Unable to get image url for " + name + + # write out rest of info to file + ofile.write(prevCardStr) + + +def generateURLs(ifile, ofile): + prevCardStr = "" + foundImageUrl = False + foundInfoUrl = False + name = "" + + while ifile: + line = ifile.readline(); + if(len(line) == 0): + if(len(name) > 0): + write_out_card(ofile, name, prevCardStr, foundImageUrl, foundInfoUrl) + break; + # ofile.write(line); + + i = line.find(">"); + if(i > -1): + # new card + + if(len(name) > 0): + write_out_card(ofile, name, prevCardStr, foundImageUrl, foundInfoUrl) + + # reset variables + name = line[1:-1] # set name to new card name + prevCardStr = "" + foundImageUrl = False + foundInfoUrl = False + else: + # card property + + prevCardStr += line + + if(line.find("url=") > -1): + foundInfoUrl = True + + if(line.find("image=") > -1): + foundImageUrl = True + +generateURLs(ifile1, ofile1); +generateURLs(ifile2, ofile2); \ No newline at end of file