Added basic post scraper processor

master
Zequez 2015-08-01 23:17:25 -03:00
parent 0f49346bbc
commit 5aeed3ae42
4 changed files with 2313 additions and 12 deletions

View File

@ -1,5 +1,131 @@
module Scraper
  # Page processor for a forum topic page holding a mod announcement post.
  #
  # It reads the first `.content` element of the page (the first post body)
  # and extracts mod metadata from its links, its first `<ul>` ("info list")
  # and a few page-level elements, returning everything as a single Hash.
  class PostProcessor < Scraper::BaseProcessor
    # regexp %r{\Ahttps?://(www\.)?factorioforums.com/forum/viewtopic\.php\?f=[0-9]+\Z}
    # Only topic URLs with both `f` and `t` parameters are accepted; the only
    # pagination suffix allowed is `&start=0`.
    regexp %r{\Ahttps?://(www\.)?factorioforums\.com/forum/viewtopic\.php\?f=[0-9]+&t=[0-9]+(&start=0)?\Z}

    # Extracts the mod metadata from the parsed page in `@doc`.
    #
    # The page is expected to contain at least one `.content` element; the
    # info list (`<ul>`) and the `h2` title are optional and yield nil /
    # missing keys when absent.
    #
    # @return [Hash] always has :github_url, :download_url, :summary and
    #   :title (each possibly nil); :last_edited_at, :description, :file_name,
    #   :version, :mod_name, :tags, :contact, :authors, :license,
    #   :license_url, :game_version and :last_release_at are only present when
    #   they could be extracted.
    def process_page
      content = @doc.search('.content').first
      links = content.to_s.scan(%r{href="([^"]+)"}).flatten
      info_list = content.search('ul').first
      # match_info reads the parsed rows from @info.
      @info = parse_info_list(info_list)

      post = {
        github_url: find_github_url(links),
        download_url: find_download_url(links),
        summary: extract_summary(info_list),
        title: extract_title
      }
      assign_present(post, :last_edited_at, extract_last_edited_at)
      assign_present(post, :description, extract_description(content))
      post.merge!(extract_file_info)

      if (tags = match_info(/tags?|categories|category/i))
        post[:tags] = split_list(tags[:value])
      end

      if (contact = match_info(/Contact/))
        # Prefer a linked URL; fall back to the plain text of the row.
        post[:contact] = extract_urls(contact[:html]).first || contact[:value]
      end

      if (authors = match_info(/authors?/i))
        post[:authors] = split_list(authors[:value])
      end

      if (license = match_info(/license/i))
        post[:license] = license[:value]
        assign_present(post, :license_url, extract_urls(license[:html]).first)
      end

      if (game_version = match_info(/factorio/i))
        post[:game_version] = split_list(game_version[:value], /[-,]/)
      end

      if (release = match_info(/latest release|last release/i))
        # The date is whatever follows the first comma of the row value.
        assign_present(post, :last_release_at, parse_time(release[:value][/(?<=,).*/]))
      end

      post
    end

    private

    # Returns the first row of the parsed info list whose `type` field
    # (:key by default) matches `regex`, or nil when there is none.
    def match_info(regex, type = :key)
      @info.detect { |pair| pair[type].to_s =~ regex }
    end

    # Returns every http(s) URL found in the string form of `text`.
    # NOTE(review): URI.extract is marked obsolete in recent Ruby versions.
    def extract_urls(text)
      URI.extract(text.to_s).select { |url| url.start_with?('http') }
    end

    # Stores `value` under `key` unless it is nil.
    def assign_present(post, key, value)
      post[key] = value unless value.nil?
    end

    # Splits `value` on `separator` and strips each piece.
    def split_list(value, separator = ',')
      value.split(separator).map(&:strip)
    end

    # Parses free-form date text; returns nil instead of raising when the
    # text is missing or is not a recognisable date.
    def parse_time(text)
      return nil unless text
      DateTime.parse(text)
    rescue ArgumentError
      nil
    end

    # Turns each `<li>` of the info list into {key:, value:, html:}, splitting
    # the item text on its first colon. Items with no text are skipped.
    def parse_info_list(list)
      return [] unless list
      list.search('li').each_with_object([]) do |li, rows|
        parts = li.text.split(':')
        next if parts.empty?
        rows << { key: parts.shift.strip, value: parts.join(':').strip, html: li }
      end
    end

    # First link pointing at github.com, trimmed to two path segments
    # (presumably owner/repository); nil if the post has no such link.
    def find_github_url(links)
      link = links.detect { |url| url.include?('github.com') }
      link && link.gsub(%r{(github\.com/[^/]+/[^/]+).*}, '\1')
    end

    # First forum attachment link, with the `sid` parameter removed and a
    # leading "./" resolved against the directory of the requested URL.
    def find_download_url(links)
      link = links.detect { |url| url.include?('download/file.php') }
      return nil unless link
      request_uri = URI.parse(@request.url)
      base_path = "#{request_uri.scheme}://#{request_uri.host}#{request_uri.path.sub(%r{[^/]+$}, '')}"
      # Block form so the replacement is never read as a back-reference.
      link.gsub(/&amp;sid=[0-9a-z]+/i, '').sub(%r{\A\./}) { base_path }
    end

    # Text that precedes the first `<li>` inside the info list.
    def extract_summary(list)
      return nil unless list
      list.children.take_while { |node| node.name != 'li' }.map(&:text).join.strip
    end

    # Topic title with any [bracketed] and (parenthesised) parts removed.
    def extract_title
      heading = @doc.search('#page-body h2').first
      return nil unless heading
      heading.text.gsub(/\[[^\]]+\]/, '').gsub(/\([^)]+\)/, '').strip
    end

    # Time taken from the text following "</a> on " in a post notice.
    # NOTE(review): the selector covers the notices of every post on the
    # page, not only the first post - confirm this is intended.
    def extract_last_edited_at
      parse_time(@doc.search('.postbody .notice').to_s[%r{(?<=</a> on ).+?(am|pm)}])
    end

    # Text after "Long Description" up to "Bugs" or the end of that line.
    def extract_description(content)
      found = content.text.match(/Long Description:?(.*?)(?:Bugs|$)/)
      found && found[1]
    end

    # Derives :file_name, :version and :mod_name from the info row whose
    # value ends in ".zip". Returns only the keys that could be extracted.
    def extract_file_info
      row = match_info(/\.zip$/, :value)
      return {} unless row
      file_name = row[:value][/[^\s]+[_-][0-9\.]+\.zip/]
      return {} unless file_name
      file_info = { file_name: file_name }
      version = file_name[/[_-]([0-9\.]+)\.zip/, 1]
      return file_info unless version
      file_info[:version] = version
      mod_name = file_name[/^(.*?)[_-]\d/, 1]
      file_info[:mod_name] = mod_name if mod_name
      file_info
    end
  end
end

View File

@ -649,7 +649,7 @@ http_interactions:
by <a href=\"http://www.phpbb.com/\">phpBB</a>&reg; Forum Software &copy;
phpBB Group\n\t\t\n\t</div>\n</div>\n\n</div>\n\n<div>\n\t<a id=\"bottom\"
name=\"bottom\" accesskey=\"z\"></a>\n\t\n</div>\n\n</body>\n</html>"
http_version:
http_version:
recorded_at: Wed, 22 Jul 2015 18:35:13 GMT
- request:
method: get
@ -1300,7 +1300,7 @@ http_interactions:
by <a href=\"http://www.phpbb.com/\">phpBB</a>&reg; Forum Software &copy;
phpBB Group\n\t\t\n\t</div>\n</div>\n\n</div>\n\n<div>\n\t<a id=\"bottom\"
name=\"bottom\" accesskey=\"z\"></a>\n\t\n</div>\n\n</body>\n</html>"
http_version:
http_version:
recorded_at: Wed, 22 Jul 2015 18:35:14 GMT
- request:
method: get
@ -1898,7 +1898,7 @@ http_interactions:
by <a href=\"http://www.phpbb.com/\">phpBB</a>&reg; Forum Software &copy;
phpBB Group\n\t\t\n\t</div>\n</div>\n\n</div>\n\n<div>\n\t<a id=\"bottom\"
name=\"bottom\" accesskey=\"z\"></a>\n\t\n</div>\n\n</body>\n</html>"
http_version:
http_version:
recorded_at: Wed, 22 Jul 2015 18:35:15 GMT
- request:
method: get
@ -2549,7 +2549,7 @@ http_interactions:
by <a href=\"http://www.phpbb.com/\">phpBB</a>&reg; Forum Software &copy;
phpBB Group\n\t\t\n\t</div>\n</div>\n\n</div>\n\n<div>\n\t<a id=\"bottom\"
name=\"bottom\" accesskey=\"z\"></a>\n\t\n</div>\n\n</body>\n</html>"
http_version:
http_version:
recorded_at: Wed, 22 Jul 2015 18:35:15 GMT
- request:
method: get
@ -3189,6 +3189,6 @@ http_interactions:
by <a href=\"http://www.phpbb.com/\">phpBB</a>&reg; Forum Software &copy;
phpBB Group\n\t\t\n\t</div>\n</div>\n\n</div>\n\n<div>\n\t<a id=\"bottom\"
name=\"bottom\" accesskey=\"z\"></a>\n\t\n</div>\n\n</body>\n</html>"
http_version:
http_version:
recorded_at: Wed, 22 Jul 2015 18:35:15 GMT
recorded_with: VCR 2.9.3

2117
spec/fixtures/vcr_cassettes/forum_post.yml vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,10 +1,68 @@
describe Scraper::PostProcessor, vcr: { cassette_name: 'forum_post', record: :new_episodes } do
describe Scraper::PostProcessor do
# Spec helper: scrapes +page_url+ and keeps the scraper and its result in
# @scraper / @result so the examples can inspect them.
# NOTE(review): this listing appears to interleave removed and added diff
# lines - the SubforumProcessor run below is immediately overwritten by the
# PostProcessor run inside the cassette block; confirm which lines are current.
def scrap(page_url)
@scraper = Scraper::Base.new page_url, Scraper::SubforumProcessor
@result = @scraper.scrap
# Run the scrape inside the 'forum_post' VCR cassette (record mode :new_episodes).
VCR.use_cassette('forum_post', record: :new_episodes) do
@scraper = Scraper::Base.new page_url, Scraper::PostProcessor
@result = @scraper.scrap
end
end
# describe 'URL detection' do
# it 'should detect forum posts pages'
# end
# Checks which URLs the post processor accepts: first-page topic URLs are
# scraped, anything else raises Scraper::NoPageProcessorFoundError.
describe 'URL detection' do
  it 'should detect forum posts pages URLs' do
    expect { scrap('http://www.factorioforums.com/forum/viewtopic.php?f=93&t=14371') }
      .not_to raise_error
  end

  it 'should raise an error with a non-post URL' do
    expect { scrap('http://www.factorioforums.com/forum/viewforum.php?f=83') }
      .to raise_error(Scraper::NoPageProcessorFoundError)
  end

  it 'should raise an error with pagination' do
    expect { scrap('http://www.factorioforums.com/forum/viewtopic.php?f=43&t=6456&start=20') }
      .to raise_error(Scraper::NoPageProcessorFoundError)
  end
end
# Field-by-field expectations for posts written with different layouts.
describe 'Post layouts detection' do
  # A post that follows the announcement template closely.
  describe 'pretty perfect template layout' do
    before(:all) do
      scrap('http://www.factorioforums.com/forum/viewtopic.php?f=91&t=14294')
    end

    subject { @result.first }

    its([:summary]) { is_expected.to eq('') }
    its([:title]) { is_expected.to eq('Science Cost Tweaker Mod') }
    its([:mod_name]) { is_expected.to eq('ScienceCostTweaker') }
    its([:description]) do
      is_expected.to eq('This mod can be used as a simple alternative to marathon mod. It increases science costs significantly (4x to 9x depending on tier) - you need bigger factory to feed your science labs. Science also now has its own dedicated production lines and intermediate products. No more making science packs from conveyor belts and inserters!')
    end
    its([:game_version]) { is_expected.to eq(['0.12.x']) }
    its([:download_url]) { is_expected.to eq('http://www.factorioforums.com/forum/download/file.php?id=4985') }
    its([:file_name]) { is_expected.to eq('ScienceCostTweaker_0.12.4.zip') }
    its([:version]) { is_expected.to eq('0.12.4') }
    its([:last_release_at]) { is_expected.to be_within(1.hour).of(DateTime.parse('July 30, 2015')) }
    its([:license]) { is_expected.to eq('GPL.') }
    its([:license_url]) { is_expected.to eq('https://github.com/UberWaffe/ScienceCostTweaker/blob/master/LICENSE') }
    its([:github_url]) { is_expected.to eq('https://github.com/UberWaffe/ScienceCostTweaker') }
    its([:authors]) { is_expected.to eq(['UberWaffe']) }
    its([:contact]) { is_expected.to eq('https://github.com/UberWaffe/ScienceCostTweaker') }
    its([:last_edited_at]) { is_expected.to be_within(1.hour).of(DateTime.parse('Jul 31, 2015 11:46 am')) }
    its([:tags]) { is_expected.to eq(['Technology', 'Difficulty', 'Game Length']) }
  end

  # Disabled example group, kept as found.
  # describe 'a layout with a non-standard info-list' do
  #   before(:all){ scrap 'http://www.factorioforums.com/forum/viewtopic.php?f=92&t=13937' }
  #   subject{ @result.first }
  #
  #   its([:summary]) { is_expected.to eq 'Show GUI messages, controlled via the circuit network.' }
  #   its([:title]) { is_expected.to eq 'Circuit GUI' }
  #   its([:mod_name]) { is_expected.to eq 'ScienceCostTweaker' }
  #   its([:description]) { is_expected.to eq '' }
  #   its([:game_version]) { is_expected.to eq ['0.12.x'] }
  #   its([:download_url]) { is_expected.to eq 'http://www.factorioforums.com/forum/download/file.php?id=4985' }
  #   its([:file_name]) { is_expected.to eq 'ScienceCostTweaker_0.12.4.zip' }
  #   its([:version]) { is_expected.to eq '0.12.4' }
  #   its([:last_release_at]) { is_expected.to be_within(1.hour).of DateTime.parse('July 30, 2015') }
  #   its([:license]) { is_expected.to eq 'GPL.' }
  #   its([:license_url]) { is_expected.to eq 'https://github.com/UberWaffe/ScienceCostTweaker/blob/master/LICENSE'}
  #   its([:github_url]) { is_expected.to eq 'https://github.com/UberWaffe/ScienceCostTweaker' }
  #   its([:authors]) { is_expected.to eq ['UberWaffe'] }
  #   its([:contact]) { is_expected.to eq 'https://github.com/UberWaffe/ScienceCostTweaker' }
  #   its([:last_edited_at]) { is_expected.to be_within(1.hour).of DateTime.parse('Jul 31, 2015 11:46 am') }
  #   its([:tags]) { is_expected.to eq ['Technology', 'Difficulty', 'Game Length'] }
  # end
end
end