Added basic post scraper processor
parent
0f49346bbc
commit
5aeed3ae42
|
@ -1,5 +1,131 @@
|
|||
module Scraper
  # Page processor for a single forum topic page.
  #
  # It reads the first `.content` element of the parsed document (@doc) and
  # extracts mod metadata from it: links, title, summary, long description and
  # the "key: value" rows of the first <ul> in the post.
  class PostProcessor < Scraper::BaseProcessor
    # regexp %r{\Ahttps?://(www\.)?factorioforums.com/forum/viewtopic\.php\?f=[0-9]+\Z}
    # URL pattern handed to the `regexp` macro (defined outside this file;
    # presumably it registers the URLs this processor handles -- confirm in
    # BaseProcessor). Only `&start=0` is tolerated after the topic id, so
    # paginated topic URLs (any other `&start=N`) do not match.
    regexp %r{\Ahttps?://(www\.)?factorioforums\.com/forum/viewtopic\.php\?f=[0-9]+&t=[0-9]+(&start=0)?\Z}

    # Builds the metadata hash for the post.
    #
    # Keys that are always present: :github_url, :download_url, :summary and
    # :title (each nil when the page does not provide it).
    # Keys set only when found: :last_edited_at, :description, :file_name,
    # :version, :mod_name, :tags, :contact, :authors, :license, :license_url,
    # :game_version, :last_release_at.
    #
    # @return [Hash] the extracted post attributes
    def process_page
      post = {}
      content = @doc.search('.content').first
      info_list = content && content.search('ul').first

      # Every href found in the serialised post body (empty when no post body).
      links = content.to_s.scan(%r{href="([^"]+)"}).flatten

      post[:github_url] = extract_github_url(links)
      post[:download_url] = extract_download_url(links)
      post[:summary] = extract_summary(info_list)
      post[:title] = extract_title

      edited_at = extract_last_edited_at
      post[:last_edited_at] = edited_at if edited_at

      description = extract_description(content)
      post[:description] = description if description

      # match_info reads @info, so it must be populated before the lookups below.
      @info = build_info(info_list)
      extract_file_details(post)
      extract_info_fields(post)

      post
    end

    private

    # Finds the first info row whose :key (or :value, per `type`) matches `regex`.
    #
    # @param regex [Regexp] pattern to look for
    # @param type [Symbol] which field of the row to test, :key or :value
    # @return [Hash, nil] the matching row ({key:, value:, html:}) or nil
    def match_info(regex, type = :key)
      @info.detect { |pair| pair[type].match(regex) }
    end

    # Returns every http(s) URL found in the string form of `text`.
    #
    # NOTE(review): URI.extract is documented as obsolete in the Ruby standard
    # library; consider a replacement when upgrading Ruby.
    #
    # @param text [#to_s] any object; it is serialised with to_s before scanning
    # @return [Array<String>] URLs starting with "http"
    def extract_urls(text)
      URI.extract(text.to_s).select { |url| url.start_with?('http') }
    end

    # Returns the repository root (github.com/<owner>/<repo>) of the first
    # GitHub link, or nil when the post links to no GitHub URL.
    def extract_github_url(links)
      link = links.detect { |url| url.include?('github.com') }
      link && link.sub(%r{(github\.com/[^/]+/[^/]+).*}, '\1')
    end

    # Returns the first forum attachment link as an absolute URL without the
    # session id, or nil when the post has no attachment link.
    def extract_download_url(links)
      link = links.detect { |url| url.include?('download/file.php') }
      return nil unless link

      request_uri = URI.parse(@request.url)
      # Directory of the requested page, used to resolve "./" relative links.
      base_path = "#{request_uri.scheme}://#{request_uri.host}#{request_uri.path.sub(%r{[^/]+\z}, '')}"

      # The href comes from serialised HTML, where "&" is written as "&amp;",
      # so both spellings of the sid parameter are removed.
      link.gsub(/&(?:amp;)?sid=[0-9a-z]+/i, '').sub(%r{\A\./}, base_path)
    end

    # Text that precedes the first <li> inside the info list, stripped.
    # Returns nil when the post has no <ul>.
    def extract_summary(info_list)
      return nil unless info_list

      info_list.children.take_while { |node| node.name != 'li' }.map(&:text).join.strip
    end

    # Topic title with any "[...]" and "(...)" groups removed, or nil when the
    # page has no `#page-body h2` heading.
    def extract_title
      heading = @doc.search('#page-body h2').first
      return nil unless heading

      heading.text.gsub(/\[[^\]]+\]/, '').gsub(/\([^)]+\)/, '').strip
    end

    # Timestamp taken from the "... </a> on <date> am|pm" edit notice, or nil
    # when there is no notice or its date cannot be parsed.
    def extract_last_edited_at
      stamp = @doc.search('.postbody .notice').to_s[/(?<=<\/a> on ).+?(am|pm)/]
      stamp && parse_datetime(stamp)
    end

    # Text following "Long Description" up to "Bugs" or the end of that line,
    # stripped of surrounding whitespace. Returns nil when the marker is absent.
    def extract_description(content)
      return nil unless content

      captured = content.text[/Long Description:?(.*?)(?:Bugs|$)/, 1]
      captured && captured.strip
    end

    # Turns each <li> of the info list into {key:, value:, html:}, splitting
    # the item text on the first ":". Returns [] when the post has no <ul>.
    def build_info(info_list)
      return [] unless info_list

      info_list.search('li').map do |li|
        parts = li.text.split(':')
        # to_s guards against an <li> with empty text, where shift returns nil.
        { key: parts.shift.to_s.strip, value: parts.join(':').strip, html: li }
      end
    end

    # Sets :file_name, :version and :mod_name from the info row whose value
    # ends in ".zip" (e.g. "Name_1.2.3.zip").
    def extract_file_details(post)
      row = match_info(/\.zip$/, :value)
      return unless row

      file_name = row[:value][/[^\s]+[_-][0-9\.]+\.zip/]
      return unless file_name

      post[:file_name] = file_name

      version = file_name[/[_-]([0-9\.]+)\.zip/, 1]
      return unless version

      post[:version] = version

      mod_name = file_name[/^(.*?)[_-]\d/, 1]
      post[:mod_name] = mod_name if mod_name
    end

    # Copies the remaining info rows (tags, contact, authors, license, game
    # version, last release date) into `post` when they are present.
    def extract_info_fields(post)
      if (tags = match_info(/tags?|categories|category/i))
        post[:tags] = tags[:value].split(',').map(&:strip)
      end

      if (contact = match_info(/Contact/))
        # Prefer a link inside the row; fall back to the row's plain text.
        post[:contact] = extract_urls(contact[:html]).first || contact[:value].strip
      end

      if (authors = match_info(/authors?/i))
        post[:authors] = authors[:value].split(',').map(&:strip)
      end

      if (license = match_info(/license/i))
        post[:license] = license[:value]
        license_url = extract_urls(license[:html]).first
        post[:license_url] = license_url if license_url
      end

      if (game_version = match_info(/factorio/i))
        post[:game_version] = game_version[:value].split(/[-,]/).map(&:strip)
      end

      if (release = match_info(/latest release|last release/i))
        # The date is whatever follows the first comma of the row value.
        date_text = release[:value][/(?<=,).*/]
        released_at = date_text && parse_datetime(date_text)
        post[:last_release_at] = released_at if released_at
      end
    end

    # Parses `text` as a DateTime; returns nil instead of raising when the
    # text is not a recognisable date, so one bad date does not abort the scrape.
    def parse_datetime(text)
      DateTime.parse(text.to_s)
    rescue ArgumentError
      nil
    end
  end
end
|
||||
|
|
|
@ -649,7 +649,7 @@ http_interactions:
|
|||
by <a href=\"http://www.phpbb.com/\">phpBB</a>® Forum Software ©
|
||||
phpBB Group\n\t\t\n\t</div>\n</div>\n\n</div>\n\n<div>\n\t<a id=\"bottom\"
|
||||
name=\"bottom\" accesskey=\"z\"></a>\n\t\n</div>\n\n</body>\n</html>"
|
||||
http_version:
|
||||
http_version:
|
||||
recorded_at: Wed, 22 Jul 2015 18:35:13 GMT
|
||||
- request:
|
||||
method: get
|
||||
|
@ -1300,7 +1300,7 @@ http_interactions:
|
|||
by <a href=\"http://www.phpbb.com/\">phpBB</a>® Forum Software ©
|
||||
phpBB Group\n\t\t\n\t</div>\n</div>\n\n</div>\n\n<div>\n\t<a id=\"bottom\"
|
||||
name=\"bottom\" accesskey=\"z\"></a>\n\t\n</div>\n\n</body>\n</html>"
|
||||
http_version:
|
||||
http_version:
|
||||
recorded_at: Wed, 22 Jul 2015 18:35:14 GMT
|
||||
- request:
|
||||
method: get
|
||||
|
@ -1898,7 +1898,7 @@ http_interactions:
|
|||
by <a href=\"http://www.phpbb.com/\">phpBB</a>® Forum Software ©
|
||||
phpBB Group\n\t\t\n\t</div>\n</div>\n\n</div>\n\n<div>\n\t<a id=\"bottom\"
|
||||
name=\"bottom\" accesskey=\"z\"></a>\n\t\n</div>\n\n</body>\n</html>"
|
||||
http_version:
|
||||
http_version:
|
||||
recorded_at: Wed, 22 Jul 2015 18:35:15 GMT
|
||||
- request:
|
||||
method: get
|
||||
|
@ -2549,7 +2549,7 @@ http_interactions:
|
|||
by <a href=\"http://www.phpbb.com/\">phpBB</a>® Forum Software ©
|
||||
phpBB Group\n\t\t\n\t</div>\n</div>\n\n</div>\n\n<div>\n\t<a id=\"bottom\"
|
||||
name=\"bottom\" accesskey=\"z\"></a>\n\t\n</div>\n\n</body>\n</html>"
|
||||
http_version:
|
||||
http_version:
|
||||
recorded_at: Wed, 22 Jul 2015 18:35:15 GMT
|
||||
- request:
|
||||
method: get
|
||||
|
@ -3189,6 +3189,6 @@ http_interactions:
|
|||
by <a href=\"http://www.phpbb.com/\">phpBB</a>® Forum Software ©
|
||||
phpBB Group\n\t\t\n\t</div>\n</div>\n\n</div>\n\n<div>\n\t<a id=\"bottom\"
|
||||
name=\"bottom\" accesskey=\"z\"></a>\n\t\n</div>\n\n</body>\n</html>"
|
||||
http_version:
|
||||
http_version:
|
||||
recorded_at: Wed, 22 Jul 2015 18:35:15 GMT
|
||||
recorded_with: VCR 2.9.3
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,10 +1,68 @@
|
|||
describe Scraper::PostProcessor do
  # Scrapes `page_url` with the PostProcessor inside the 'forum_post' VCR
  # cassette and stores the scraper and its result in @scraper / @result.
  # The cassette is opened explicitly here because the examples below call
  # this helper from before(:all) hooks.
  def scrap(page_url)
    VCR.use_cassette('forum_post', record: :new_episodes) do
      @scraper = Scraper::Base.new page_url, Scraper::PostProcessor
      @result = @scraper.scrap
    end
  end

  describe 'URL detection' do
    it 'should detect forum posts pages URLs' do
      expect { scrap('http://www.factorioforums.com/forum/viewtopic.php?f=93&t=14371') }.to_not raise_error
    end

    it 'should raise an error with a non-post URL' do
      expect { scrap('http://www.factorioforums.com/forum/viewforum.php?f=83') }.to raise_error Scraper::NoPageProcessorFoundError
    end

    it 'should raise an error with pagination' do
      expect { scrap('http://www.factorioforums.com/forum/viewtopic.php?f=43&t=6456&start=20') }.to raise_error Scraper::NoPageProcessorFoundError
    end
  end

  describe 'Post layouts detection' do
    describe 'pretty perfect template layout' do
      before(:all) { scrap 'http://www.factorioforums.com/forum/viewtopic.php?f=91&t=14294' }
      subject { @result.first }

      its([:summary]) { is_expected.to eq '' }
      its([:title]) { is_expected.to eq 'Science Cost Tweaker Mod' }
      its([:mod_name]) { is_expected.to eq 'ScienceCostTweaker' }
      its([:description]) { is_expected.to eq 'This mod can be used as a simple alternative to marathon mod. It increases science costs significantly (4x to 9x depending on tier) - you need bigger factory to feed your science labs. Science also now has its own dedicated production lines and intermediate products. No more making science packs from conveyor belts and inserters!' }
      its([:game_version]) { is_expected.to eq ['0.12.x'] }
      its([:download_url]) { is_expected.to eq 'http://www.factorioforums.com/forum/download/file.php?id=4985' }
      its([:file_name]) { is_expected.to eq 'ScienceCostTweaker_0.12.4.zip' }
      its([:version]) { is_expected.to eq '0.12.4' }
      its([:last_release_at]) { is_expected.to be_within(1.hour).of DateTime.parse('July 30, 2015') }
      its([:license]) { is_expected.to eq 'GPL.' }
      its([:license_url]) { is_expected.to eq 'https://github.com/UberWaffe/ScienceCostTweaker/blob/master/LICENSE' }
      its([:github_url]) { is_expected.to eq 'https://github.com/UberWaffe/ScienceCostTweaker' }
      its([:authors]) { is_expected.to eq ['UberWaffe'] }
      its([:contact]) { is_expected.to eq 'https://github.com/UberWaffe/ScienceCostTweaker' }
      its([:last_edited_at]) { is_expected.to be_within(1.hour).of DateTime.parse('Jul 31, 2015 11:46 am') }
      its([:tags]) { is_expected.to eq ['Technology', 'Difficulty', 'Game Length'] }
    end

    # TODO(review): disabled example. Apart from :summary, :title and
    # :description, the expected values below are identical to the example
    # above and look copied from it -- confirm them against the real post
    # before enabling.
    # describe 'a layout with a non-standard info-list' do
    #   before(:all){ scrap 'http://www.factorioforums.com/forum/viewtopic.php?f=92&t=13937' }
    #   subject{ @result.first }
    #
    #   its([:summary]) { is_expected.to eq 'Show GUI messages, controlled via the circuit network.' }
    #   its([:title]) { is_expected.to eq 'Circuit GUI' }
    #   its([:mod_name]) { is_expected.to eq 'ScienceCostTweaker' }
    #   its([:description]) { is_expected.to eq '' }
    #   its([:game_version]) { is_expected.to eq ['0.12.x'] }
    #   its([:download_url]) { is_expected.to eq 'http://www.factorioforums.com/forum/download/file.php?id=4985' }
    #   its([:file_name]) { is_expected.to eq 'ScienceCostTweaker_0.12.4.zip' }
    #   its([:version]) { is_expected.to eq '0.12.4' }
    #   its([:last_release_at]) { is_expected.to be_within(1.hour).of DateTime.parse('July 30, 2015') }
    #   its([:license]) { is_expected.to eq 'GPL.' }
    #   its([:license_url]) { is_expected.to eq 'https://github.com/UberWaffe/ScienceCostTweaker/blob/master/LICENSE'}
    #   its([:github_url]) { is_expected.to eq 'https://github.com/UberWaffe/ScienceCostTweaker' }
    #   its([:authors]) { is_expected.to eq ['UberWaffe'] }
    #   its([:contact]) { is_expected.to eq 'https://github.com/UberWaffe/ScienceCostTweaker' }
    #   its([:last_edited_at]) { is_expected.to be_within(1.hour).of DateTime.parse('Jul 31, 2015 11:46 am') }
    #   its([:tags]) { is_expected.to eq ['Technology', 'Difficulty', 'Game Length'] }
    # end
  end
end
|
||||
|
|
Loading…
Reference in New Issue