Strict Regex, Wikipedia is not a git service

This commit is contained in:
SmallJoker 2018-04-01 21:09:12 +02:00
parent d104fbf1c7
commit 87424f2d03

View File

@ -136,7 +136,16 @@ namespace ModIndexer
// Download page and convert to a HtmlNode object // Download page and convert to a HtmlNode object
HtmlNodeCollection OpenPage(string url, string nodes) HtmlNodeCollection OpenPage(string url, string nodes)
{ {
string text = enc.GetString(cli.DownloadData(url)); string text = "";
while (text == "") {
try {
text = enc.GetString(cli.DownloadData(url));
} catch {
// Probably the web stuff threw an error
Console.WriteLine("Downloading/converting failed: " + url);
text = "";
}
}
HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument(); HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
htmlDoc.LoadHtml(text); htmlDoc.LoadHtml(text);
@ -325,8 +334,6 @@ namespace ModIndexer
} }
} }
const int PRIORITY_WORST = 0xFFFF;
// Analyze topic contents and get link // Analyze topic contents and get link
void FetchSingleTopic(string mod_name, ref ForumData info) void FetchSingleTopic(string mod_name, ref ForumData info)
{ {
@ -354,7 +361,7 @@ namespace ModIndexer
} }
string link = ""; string link = "";
int uglyness = PRIORITY_WORST - 10; int quality = 0; // 0 to 10
foreach (HtmlNode dtNode in content) { foreach (HtmlNode dtNode in content) {
string url_raw = dtNode.GetAttributeValue("href", ""); string url_raw = dtNode.GetAttributeValue("href", "");
@ -366,17 +373,18 @@ namespace ModIndexer
string url_new; string url_new;
int priority = checkLinkPattern(url_raw, out url_new); int priority = checkLinkPattern(url_raw, out url_new);
// Weight the link to find the best matching
string lower = url_new.ToLower().Replace('-', '_'); string lower = url_new.ToLower().Replace('-', '_');
if (lower.Contains(mod_name)) if (lower.Contains(mod_name))
uglyness -= 3; priority += 3;
if (lower.Contains(info.userName.ToLower())) if (lower.Contains(info.userName.ToLower()))
uglyness--; priority++;
if (priority < uglyness) { if (priority > quality) {
if (isLinkAvailable(ref url_new)) { if (isLinkAvailable(ref url_new)) {
// Best link so far. Take it. // Best link so far. Take it.
link = url_new; link = url_new;
uglyness = priority; quality = priority;
} }
} }
} }
@ -387,23 +395,21 @@ namespace ModIndexer
int checkLinkPattern(string url_raw, out string url_new) int checkLinkPattern(string url_raw, out string url_new)
{ {
const string github = "|/archive/*"; // also for notabug.org
const string gitlab = "|/repository/*";
const string bitbucket = "|/get/*|/downloads/*";
// Sort by priority for link quality, where 0 = highest
string[] patterns = { string[] patterns = {
// Similar formatted git // GitHub & Notabug
@"^(https?:/(/[\w_.-]*){3})(/?$|\.git$" + github + gitlab + bitbucket + ")", @"^(https?://(www\.)?(github\.com|notabug\.org)(/[\w_.-]*){2})(/?$|\.git$|/archive/*)",
// GitLab
@"^(https?://(www\.)?gitlab\.com(/[\w_.-]*){2})(/?$|\.git$|/repository/*)",
// BitBucket
@"^(https?://(www\.)?bitbucket.org(/[\w_.-]*){2})(/?$|\.git$|/get/*|/downloads/*)",
// repo.or.cz // repo.or.cz
@"^(https?://repo\.or\.cz/[\w_.-]*\.git)(/?$|/snapshot/*)", @"^(https?://repo\.or\.cz/[\w_.-]*\.git)(/?$|/snapshot/*)"
// Forum absolute link
@"^(https?://forum\.minetest\.net/download/file.php\?id=\d*)$"
}; };
// Convert attachment links to proper ones // Convert attachment links to proper ones
if (url_raw.StartsWith("./download/file.php?id=")) // Ignore forum attachments. They're evil and hard to check.
url_raw = url_raw.Replace(".", "https://forum.minetest.net"); //if (url_raw.StartsWith("./download/file.php?id="))
// url_raw = url_raw.Replace(".", "https://forum.minetest.net");
for (int p = 0; p < patterns.Length; p++) { for (int p = 0; p < patterns.Length; p++) {
var reg1 = new System.Text.RegularExpressions.Regex(patterns[p]); var reg1 = new System.Text.RegularExpressions.Regex(patterns[p]);
@ -414,10 +420,10 @@ namespace ModIndexer
// This one matches // This one matches
url_new = match.Groups[1].ToString(); url_new = match.Groups[1].ToString();
return p * 10; return 10;
} }
url_new = url_raw; url_new = url_raw;
return PRIORITY_WORST; // None matches return -10; // None matches
} }
bool isLinkAvailable(ref string url) bool isLinkAvailable(ref string url)