Rewrite link trimming -> Regex. Check for working links

This commit is contained in:
SmallJoker 2018-04-01 19:58:45 +02:00
parent 94e0c9b9ef
commit 05564ca400
2 changed files with 133 additions and 133 deletions

View File

@ -22,6 +22,7 @@
<DefineConstants>DEBUG;TRACE</DefineConstants> <DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport> <ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel> <WarningLevel>4</WarningLevel>
<Externalconsole>true</Externalconsole>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x86' "> <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x86' ">
<PlatformTarget>x86</PlatformTarget> <PlatformTarget>x86</PlatformTarget>

View File

@ -85,7 +85,6 @@ namespace ModIndexer
update_data = new List<ForumData>(); update_data = new List<ForumData>();
ServicePointManager.ServerCertificateValidationCallback += ValidateRemoteCertificate; ServicePointManager.ServerCertificateValidationCallback += ValidateRemoteCertificate;
int begin, end; int begin, end;
if (!int.TryParse(start, out begin)) if (!int.TryParse(start, out begin))
begin = 1; begin = 1;
@ -101,10 +100,10 @@ namespace ModIndexer
try { try {
int[] topics = JsonConvert.DeserializeObject<int[]>(enc.GetString(answer)); int[] topics = JsonConvert.DeserializeObject<int[]>(enc.GetString(answer));
for (int i = 0; i < topics.Length; i++) {
foreach (ForumData d in update_data) { foreach (ForumData d in update_data) {
for (int i = 0; i < topics.Length; i++) {
if (d.topicId == topics[i]) { if (d.topicId == topics[i]) {
Console.WriteLine("\t" + d.title); Console.WriteLine((d.type == 0 ? "RM\t" : "\t") + d.title);
break; break;
} }
} }
@ -291,18 +290,20 @@ namespace ModIndexer
} }
break; break;
} }
if (type == Misc.DATA_TYPE.INVALID) {
Console.WriteLine("INFO: Don't know where to put this mod:" +
"\n\t(ID) Title: ({0}) {1}", topicId, title);
goto flip;
}
#endregion #endregion
string download = ""; ForumData info = new ForumData(
if (forum != Misc.FETCH_TYPE.OLD_MODS) { topicId,
title.EscapeXML(),
(int)type,
authorId,
author,
"" // Yet unknown
);
if (type != Misc.DATA_TYPE.INVALID && forum != Misc.FETCH_TYPE.OLD_MODS) {
// Fetch topics, get download/source links // Fetch topics, get download/source links
bool is_git = FetchSingleTopic(topicId, author, mod_name, ref download); FetchSingleTopic(mod_name, ref info);
// TODO: Find an use for is_git // TODO: Find an use for is_git
} }
@ -312,14 +313,7 @@ namespace ModIndexer
"\n\tLink: " + download + "\n\tLink: " + download +
"\n\tType: " + (int)type + " " + type.ToString());*/ "\n\tType: " + (int)type + " " + type.ToString());*/
update_data.Add(new ForumData( update_data.Add(info);
topicId,
title.EscapeXML(),
(int)type,
authorId,
author,
download
));
// Empty for next fetch // Empty for next fetch
author = ""; author = "";
@ -331,135 +325,140 @@ namespace ModIndexer
} }
} }
// Analyze topic contents and get link const int PRIORITY_WORST = 0xFFFF;
bool FetchSingleTopic(int topicId, string author, string mod_name, ref string link)
{
Console.WriteLine("=== Topic " + topicId);
Thread.Sleep(200);
link = "";
HtmlNodeCollection bodyNode = OpenPage( // Analyze topic contents and get link
"https://forum.minetest.net/viewtopic.php?t=" + topicId, void FetchSingleTopic(string mod_name, ref ForumData info)
"//div[@class='content']"); {
Console.WriteLine("=== Topic " + info.topicId);
Thread.Sleep(200);
HtmlNodeCollection bodyNode =
OpenPage(
"https://forum.minetest.net/viewtopic.php?t=" + info.topicId,
"//div[@class='content']"
);
if (bodyNode == null) if (bodyNode == null)
return false; return;
HtmlNodeCollection content = bodyNode[0].SelectNodes(".//a[@class='postlink']"); HtmlNodeCollection content = bodyNode[0].SelectNodes(".//a[@class='postlink']");
if (content == null) if (content == null) {
return false; Console.WriteLine("\tNo download links embedded.");
// Topic is dead. Remove mod.
info.type = (int)Misc.DATA_TYPE.INVALID;
return;
}
string download = "", source = ""; string link = "";
int forum_download = 0; int uglyness = PRIORITY_WORST;
foreach (HtmlNode dtNode in content) { foreach (HtmlNode dtNode in content) {
string url = dtNode.GetAttributeValue("href", ""); string url_raw = dtNode.GetAttributeValue("href", "");
string text = dtNode.InnerText; string text = dtNode.InnerText;
if (url.EndsWith(".git")) { if (url_raw[url_raw.Length - 1] == '/')
source = url; url_raw = url_raw.Remove(url_raw.Length - 1);
continue;
string url_new;
int priority = checkLinkPattern(url_raw, out url_new);
string lower = url_new.ToLower().Replace('-', '_');
if (lower.Contains(mod_name))
uglyness -= 3;
if (lower.Contains(info.userName.ToLower()))
uglyness--;
if (priority < uglyness) {
if (isLinkAvailable(ref url_new)) {
// Best link so far. Take it.
link = url_new;
uglyness = priority;
}
} }
if (url[url.Length - 1] == '/') {
url = url.Remove(url.Length - 1);
} }
if (url.StartsWith("./download/file.php?id=")) { // Can't be worse than empty
int pos = 23; info.link = link;
int number = 0;
while (pos < url.Length) {
char cur = url[pos];
if (cur < 48 || cur > 57)
break;
number = number * 10 + (cur - 48);
pos++;
} }
string text_lower = text.ToLower().Replace('-', '_'); int checkLinkPattern(string url_raw, out string url_new)
if (text_lower.Contains(mod_name) && {
number > forum_download) { const string github = "|/archive/*"; // also for notabug.org
const string gitlab = "|/repository/*";
const string bitbucket = "|/get/*|/downloads/*";
forum_download = number; // Sort by priority for link quality, where 0 = highest
} string[] patterns = {
} else if (url.Contains(".zip") || // Similar formatted git
url.Contains("/zipball/") || @"^(https?:/(/[\w_.-]*){3})(/?$|\.git$" + github + gitlab + bitbucket + ")",
url.Contains("/tarball/") || // repo.or.cz
url.Contains("/archive/") || @"^(https?://repo\.or\.cz/[\w_.-]*\.git)(/?$|/snapshot/*)",
url.Contains("mediafire.com/")) { // Forum absolute link
// Direct download link @"^(https?://forum\.minetest\.net/download/file.php\?id=\d*)$"
};
if (url.Contains("://ompldr.org")) // Convert attachment links to proper ones
if (url_raw.StartsWith("./download/file.php?id="))
url_raw = url_raw.Replace(".", "https://forum.minetest.net");
for (int p = 0; p < patterns.Length; p++) {
var reg1 = new System.Text.RegularExpressions.Regex(patterns[p]);
var match = reg1.Match(url_raw);
if (match.Value == "")
continue; continue;
bool contains_git = url.Contains("git"); // This one matches
if (contains_git) { url_new = match.Groups[1].ToString();
byte count = 0, return p * 10;
pos = 0; }
for (byte i = 0; i < url.Length; i++) { url_new = url_raw;
if (url[i] == '/') { return PRIORITY_WORST; // None matches
if (count == 4) }
pos = i;
count++; bool isLinkAvailable(ref string url)
} {
} // We need TLS 1.2 for GitHub but Mono doesn't support that yet
if (count == 6) { // Use curl instead and hope the servers support HEAD requests
source = url.Substring(0, pos);
// Try to find another link if it's not contained in the name var proc_info = new System.Diagnostics.ProcessStartInfo();
string src_lower = source.ToLower().Replace('-', '_'); proc_info.FileName = "curl";
if (src_lower.Contains(mod_name) || proc_info.Arguments = "-L -I " + url;
(src_lower.Contains(author.ToLower()) && source == "")) proc_info.UseShellExecute = false;
break; proc_info.RedirectStandardOutput = true;
} proc_info.RedirectStandardError = true;
} var curl = System.Diagnostics.Process.Start(proc_info);
if (download == "") Thread.Sleep(100);
download = url;
} else if (url.Contains("://github.com/") bool was_empty = true;
|| url.Contains("://notabug.org/") int status = 404;
|| url.Contains("://bitbucket.org/")) {
if (url.Contains("/minetest/minetest") || while (!curl.StandardOutput.EndOfStream) {
url.Contains("/commits")) string line = curl.StandardOutput.ReadLine();
// Get the line after the last blank one
if (line == "") {
was_empty = true;
continue; continue;
}
byte count = 0, if (!was_empty) {
pos = 0; // Look out for "Location: " and get the correct URL
for (byte i = 0; i < url.Length; i++) { if (line.StartsWith("Location: "))
if (url[i] == '/') { url = line.Substring(10);
if (count == 4) {
// If it's too long, cut it off
pos = i;
}
count++;
}
}
if (count < 4 || count > 5)
continue; continue;
// url.EndsWith("/tree") || url.EndsWith("/master")
// //github/user/proj/master
if (count == 5)
source = url.Substring(0, pos);
else
source = url;
string src_lower = source.ToLower().Replace('-', '_');
if (src_lower.Contains(mod_name) ||
(src_lower.Contains(author.ToLower()) && source == ""))
break;
} }
was_empty = false;
string[] parts = line.Split(' ');
if (parts.Length < 3)
continue; // This should not happen
int.TryParse(parts[1], out status);
} }
if (source == "" &&
download == "" &&
forum_download == 0)
return false;
link = source != "" ? source : download; return status == 200;
if (link == "" && forum_download > 0)
link = "https://forum.minetest.net/download/file.php?id=" + forum_download;
return source != "";
} }
// Remove useless tags from the forum titles // Remove useless tags from the forum titles