Rewrite link trimming -> Regex. Check for working links
This commit is contained in:
parent
94e0c9b9ef
commit
05564ca400
@ -22,6 +22,7 @@
|
|||||||
<DefineConstants>DEBUG;TRACE</DefineConstants>
|
<DefineConstants>DEBUG;TRACE</DefineConstants>
|
||||||
<ErrorReport>prompt</ErrorReport>
|
<ErrorReport>prompt</ErrorReport>
|
||||||
<WarningLevel>4</WarningLevel>
|
<WarningLevel>4</WarningLevel>
|
||||||
|
<Externalconsole>true</Externalconsole>
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x86' ">
|
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x86' ">
|
||||||
<PlatformTarget>x86</PlatformTarget>
|
<PlatformTarget>x86</PlatformTarget>
|
||||||
|
265
Program.cs
265
Program.cs
@ -85,7 +85,6 @@ namespace ModIndexer
|
|||||||
|
|
||||||
update_data = new List<ForumData>();
|
update_data = new List<ForumData>();
|
||||||
ServicePointManager.ServerCertificateValidationCallback += ValidateRemoteCertificate;
|
ServicePointManager.ServerCertificateValidationCallback += ValidateRemoteCertificate;
|
||||||
|
|
||||||
int begin, end;
|
int begin, end;
|
||||||
if (!int.TryParse(start, out begin))
|
if (!int.TryParse(start, out begin))
|
||||||
begin = 1;
|
begin = 1;
|
||||||
@ -101,10 +100,10 @@ namespace ModIndexer
|
|||||||
try {
|
try {
|
||||||
int[] topics = JsonConvert.DeserializeObject<int[]>(enc.GetString(answer));
|
int[] topics = JsonConvert.DeserializeObject<int[]>(enc.GetString(answer));
|
||||||
|
|
||||||
for (int i = 0; i < topics.Length; i++) {
|
foreach (ForumData d in update_data) {
|
||||||
foreach (ForumData d in update_data) {
|
for (int i = 0; i < topics.Length; i++) {
|
||||||
if (d.topicId == topics[i]) {
|
if (d.topicId == topics[i]) {
|
||||||
Console.WriteLine("\t" + d.title);
|
Console.WriteLine((d.type == 0 ? "RM\t" : "\t") + d.title);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -291,18 +290,20 @@ namespace ModIndexer
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (type == Misc.DATA_TYPE.INVALID) {
|
|
||||||
Console.WriteLine("INFO: Don't know where to put this mod:" +
|
|
||||||
"\n\t(ID) Title: ({0}) {1}", topicId, title);
|
|
||||||
goto flip;
|
|
||||||
}
|
|
||||||
#endregion
|
#endregion
|
||||||
|
|
||||||
string download = "";
|
ForumData info = new ForumData(
|
||||||
if (forum != Misc.FETCH_TYPE.OLD_MODS) {
|
topicId,
|
||||||
|
title.EscapeXML(),
|
||||||
|
(int)type,
|
||||||
|
authorId,
|
||||||
|
author,
|
||||||
|
"" // Yet unknown
|
||||||
|
);
|
||||||
|
|
||||||
|
if (type != Misc.DATA_TYPE.INVALID && forum != Misc.FETCH_TYPE.OLD_MODS) {
|
||||||
// Fetch topics, get download/source links
|
// Fetch topics, get download/source links
|
||||||
bool is_git = FetchSingleTopic(topicId, author, mod_name, ref download);
|
FetchSingleTopic(mod_name, ref info);
|
||||||
|
|
||||||
// TODO: Find an use for is_git
|
// TODO: Find an use for is_git
|
||||||
}
|
}
|
||||||
@ -312,14 +313,7 @@ namespace ModIndexer
|
|||||||
"\n\tLink: " + download +
|
"\n\tLink: " + download +
|
||||||
"\n\tType: " + (int)type + " " + type.ToString());*/
|
"\n\tType: " + (int)type + " " + type.ToString());*/
|
||||||
|
|
||||||
update_data.Add(new ForumData(
|
update_data.Add(info);
|
||||||
topicId,
|
|
||||||
title.EscapeXML(),
|
|
||||||
(int)type,
|
|
||||||
authorId,
|
|
||||||
author,
|
|
||||||
download
|
|
||||||
));
|
|
||||||
|
|
||||||
// Empty for next fetch
|
// Empty for next fetch
|
||||||
author = "";
|
author = "";
|
||||||
@ -331,135 +325,140 @@ namespace ModIndexer
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Analyze topic contents and get link
|
const int PRIORITY_WORST = 0xFFFF;
|
||||||
bool FetchSingleTopic(int topicId, string author, string mod_name, ref string link)
|
|
||||||
{
|
|
||||||
Console.WriteLine("=== Topic " + topicId);
|
|
||||||
Thread.Sleep(200);
|
|
||||||
link = "";
|
|
||||||
|
|
||||||
HtmlNodeCollection bodyNode = OpenPage(
|
// Analyze topic contents and get link
//
// Opens the forum topic `info.topicId`, walks over every `a.postlink`
// element of the first post and stores the best reachable link in
// `info.link` (an empty string when nothing usable was found).
//
// Scoring: checkLinkPattern() yields a base priority (lower is better).
// Links whose URL mentions the mod name (-3) or the author name (-1) are
// preferred over links of the same pattern that do not.
//
// mod_name: technical mod name; compared against the lower-cased URL, so it
//           is presumably expected in lower case with '_' instead of '-'
//           (TODO confirm against the caller).
// info:     topic record; `topicId` and `userName` are read, `link` is
//           written, and `type` is set to INVALID when the post contains
//           no links at all.
void FetchSingleTopic(string mod_name, ref ForumData info)
{
	Console.WriteLine("=== Topic " + info.topicId);
	Thread.Sleep(200);

	HtmlNodeCollection bodyNode =
		OpenPage(
			"https://forum.minetest.net/viewtopic.php?t=" + info.topicId,
			"//div[@class='content']"
		);

	if (bodyNode == null || bodyNode.Count == 0)
		return;

	HtmlNodeCollection content = bodyNode[0].SelectNodes(".//a[@class='postlink']");

	if (content == null) {
		Console.WriteLine("\tNo download links embedded.");
		// Topic is dead. Remove mod.
		info.type = (int)Misc.DATA_TYPE.INVALID;
		return;
	}

	// Lower-cased once; an empty needle is ignored below because
	// "anything".Contains("") is always true and would reward every link.
	string name_lower = (mod_name ?? "").ToLowerInvariant();
	string author_lower = (info.userName ?? "").ToLowerInvariant();

	string link = "";
	int uglyness = PRIORITY_WORST; // score of the best link accepted so far

	foreach (HtmlNode dtNode in content) {
		string url_raw = dtNode.GetAttributeValue("href", "");

		// An anchor without href would otherwise crash the index below
		if (url_raw.Length == 0)
			continue;

		if (url_raw[url_raw.Length - 1] == '/')
			url_raw = url_raw.Remove(url_raw.Length - 1);

		string url_new;
		int priority = checkLinkPattern(url_raw, out url_new);

		// Links of unknown shape are never taken, bonus or not
		if (priority == PRIORITY_WORST)
			continue;

		// The bonus belongs to this candidate, not to the running best
		string lower = url_new.ToLowerInvariant().Replace('-', '_');
		if (name_lower.Length > 0 && lower.Contains(name_lower))
			priority -= 3;
		if (author_lower.Length > 0 && lower.Contains(author_lower))
			priority--;

		// Only spend a network request on links that would win
		if (priority < uglyness && isLinkAvailable(ref url_new)) {
			// Best link so far. Take it.
			link = url_new;
			uglyness = priority;
		}
	}

	// Can't be worse than empty
	info.link = link;
}
|
||||||
|
|
||||||
if (link == "" && forum_download > 0)
|
// Classifies a link taken from a forum post and trims it to its useful part.
//
// url_raw: link as found in the post. Relative attachment links of the form
//          "./download/file.php?id=N" are made absolute first.
// url_new: on a match, the first capture group of the matching pattern
//          (repository root or absolute attachment URL); otherwise the
//          (possibly absolutized) input.
// Returns: pattern index * 10, so 0 is the best quality, or PRIORITY_WORST
//          when no pattern matches.
int checkLinkPattern(string url_raw, out string url_new)
{
	const string forum_root = "https://forum.minetest.net";
	const string github = "|/archive/*"; // also for notabug.org
	const string gitlab = "|/repository/*";
	const string bitbucket = "|/get/*|/downloads/*";

	// Sort by priority for link quality, where 0 = highest
	// NOTE: the trailing "/*" in the suffixes is regex syntax (zero or more
	// slashes), so they act as plain prefix matches after the repository.
	string[] patterns = {
		// Similar formatted git
		@"^(https?:/(/[\w_.-]*){3})(/?$|\.git$" + github + gitlab + bitbucket + ")",
		// repo.or.cz
		@"^(https?://repo\.or\.cz/[\w_.-]*\.git)(/?$|/snapshot/*)",
		// Forum absolute link
		@"^(https?://forum\.minetest\.net/download/file\.php\?id=\d*)$"
	};

	if (url_raw == null)
		url_raw = "";

	// Convert attachment links to proper ones. Only the leading '.' may be
	// swapped for the forum root; replacing every dot would also hit the one
	// in "file.php" and corrupt the link.
	if (url_raw.StartsWith("./download/file.php?id=", StringComparison.Ordinal))
		url_raw = forum_root + url_raw.Substring(1);

	for (int p = 0; p < patterns.Length; p++) {
		// The static Match() reuses cached Regex objects between calls
		var match = System.Text.RegularExpressions.Regex.Match(url_raw, patterns[p]);

		if (!match.Success)
			continue;

		// This one matches
		url_new = match.Groups[1].Value;
		return p * 10;
	}

	url_new = url_raw;
	return PRIORITY_WORST; // None matches
}
|
||||||
|
|
||||||
|
// Checks whether `url` can be fetched by sending a HEAD request via curl.
//
// url:     address to probe. When the server redirects, it is replaced by
//          the redirect target (relative targets are resolved against the
//          current value), so the caller ends up with the final location.
// Returns: true when the last response in the redirect chain has status 200.
// Throws:  whatever Process.Start throws when curl cannot be launched; this
//          is deliberately not swallowed, otherwise every link would
//          silently be reported as dead.
bool isLinkAvailable(ref string url)
{
	// We need TLS 1.2 for GitHub but Mono doesn't support that yet
	// Use curl instead and hope the servers support HEAD requests

	// The URL originates from forum posts. Refuse anything that could break
	// out of the quoted argument and smuggle extra options into curl.
	if (string.IsNullOrEmpty(url) ||
			url.IndexOfAny(new char[] { '"', '\\', ' ', '\t', '\r', '\n' }) >= 0)
		return false;

	var proc_info = new System.Diagnostics.ProcessStartInfo();
	proc_info.FileName = "curl";
	// --url makes sure the value is never parsed as an option
	proc_info.Arguments = "-L -I --url \"" + url + "\"";
	proc_info.UseShellExecute = false;
	proc_info.RedirectStandardOutput = true;
	proc_info.RedirectStandardError = true;

	int status = 404;

	using (var curl = System.Diagnostics.Process.Start(proc_info)) {
		// stderr is redirected only to keep the console clean. It still has
		// to be drained, or curl may block once the pipe buffer is full.
		curl.ErrorDataReceived += delegate { };
		curl.BeginErrorReadLine();

		Thread.Sleep(100);

		// curl prints one header block per hop, separated by blank lines.
		// The first line of each block is the status line.
		bool expect_status = true;
		string line;

		while ((line = curl.StandardOutput.ReadLine()) != null) {
			if (line == "") {
				expect_status = true;
				continue;
			}

			if (!expect_status) {
				// Look out for "Location: " and get the correct URL
				// (HTTP/2 responses spell the header name in lower case)
				if (line.StartsWith("Location:", StringComparison.OrdinalIgnoreCase)) {
					string target = line.Substring(9).Trim();

					if (target.IndexOf("://", StringComparison.Ordinal) > 0) {
						url = target;
					} else if (target.Length > 0) {
						Uri base_uri, absolute;
						if (Uri.TryCreate(url, UriKind.Absolute, out base_uri) &&
								Uri.TryCreate(base_uri, target, out absolute))
							url = absolute.AbsoluteUri;
						else
							url = target;
					}
				}
				continue;
			}

			expect_status = false;
			// "HTTP/1.1 200 OK"; the reason phrase is optional
			string[] parts = line.Split(' ');
			if (parts.Length < 2)
				continue; // This should not happen

			// Leaves 0 behind when the code is not numeric
			int.TryParse(parts[1], out status);
		}

		curl.WaitForExit();
	}

	return status == 200;
}
|
||||||
|
|
||||||
// Remove useless tags from the forum titles
|
// Remove useless tags from the forum titles
|
||||||
|
Loading…
x
Reference in New Issue
Block a user