User:Tom.Bot/Task3 code

Source

/// <summary>
/// Adds <c>{{Taxonbar|from=QID}}</c> to a page, just before its first DEFAULTSORT/category line,
/// provided the page passes a series of checks; otherwise flags the page to be skipped.
/// </summary>
/// <remarks>
/// Checks, in order: optional size limit, no person/scientist infobox, a taxobox-style infobox is present,
/// no existing Taxonbar, a Wikidata QID can be fetched for the title, and the Wikidata item carries at
/// least one property from the "good" taxon-identifier list.
/// </remarks>
/// <param name="ArticleText">Wikitext of the page.</param>
/// <param name="ArticleTitle">Title of the page; used for the API lookup and for the sandbox auto-detect.</param>
/// <param name="wikiNamespace">Not used by this method.</param>
/// <param name="Summary">Receives the edit summary on success, or the reason(s) the page was skipped.</param>
/// <param name="Skip">Receives true when the page should be skipped (always false in LiveDebug/SandboxDebug mode).</param>
/// <returns>The (possibly modified) wikitext.</returns>
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
{
	// global switches //////////////////////////////////////////////////////////
	
	bool SaveSkipSummaries = false;
	bool SkipPagesLargerThanLimit = false; // used with int Limit
	bool ManuallyCheckPagesWithoutAGoodInfobox = false; // usually it's an {{infobox person}} or {{infobox scientist}}
	bool ManuallyPlaceTaxonbarAtEndOfPage = false; // aid for pages w/o a {{DEFAULTSORT}} nor cats; manual only
	bool LiveDebug = false;
	bool SandboxDebug = false; // auto-detects
	Skip = false;
	
	
	// global-use vars //////////////////////////////////////////////////////////
	
	int Limit = 2500; // characters on a page; used with bool SkipPagesLargerThanLimit
	Summary = "";
	
	
	// preliminary exceptions/error checking ////////////////////////////////////
	
	if (ArticleTitle == "User:Tom.Reding/sandbox") SandboxDebug = true;
	
	// a plain length comparison is equivalent to matching ^[\d\D]{Limit+1} and avoids building a regex
	if (SkipPagesLargerThanLimit && ArticleText.Length > Limit)
	{
		Summary += "Too big (>" + Limit + "B). ";
		Skip = true;
	}
	
	// check for inappropriate infoboxes
	string PeopleTemplates_Regex = @"\{\{\s*(?:[Ii]nfobox[ _]+actor[ _]+voice|[Ii]nfobox[ _]+Actor|[Ii]nfobox[ _]+actor|[Ii]nfobox[ _]+Actress|[Ii]nfobox[ _]+actress|[Ii]nfobox[ _]+adult[ _]+biography|[Ii]nfobox[ _]+adult[ _]+female|[Ii]nfobox[ _]+adult[ _]+male|[Ii]nfobox[ _]+Biography|[Ii]nfobox[ _]+biography|[Ii]nfobox[ _]+bio|[Ii]nfobox[ _]+Celebrity|[Ii]nfobox[ _]+director|[Ii]nfobox[ _]+entertainer|[Ii]nfobox[ _]+Fashion[ _]+Designer|[Ii]nfobox[ _]+fashion[ _]+designer|[Ii]nfobox[ _]+film[ _]+actor|[Ii]nfobox[ _]+film[ _]+director|[Ii]nfobox[ _]+human[ _]+being|[Ii]nfobox[ _]+human|[Ii]nfobox[ _]+Indian[ _]+Businessmen|[Ii]nfobox[ _]+Journalist|[Ii]nfobox[ _]+journalist|[Ii]nfobox[ _]+people|[Ii]nfobox[ _]+performer|[Ii]nfobox[ _]+person/measurements|[Ii]nfobox[ _]+person[ _]+ii|[Ii]nfobox[ _]+person|[Ii]nfobox[ _]+Person|[Ii]nfobox[ _]+photographer|[Ii]nfobox[ _]+Real[ _]+Person|[Ii]nfobox[ _]+trade[ _]+unionist|[Ii]nfobox[ _]+victim|[Pp]ersonbox)(?=\s*(?:\||\<\!\-\-))";
	string ScientistTemplates_Regex = @"\{\{\s*(?:[Ii]nfobox[ _]+Academic|[Ii]nfobox[ _]+chemist|[Ii]nfobox[ _]+historian|[Ii]nfobox[ _]+mathematician|[Ii]nfobox[ _]+Professor|[Ii]nfobox[ _]+scientist|[Ii]nfobox[ _]+Scientist)(?=\s*(?:\||\<\!\-\-))";
	bool BadInfobox1 = Regex.IsMatch(ArticleText, PeopleTemplates_Regex, RegexOptions.IgnoreCase);
	bool BadInfobox2 = Regex.IsMatch(ArticleText, ScientistTemplates_Regex, RegexOptions.IgnoreCase);
	if (BadInfobox1 || BadInfobox2)
	{
		Summary += @"Person/scientist infobox found. ";
		Skip = true;
	}
	
	// check for appropriate infoboxes
	string TitleTemplates_Regex = @"\{\{\s*(?:DISPLAY ?TITLE|[Ii]talicisedtitle|[Ii]talicised[ _]+title|[Ii]talicizedtitle|[Ii]talicized[ _]+title|[Ii]talicizetitle|[Ii]talicize[ _]+title|[Ii]talicstitle|[Ii]talics[ _]+title|[Ii]talics|ITALICTITLE|[Ii]talictitle|[Ii]talic[ _]+title[ _]+infobox|[Ii]talic[ _]+title|[Ii]talic|[Ii]tal|[Rr]edirect[ _]+italic[ _]+title|[Tt]itle[ _]+italic)";
	
	string TaxoTemplates_Regex = @"\{\{\s*(?:Template:\s*|Wikipedia:\s*)?(?:Infobox[ _]+)?(" + // prefixes
										@"Taxobox|Taxo|TX|Species ?box|Subspeciesbox|Infraspeciesbox|Subspeciesbox/ICN|" + // taxo/species
										@"Automatic[ _]+t?axobox|" + // auto
										@"bacteria|microorganism|virus" + // other
										@")(?=\s*(?:\||\<\!\-\-|" + TitleTemplates_Regex + @"|(?<=Automatic[ _]+t?axobox\s*)\}\}))"; // suffixes
	bool NoTaxoTemplates = !Regex.IsMatch(ArticleText, TaxoTemplates_Regex, RegexOptions.IgnoreCase);
	// With ManuallyCheckPagesWithoutAGoodInfobox on, a page lacking a taxobox is left for manual review
	// (if it had a person/scientist infobox, Skip is already true from the check above).
	if (NoTaxoTemplates && !ManuallyCheckPagesWithoutAGoodInfobox)
	{
		Summary += @"No auto/taxo/speciesbox found. ";
		Skip = true;
	}
	
	// check for {{Taxonbar
	string TaxonbarAliases_Regex = @"\{\{\s*(?:[Tt]axobar|[Tt]axon\-bar|[Tt]axonbar|[Tt]axonBar|[Tt]axonIds|[Tt]axon[ _]+bar)"; // 0 grps
	bool HasTaxonbar = Regex.IsMatch(ArticleText, TaxonbarAliases_Regex, RegexOptions.IgnoreCase);
	if (HasTaxonbar)
	{
		Summary += @"Taxonbar exists. ";
		Skip = true;
	}
	
	// get wikibase_item via WP API
	// ex: https://en.wikipedia.org//w/api.php?action=query&format=json&prop=pageprops&titles=Panthera%20leo&redirects=0&formatversion=2&ppprop=wikibase_item
	// Percent-encode the title as UTF-8. The previous chain of Replace() calls missed '%', '#', '"' and all
	// non-ASCII characters, and encoded the en dash as the Windows-1252 byte %96 instead of %E2%80%93.
	string ArticleTitle_URL = System.Uri.EscapeDataString(ArticleTitle);
	string URL1 = @"https://en.wikipedia.org//w/api.php?action=query&format=json&prop=pageprops&titles=" + 
						ArticleTitle_URL + @"&redirects=0&formatversion=2&ppprop=wikibase_item";
	string HTML1 = "";
	if (!Skip && !SandboxDebug)
	{
		try
		{
			HTML1 = Tools.GetHTML(URL1);
		}
		catch
		{
			// best-effort fetch: any failure is reported through Summary and the page is skipped
			Summary = "GetHTML1 failed. ArticleTitle_URL = " + ArticleTitle_URL + " . ";
			if (!LiveDebug) Skip = true;
		}
	}
	
	
	// html1 error checks ///////////////////////////////////////////////////////
	
	// QID is "" when HTML1 is empty or has no wikibase_item
	string QID = Regex.Match(HTML1, @"wikibase_item"":""([^""]+)").Groups[1].Value;
	if (string.IsNullOrEmpty(QID) && !Skip && !SandboxDebug)
	{
		Summary = @"QID retrieval failed. ";
		Skip = true;
	}
	
	if (!Regex.IsMatch(QID, @"^Q\d+$") && !Skip && !SandboxDebug) // case sensitive, jtbs
	{
		Summary = @"Unexpected QID format. ";
		Skip = true;
	}
	
	
	// determine quantity & quality of WD properties used ///////////////////////
	
	List<string> GoodPropertyList = new List<string>(new string[] {
		// alphabetically from [[Template:Taxonbar#Taxon identifiers]]:
		"P4024",
		"P2036",
		"P1348",
		"P3594",
		"P2833",
		"P2026",
		"P2946",
		"P3398",
		"P838",
		"P687",
		"P2464",
		"P3060",
		"P1940",
		"P3444",
//		"P830",	// ignore: EOL, Encyclopedia of Life
		"P1895",
		"P938",
		"P3101",
		"P1727",
		"P3100",
		"P1747",
		"P842",
//		"P846",	// ignore: GBIF, Global Biodiversity Information Facility
		"P1832",
		"P1421",
		"P3099",
		"P1076",
		"P3151",
		"P1391",
		"P961",
		"P586",
		"P815",
		"P627",
		"P3064",
		"P1991",
		"P959",
		"P962",
		"P685",
		"P4122",
		"P2434",
		"P3102",
//		"P1070",	// ignore: TPL, The Plant List
		"P1772",
		"P1992",
		"P2040",
		"P2455",
		"P960",
		"P1745",
		"P1761",
		"P3591",
		"P850",
		"P3288",
		"P2426",
		"P1746"
	}); // ignores don't count towards the total property count, per [[WT:TREE#Taxonbar addition requirements]]
	
	List<string> BadPropertyList = new List<string>(new string[] {
		"P830",	// ignore: EOL, Encyclopedia of Life
		"P846",	// ignore: GBIF, Global Biodiversity Information Facility
		"P1070",	// ignore: TPL, The Plant List
		
		// remaining 13 uniques from [[d:Wikidata:WikiProject Taxonomy#Databases]]:
		// [[Module:Taxonbar/conf]] needs updating (follow up after bulk run)
		"P1939",
		"P2752",
		"P2794",
		"P3088",
		"P3186",
		"P3322",
		"P3420",
		"P3606",
		"P4125",
		"P4194",
		"P4301",
		"P4311",
		"P4526"
	});
	
	// get Wikidata
	// ex: https://www.wikidata.org//w/api.php?action=wbgetclaims&format=json&entity=Q36557
	string URL2 = @"https://www.wikidata.org//w/api.php?action=wbgetclaims&format=json&entity=" + QID;
	string HTML2 = "";
	if (!Skip && !SandboxDebug)
	{
		try
		{
			HTML2 = Tools.GetHTML(URL2);
		}
		catch
		{
			// best-effort fetch: any failure is reported through Summary and the page is skipped
			Summary = "GetHTML2 failed. URL2 = " + URL2 + " . ";
			if (!LiveDebug) Skip = true;
		}
	}
	
	// scrape Wikidata
	// example text surrounding a populated property:
	//        "P959": [
	//            {
	//                "mainsnak": {
	//                    "snaktype": "value",
	//                    "property": "P959",
	//                    "hash": "c18d910a13321717e90ba037d26f1f1b86558128",
	//                    "datavalue": {
	//                        "value": "11500009",
	//                        "type": "string"
	//                    },
	//                    "datatype": "external-id"
	//                },
	int iGoodProps = 0;
	int iBadProps = 0;
	if (!Skip && !SandboxDebug)
	{
		iGoodProps = CountPopulatedProperties(HTML2, GoodPropertyList);
		iBadProps = CountPopulatedProperties(HTML2, BadPropertyList);
		
		if (iGoodProps == 0)
		{
			if (iBadProps > 0) Summary += "No good PIDs found. ";
			else Summary += "No PIDs found. ";
			Skip = true;
		}
	}
	
	
	// main /////////////////////////////////////////////////////////////////////
	
	if (!Skip)
	{
		if (SandboxDebug)
		{
			// no API calls were made in sandbox mode, so use placeholder values
			iGoodProps = 1;
			QID = "1";
		}
		
		// move {{-stub}} tag closer to end of page, otherwise GenFixes adds an extra line before {{Taxonbar}} that can't be fixed w/o a reparse ([[Smythea]])
		// leading "\s*" & "\n" for cases like "{{reflist}}{{Malvales-stub}}" ([[Herrania mariae]])
		string MoveStubAfterCat_Regex = @"\s*(\{\{[^\{\}]*[ -]stub\s*\}\})\s*(\[\[\s*Category[^\[\]]+\]\])";
		ArticleText = Regex.Replace(ArticleText, MoveStubAfterCat_Regex, "\n" + @"$2" + "\n" + @"$1", RegexOptions.IgnoreCase);
		
		string Plural = (iGoodProps > 1) ? "s" : "";
		string TaxonbarComplete = @"{{Taxonbar|from=" + QID + @"}}";
		// captures everything up to (not including) the line break(s) before the first DEFAULTSORT/category;
		// "\t" replaces the raw tab character previously embedded in the literal (same regex meaning)
		string AddBeforeCats_Regex = @"(^[\d\D]+?)(?=[\r\n]+[ \t]*(?:\{\{\s*Default ?sort|\[\[\s*Category))"; // better results than adding after last cat ([[Hellolycaena]])
		string SuccessSummary = @"+{{[[Template:Taxonbar|Taxonbar]]|" + 
										@"[[:Category:Taxonbar templates without from parameter|from]]=" + 
										@"[[d:Special:EntityPage/" + QID + @"|" + QID + @"]]}} " + 
										@"([[WT:TREE#Taxonbar addition requirements|" + iGoodProps + @" sig. taxon ID" + Plural + @"]]); " +
										@"[[WP:GenFixes]] on,";
		bool NoCat = !Regex.IsMatch(ArticleText, AddBeforeCats_Regex, RegexOptions.IgnoreCase);
		if (NoCat)
		{
			if (ManuallyPlaceTaxonbarAtEndOfPage)
			{
				ArticleText += "\n" + TaxonbarComplete;
				Summary = SuccessSummary + " (uncategorized page) ";
			}
			else
			{
				Summary += @"No cats/defaultsort to anchor {{Taxonbar}} around. Batch manually/code later. ";
				Skip = true;
			}
		}
		else
		{
			// the pattern is anchored with ^ (no Multiline), so this inserts exactly once
			ArticleText = Regex.Replace(ArticleText, AddBeforeCats_Regex, @"$1" + "\n" + TaxonbarComplete, RegexOptions.IgnoreCase);
			Summary = SuccessSummary;
		}
	}
	
	
	// exception tracking ///////////////////////////////////////////////////////
	
	if (Skip && SaveSkipSummaries && !SandboxDebug)
	{
		string Message = ArticleTitle + "\t" + Summary + "\n";
		string File = @"Module output - Add {{Taxonbar+from}} (skip summaries).txt";
		string Path = @"F:\"; // desktop
		string FullPath = Path + File;
		const bool APPEND = true;
		Tools.WriteTextFileAbsolutePath(Message, FullPath, APPEND);
	}
	
	// debug modes never skip, so the result can always be inspected
	if (LiveDebug || SandboxDebug) Skip = false;
	
	return ArticleText;
}

/// <summary>
/// Counts how many of the given property IDs occur in <paramref name="claimsJson"/> as a
/// "property" entry followed (within the same object) by a "datavalue" with a non-empty string "value".
/// </summary>
/// <param name="claimsJson">Raw response text to scan.</param>
/// <param name="propertyIds">Property IDs to look for, e.g. "P959".</param>
/// <returns>Number of IDs with at least one match; each ID is counted at most once.</returns>
private static int CountPopulatedProperties(string claimsJson, IEnumerable<string> propertyIds)
{
	int count = 0;
	foreach (string pid in propertyIds)
	{
		// the closing quote after the ID keeps e.g. "P95" from matching "P959"
		string pattern = @"""property"":\s*""" + Regex.Escape(pid) + @""",[^\{\}]*""datavalue"":\s*\{\s*""value"":\s*""[^""]+""";
		if (Regex.IsMatch(claimsJson, pattern)) count++;
	}
	return count;
}

Content Disclaimer

Informasi ini disarikan dari Wikipedia dan disajikan kembali untuk tujuan edukasi. Konten tersedia di bawah lisensi CC BY-SA 3.0. Kami tidak bertanggung jawab atas ketidakakuratan data yang bersumber dari kontribusi publik tersebut.

  1. The information displayed on this website is sourced in part or in whole from Wikipedia and has been adapted for re-presentation. We strive to provide accurate and relevant information, subject to the points below.
  2. There is no guarantee of absolute accuracy. Wikipedia is an open, collaborative project that can be edited by anyone, so information is subject to change.
  3. It is not intended to constitute professional advice. The content displayed is for informational and educational purposes only. For important decisions (e.g., medical, legal, or financial), please consult a professional.
  4. Content copyright. Wikipedia is licensed under the Creative Commons Attribution-ShareAlike License (CC BY-SA). This means that content may be reused with appropriate attribution and shared under a similar license.
  5. Responsible use. Any risk arising from the use of information from this website is entirely the responsibility of the user.