/* This script provides a list of postings and the channels/postings
 * that they hyperlink to. It can be used to find out:
 * 1. Postings that link to a particular channel or posting
 * 2. Whether or not a channel or posting has been linked to from a posting
 * 3. The number of times a channel or posting has been linked
 * The output is a CSV file containing, for every link found,
 * the page's display name, name and path,
 * the name of the placeholder that holds the link, and
 * the linked item's display name, name, path and type (Channel or Posting).
 * Open the CSV file in Excel and use its sorting/filtering capabilities
 * to get the answer you need.
 * 
 * The programs are provided as is without any guarantees or warranty. 
 * Although the author has attempted to find and correct any bugs in the 
 * free software programs, the author is not responsible for any damage or 
 * losses of any kind caused by the use or misuse of the programs. 
 * The author is under no obligation to provide support, service, corrections, 
 * or upgrades to the free software programs. 
 * 
 * Written by: Lim Mei Ying (meiyinglim@hotmail.com)
 * Blog: http://meiyinglim.blogspot.com
 * */
using System;
using System.IO;
using System.Web;
using System.Data;
using System.Data.SqlClient;
using System.Text.RegularExpressions;
using Microsoft.ContentManagement.Publishing;
using Microsoft.ContentManagement.Publishing.Extensions.Placeholders;

namespace FindHyperLinks
{
	/// <summary>
	/// Console tool that walks the channel tree from the root channel and writes one
	/// CSV row for every hyperlink, found in a posting's placeholders, that resolves
	/// to a channel or a posting.
	/// </summary>
	class Find
	{
		// Matches an href attribute and captures only its value in the "url" group.
		// Three forms are recognised: double-quoted, single-quoted and unquoted.
		// Capturing the value (instead of stripping "href", "=", quotes and spaces from
		// the whole match) keeps URLs that legitimately contain those characters intact.
		private static readonly Regex HrefPattern = new Regex(
			@"href\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s""'>]+))",
			RegexOptions.IgnoreCase);

		/// <summary>
		/// The main entry point for the application. Creates the report file in the
		/// current directory, writes the CSV header and scans the whole channel tree.
		/// </summary>
		/// <param name="args">Command line arguments (not used).</param>
		[STAThread]
		static void Main(string[] args)
		{
			// "HH" is the 24-hour clock. With "hh" two runs twelve hours apart got the
			// same file name and the second run appended to the first run's report.
			string filePath = "Links" + DateTime.Now.ToString("ddMMMyyyyHHmmss") + ".csv";

			// login using the current windows account in unpublished mode.
			// The context is disposed when the scan ends, even if it fails part-way.
			using (CmsApplicationContext cmsContext = new CmsApplicationContext())
			{
				cmsContext.AuthenticateAsCurrentUser(PublishingMode.Unpublished);

				// get the root channel or use another channel as a start point.
				Channel root = cmsContext.RootChannel;

				// write the headers to the log file.
				WriteToLog("Page DisplayName,Page Name,Page Path,Placeholder Name,HyperLink DisplayName,HyperLink Name,HyperLink Path,HyperLink Type", filePath);

				// walk through the entire tree.
				WalkTree(root, filePath, cmsContext);
			}
		}

		/// <summary>
		/// Scans every HTML, image and XML placeholder of a posting for hyperlinks and
		/// logs those that resolve to a channel or a posting. Other placeholder types
		/// are ignored.
		/// </summary>
		/// <param name="p">The posting whose placeholders are scanned.</param>
		/// <param name="filePath">Path of the CSV file that receives the rows.</param>
		/// <param name="cmsContext">Context used to resolve the URLs.</param>
		private static void ScanPlaceholders(Posting p, string filePath, CmsApplicationContext cmsContext)
		{
			foreach (Placeholder ph in p.Placeholders)
			{
				if (ph is HtmlPlaceholder)
				{
					ScanMarkup(p, ph, ((HtmlPlaceholder)ph).Html, filePath, cmsContext);
				}
				else if (ph is ImagePlaceholder)
				{
					// The image's link target is already a bare URL; there is no markup to parse.
					LogLinkIfChannelItem(p, ph, ((ImagePlaceholder)ph).Href, filePath, cmsContext);
				}
				else if (ph is XmlPlaceholder)
				{
					ScanMarkup(p, ph, ((XmlPlaceholder)ph).XmlAsString, filePath, cmsContext);
				}
			}
		}

		// Extracts every href value from a block of markup and logs the ones that
		// resolve to a channel or a posting.
		private static void ScanMarkup(Posting p, Placeholder ph, string markup, string filePath, CmsContext cmsContext)
		{
			if (markup == null || markup.Length == 0)
			{
				return;
			}

			foreach (Match match in HrefPattern.Matches(markup))
			{
				LogLinkIfChannelItem(p, ph, match.Groups["url"].Value, filePath, cmsContext);
			}
		}

		// Resolves one raw href value and, if it points to a channel or a posting,
		// appends a row describing the link to the CSV file.
		private static void LogLinkIfChannelItem(Posting p, Placeholder ph, string rawUrl, string filePath, CmsContext cmsContext)
		{
			string url = NormalizeUrl(rawUrl);
			if (url.Length == 0)
			{
				// Nothing left to look up, e.g. a pure bookmark link such as "#top".
				return;
			}

			// check to see if the URL is a channel or a posting
			ChannelItem ci = EnhancedGetByUrl(cmsContext, url);
			if (ci == null)
			{
				return;
			}

			string type = "";
			if (ci is Channel)
			{
				type = "Channel";
			}
			else if (ci is Posting)
			{
				type = "Posting";
			}

			// Commas are removed from the display names so they cannot split a CSV column.
			// NOTE(review): the first column is headed "Page DisplayName" but is filled with
			// the display name of the posting's parent; confirm whether p.DisplayName was meant.
			string log = p.Parent.DisplayName.Replace(",","") + "," + p.Name + "," + p.Path + "," + ph.Name + "," + ci.DisplayName.Replace(",","") + "," + ci.Name + "," + ci.Path + "," + type;
			WriteToLog(log, filePath);
		}

		// Reduces a raw href value to the part used for the lookup: surrounding white
		// space, the query string, the bookmark and a trailing ".htm" are removed.
		// Returns an empty string for null input.
		private static string NormalizeUrl(string rawUrl)
		{
			if (rawUrl == null)
			{
				return "";
			}

			string url = rawUrl.Trim();

			// Cut at the first '?' or '#'. Everything before the separator is kept;
			// the previous "index - 1" length also dropped the last character of the path.
			int cut = url.IndexOfAny(new char[] { '?', '#' });
			if (cut >= 0)
			{
				url = url.Substring(0, cut);
			}

			// Only a trailing ".htm" is an extension; removing it anywhere in the
			// string would corrupt URLs such as "page.html" or "/my.htm.files/x".
			if (url.EndsWith(".htm"))
			{
				url = url.Substring(0, url.Length - 4);
			}

			return url;
		}

		#region Enhanced Searches.GetByURL() method
		// Taken from: http://support.microsoft.com/?id=887530
		// modified to work from a console app.

		/// <summary>
		/// Returns true when the root channel's published URL mode equals
		/// "http://Channels/", which this code treats as the sign that channels are
		/// mapped to host headers.
		/// </summary>
		/// <param name="ctx">Context whose root channel is inspected.</param>
		private static bool MapChannelToHostHeaderEnabled(CmsContext ctx)
		{
			return (ctx.RootChannel.UrlModePublished == "http://Channels/");
		}

		/// <summary>
		/// Resolves a URL to a channel item. When host header mapping is detected the
		/// URL is converted to a "/Channels/..." path and looked up by path; otherwise
		/// it is looked up by URL.
		/// </summary>
		/// <param name="ctx">Context used for the search.</param>
		/// <param name="Url">The URL to resolve.</param>
		/// <returns>
		/// The matching channel item, or null when the URL is null or empty or the
		/// path lookup fails.
		/// </returns>
		private static ChannelItem EnhancedGetByUrl(CmsContext ctx, string Url)
		{
			if (Url == null || Url.Length == 0)
			{
				return null;
			}

			if (!MapChannelToHostHeaderEnabled(ctx))
			{
				return ctx.Searches.GetByUrl(Url);
			}

			string path = HttpUtility.UrlDecode(Url);
			path = path.Replace("http://","/Channels/");
			if (path.EndsWith(".htm"))
			{
				path = path.Substring(0, path.Length - 4);
			}
			if (path.EndsWith("/"))
			{
				path = path.Substring(0, path.Length - 1);
			}

			try
			{
				return (ChannelItem)(ctx.Searches.GetByPath(path));
			}
			catch
			{
				// Deliberate best effort: a link that cannot be resolved (or resolves to
				// something that is not a channel item) is simply not reported.
				return null;
			}
		}
		#endregion

		/// <summary>
		/// Appends a line of text to the log file, creating the file if needed.
		/// Alternatively, alter the code to write to a database table.
		/// </summary>
		/// <param name="message">The line to write.</param>
		/// <param name="filePath">Path of the log file.</param>
		private static void WriteToLog(string message, string filePath)
		{
			// "using" releases the file handle even when the write throws.
			using (FileStream fs = File.Open(filePath, FileMode.Append))
			using (StreamWriter sw = new StreamWriter(fs))
			{
				sw.WriteLine(message);
			}
		}

		/// <summary>
		/// Recursive function that walks through the entire tree from the start point.
		/// Prints the path of every channel and posting visited, scans postings flagged
		/// as robot-indexable and descends into channels flagged as robot-followable.
		/// </summary>
		/// <param name="c">The channel whose children are visited.</param>
		/// <param name="filePath">Path of the CSV file that receives the rows.</param>
		/// <param name="cmsContext">Context used to resolve the URLs.</param>
		private static void WalkTree(Channel c, string filePath, CmsApplicationContext cmsContext)
		{
			Console.WriteLine(c.Path);
			foreach (ChannelItem ci in c.AllChildren)
			{
				Posting posting = ci as Posting;
				if (posting != null)
				{
					Console.WriteLine(posting.Path);
					if (posting.IsRobotIndexable)
					{
						ScanPlaceholders(posting, filePath, cmsContext);
					}
					continue;
				}

				Channel child = ci as Channel;
				if (child != null && child.IsRobotFollowable)
				{
					WalkTree(child, filePath, cmsContext);
				}
			}
		}
	}
}
