using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
// For the Porter stemmer, use http://tartarus.org/~martin/PorterStemmer/csharp3.txt
// Licensing: http://tartarus.org/~martin/PorterStemmer/ "all these encodings of the algorithm can be used free of charge for any purpose"
using Poseidon.Analysis; // The Porter Stemmer implementation namespace

namespace Terminology
{
    class Program
    {
        static char[] cSplitters = {' ', '\r', '\n', '\t', '\\', '/', ':', ';', '.', ',', '\'', '`', '"', ')', '(', '{', '}', '[', ']', '!', '?', '-', '–', '>', '<', '~', '=', '°', '#', '%', '@', '$', '^', '&', '*', '+', '_', '|', '±', '’', '≈', '—'};

        // The default stop-word list is from https://meta.wikimedia.org/wiki/Stop_word_list/google_stop_word_list#English
        // plus some words specific to the scientific context: us, can, new, since, et, al, el, fig, will, whose, and their suffixes like re, ve, ll
        static List<string> mStopWords = null;

        const int nMinLenToAccept = 2;

        // Reads a text file into a raw bag of words: lower-cased tokens with empty strings,
        // too-short tokens, pure numbers, and stop words removed.
        private static List<string> FileToRawBag(string sFileName)
        {
            string sAll = File.ReadAllText(sFileName);
            string[] sTokens = sAll.Split(cSplitters);
            List<string> Ret = new List<string>();
            foreach (string t in sTokens)
            {
                string tlow = t.ToLower();
                if (String.IsNullOrEmpty(tlow)) continue;
                if (tlow.Length < nMinLenToAccept) continue;
                if (IsAllDigits(tlow)) continue;
                if (mStopWords.Contains(tlow)) continue;
                Ret.Add(tlow);
            }
            return Ret;
        }

        private static bool IsAllDigits(string s)
        {
            foreach (char c in s)
            {
                if (!Char.IsDigit(c)) return false;
            }
            return true;
        }
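        // ComputeOverlapSimilarity below is a cosine similarity between two bags of words,
        // with every term weighted by its IDF value (the weight enters squared because it is
        // applied to both vectors):
        //   sim(A, B) = sum over shared terms t of (a_t * idf_t) * (b_t * idf_t) / (||A|| * ||B||)
        // where a_t, b_t are raw term counts and ||A||, ||B|| are the IDF-weighted Euclidean norms.
        // RelSupport additionally reports the number of distinct shared terms relative to the
        // total term count of the smaller bag -- a rough indicator of how much evidence
        // backs the similarity value.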
        private static double ComputeOverlapSimilarity(Dictionary<string, double> dA, Dictionary<string, double> dB, Dictionary<string, double> idf, out double RelSupport)
        {
            List<string> Intersect = dA.Keys.Intersect(dB.Keys).ToList();
            RelSupport = ((double)Intersect.Count) / Math.Min(dA.Values.Sum(), dB.Values.Sum());

            double ScalarProduct = 0;
            foreach (string s in Intersect)
            {
                ScalarProduct += dA[s] * dB[s] * idf[s] * idf[s];
            }
            double NormA = 0;
            foreach (string s in dA.Keys) NormA += dA[s] * dA[s] * idf[s] * idf[s];
            NormA = Math.Sqrt(NormA);
            double NormB = 0;
            foreach (string s in dB.Keys) NormB += dB[s] * dB[s] * idf[s] * idf[s];
            NormB = Math.Sqrt(NormB);
            return ScalarProduct / (NormA * NormB);
        }

        private static void CompareTexts(string sInDataRoot)
        {
            Console.WriteLine("Processing \"{0}\"", sInDataRoot);

            // For simple testing runs, the expected classes may be provided.
            // If they are, read them for later comparison.
            Dictionary<string, string> Expectations = null;
            if (File.Exists(sInDataRoot + @"\Expected.txt"))
            {
                Expectations = new Dictionary<string, string>();
                string[] L = File.ReadAllLines(sInDataRoot + @"\Expected.txt");
                foreach (string s in L)
                {
                    string[] T = s.Split('\t');
                    Expectations.Add(T[0], T[1]);
                }
            }

            mStopWords = File.ReadAllText(sInDataRoot + "StopWords.txt").Split(',').ToList();

            PorterStemmer PS = new PorterStemmer();

            Dictionary<string, double> idf = new Dictionary<string, double>();
            Dictionary<string, Dictionary<string, double>> FilteredBags = new Dictionary<string, Dictionary<string, double>>();
            DirectoryInfo diRoot = new DirectoryInfo(sInDataRoot);
            DirectoryInfo[] DI = diRoot.GetDirectories();
            int nDocs = 0;
            foreach (DirectoryInfo di in DI)
            {
                string sKey = di.Name;
                FilteredBags.Add(sKey, new Dictionary<string, double>());
                FileInfo[] FI = di.GetFiles("*.txt");
                foreach (FileInfo fi in FI)
                {
                    List<string> b = FileToRawBag(fi.FullName);
                    nDocs++;

                    // We need a fresh list per document for the IDF calculation --
                    // so read into the list first, then update the dictionary later.
                    List<string> f = new List<string>();
                    foreach (string s in b)
                    {
                        string t = PS.StemWord(s);
                        if (String.IsNullOrEmpty(t)) continue;
                        if (t.Length < nMinLenToAccept) continue;
                        f.Add(t);
                    }

                    // Update the IDF list -- for now, raw document counts only
                    List<string> d = f.Distinct().ToList();
                    foreach (string s in d)
                    {
                        if (idf.ContainsKey(s)) idf[s]++;
                        else idf.Add(s, 1);
                    }

                    // Now update the bag-of-words dictionary
                    foreach (string s in f)
                    {
                        if (FilteredBags[sKey].ContainsKey(s)) FilteredBags[sKey][s]++;
                        else FilteredBags[sKey].Add(s, 1);
                    }
                }
            }

            // Invert the raw IDF counts and add the term-length weights
            List<string> idfKeys = idf.Keys.ToList();
            foreach (string s in idfKeys) idf[s] = Math.Log(nDocs / idf[s]) * (1.0 + Math.Log(s.Length));
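            // Classification scheme implemented by the loop below: every folder's bag is compared
            // against every other folder's bag, and the "foreign" bag with the highest similarity
            // becomes the predicted class for that folder. For each folder we also track the ratio
            // of the best foreign similarity to the next-best one; the final "classification
            // quality" is the geometric mean of these ratios -- the larger it is, the more clearly
            // the winning class stands out from the runner-up.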
            // Now the actual computation
            int N = FilteredBags.Keys.Count;
            double[,] J = new double[N, N]; // This will store our similarity matrix
            double[,] R = new double[N, N]; // The support matrix -- for tracking only
            List<string> BagKeys = FilteredBags.Keys.ToList();
            int nWrongClasses = 0;
            double Quality = 1;
            for (int i = 0; i < N; i++)
            {
                double MaxForeignJ = double.MinValue;
                double NextForeignJ = double.MinValue;
                string Class = null;
                for (int j = 0; j < N; j++)
                {
                    double RelativeSupport = double.NaN;
                    J[i, j] = ComputeOverlapSimilarity(FilteredBags[BagKeys[i]], FilteredBags[BagKeys[j]], idf, out RelativeSupport);
                    R[i, j] = RelativeSupport;

                    // This if-else block tracks the quality of distinction between the "best match"
                    // class and the next best one. We use it to quantify the method's efficiency.
                    if (i != j)
                    {
                        if (J[i, j] > MaxForeignJ)
                        {
                            NextForeignJ = MaxForeignJ;
                            MaxForeignJ = J[i, j];
                            Class = BagKeys[j];
                        }
                        else if (J[i, j] > NextForeignJ)
                        {
                            NextForeignJ = J[i, j];
                        }
                    }
                }
                if ((null != Expectations) && (Class != Expectations[BagKeys[i]]))
                {
                    nWrongClasses++;
                    Console.WriteLine("Wrong classification for {0}: expected {1}, classified as {2}", BagKeys[i], Expectations[BagKeys[i]], Class);
                }
                Quality *= MaxForeignJ / NextForeignJ;
            }
            if (null != Expectations) Console.WriteLine("Total misclassifications: {0}", nWrongClasses);

            // Output the results now, both to the screen and to a file
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < N; i++)
            {
                sb.Append(",");
                sb.Append(BagKeys[i]);
            }
            for (int i = 0; i < N; i++)
            {
                Console.WriteLine();
                sb.Append(Environment.NewLine);
                sb.Append(BagKeys[i]);
                for (int j = 0; j < N; j++)
                {
                    Console.WriteLine("{0} to {1}: {2}% (support: {3})", BagKeys[i], BagKeys[j], (100.0 * J[i, j]).ToString("G3"), R[i, j].ToString("G3"));
                    sb.Append(String.Format(",{0}% ({1})", (100.0 * J[i, j]).ToString("G3"), R[i, j].ToString("G3")));
                }
            }
            Console.WriteLine("Classification quality: {0}", Math.Pow(Quality, 1.0 / FilteredBags.Keys.Count));

            string[] dirT = diRoot.FullName.Split('\\');
            string sOutFileName = dirT[dirT.Length - 2] + ".Results.csv"; // A quick hack to produce an output file name. This would fail if the data sits in the root of a drive, e.g. at D:\
            if (File.Exists(sOutFileName)) File.Delete(sOutFileName);
            File.WriteAllText(sOutFileName, sb.ToString());
            Console.WriteLine();
        }

        static void Main(string[] args)
        {
            CompareTexts(@"..\..\..\WorkSet4-GasesVsMusic\");
            CompareTexts(@"..\..\..\WorkSet5-LPSCWorks\");
            CompareTexts(@"..\..\..\WorkSet7-Wikipedia\");
            Console.ReadLine();
        }
    }
}
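// Expected input layout, as inferred from the code above (an assumption, not documented in the
// original sources): each work-set root folder contains one subfolder per text to compare, with
// the raw material in *.txt files inside it; a comma-separated StopWords.txt in the root; and,
// optionally, a tab-separated Expected.txt whose lines have the form
// "<subfolder name>\t<name of the subfolder it is expected to match best>"
// for checking the classification results.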