Good afternoon (morning, evening, night)! I had one small problem: there is a page with a table (specifically this one ) and I have to turn it into datatable, which I did, but the html table contains merged rows (rowspan), which shifts the following elements after rowspan lines:
GetWebPageCode html = new GetWebPageCode(); public MainWindow() { InitializeComponent(); var data = html.GetDataTable(html.GetCodeAsString("http://www.gks.ru/metod/XML/XML_plan_2017%20.htm")); } class.cs:
class GetWebPageCode { public bool RemoveComment { get; set; } public string SplitTag { get; set; } /// <summary> /// Получает исходный код страницы <see cref="urlAddress"/>, автоматически убирает комментарии и оставляет только таблицу /// </summary> /// <param name="urlAddress">http-страницы, которой надо получить исходный код</param> /// <returns></returns> public string GetCodeAsString(string urlAddress) { string tag = "table"; string data; HttpWebRequest request = (HttpWebRequest)WebRequest.Create(urlAddress); HttpWebResponse response = (HttpWebResponse)request.GetResponse(); if (response.StatusCode == HttpStatusCode.OK) { Stream receiveStream = response.GetResponseStream(); StreamReader readStream = null; if (response.CharacterSet == null) { readStream = new StreamReader(receiveStream); } else { readStream = new StreamReader(receiveStream, Encoding.Default); } data = OnlyTable(RemoveHTMLComments(readStream.ReadToEnd()),tag); response.Close(); readStream.Close(); } else { data = response.StatusCode.ToString(); } return data; } /// <summary> /// Формирует DataTable из предоставленного кода /// </summary> /// <param name="input">Код должен состоять только из кода таблицы (между тэгами table)</param> /// <returns></returns> public DataTable GetDataTable(string input) { string[] str = Regex.Matches(input, @"\<tr.*?\</tr>").Cast<Match>().Select(m => m.Value).ToArray(); string[] cols = Regex.Matches(str[1], @"\<td.*?\</td>").Cast<Match>().Select(m => m.Value).ToArray(); Tag title = tdparse(Regex.Matches(str[0], @"\<td.*?\</td>").Cast<Match>().Select(m => m.Value).ToArray()[0]); string tableName = GetValueOfTag(title.Value,"a"); DataTable output = new DataTable(tableName); foreach (var col in cols) { output.Columns.Add(Regex.Replace(col, "<.*?>", String.Empty)); } //List<List<Tag>> prev_cell = new List<List<Tag>>(); for (int row = 3; row < str.Length; row++) { List<string> cells = Regex.Matches(Regex.Replace(str[row], "<br>", ""), @"\<td.*?\</td>").Cast<Match>().Select(m => m.Value).ToList(); //List<Tag> cells_tag = new List<Tag>(); DataRow dtrow = output.NewRow(); for (int column = 0; column < cells.Count/*cols.Length*/; column++) { Tag tdcell = new Tag();/* if ((prev_cell.Count > 0) &&prev_cell[row - 4][column].Attributes.ContainsKey("rowspan") && (Convert.ToInt32(prev_cell[row - 4][column].Attributes["rowspan"]) > 0)) { prev_cell[row - 4][column].Attributes["rowspan"] = (Convert.ToInt32(prev_cell[row - 4][column].Attributes["rowspan"]) - 1).ToString(); tdcell = prev_cell[row - 4][column]; cells.Insert(column, maketag(prev_cell[row - 4][column])); } else {*/ tdcell = tdparse(cells[column]); //} dtrow[column] = tdcell.Value; //cells_tag.Add(tdcell); } output.Rows.Add(dtrow); //prev_cell.Add(cells_tag); } return output; } #region Private Methods private string OnlyTable(string input, string tag) { string opentag = "<" + tag + " "; string closetag = "</" + tag + ">"; string output = string.Empty; string[] temp = System.Text.RegularExpressions.Regex.Split(input, opentag); foreach (string s in temp) { string str = string.Empty; if (s.Contains(closetag)) { str = opentag + s.Substring(0, s.IndexOf(closetag)) + closetag; } if (str.Trim() != string.Empty) { output = output + str.Trim(); } } output = output.Replace(System.Environment.NewLine, ""); return output; } private string RemoveHTMLComments(string input) { string output = string.Empty; string[] temp = System.Text.RegularExpressions.Regex.Split(input, "<!--"); foreach (string s in temp) { string str = string.Empty; if (!s.Contains("-->")) { str = s; } else { str = s.Substring(s.IndexOf("-->") + 3); } if (str.Trim() != string.Empty) { output = output + str.Trim(); } } return output; } private Tag tdparse(string input) { Tag output = new Tag() { Name = "td" }; output.Value = GetValueOfTag(input, output.Name); output.Attributes = GetAttributesOfTag(input, output.Name); return output; } private string maketag(Tag input) { string output = string.Empty; string attributes = string.Empty; foreach (var attr in input.Attributes) { attributes += attr.Key + "=" + attr.Value + " "; } output = "<" + input.Name + " " + attributes + ">" + input.Value + "</" + input.Name + ">"; return output; } private Dictionary<string, string> GetAttributesOfTag(string TagStr, string TagName) { var output = new Dictionary<string, string>(); string cell = Regex.Matches(TagStr, @"\<" + TagName + @".*?\>").Cast<Match>().Select(m => m.Value).ToList()[0]; cell = Regex.Replace(cell,@"\<"+TagName,""); cell = Regex.Replace(cell, @"\>", ""); string s = cell.Substring(0, cell.IndexOf(@"='") + 2) + Regex.Replace(cell.Substring(cell.IndexOf(@"='") + 2, cell.Length - (cell.IndexOf(@"='") + 2))," ",""); string[] ss = s.Split(' '); foreach(var item in ss) { if (string.IsNullOrWhiteSpace(item)) continue; output.Add(item.Split('=')[0], item.Split('=')[1]); } return output; } private string GetValueOfTag(string TagStr,string TagName) { string output = ""; string value = Regex.Replace(TagStr, @"\<" + TagName + @".*?\>", ""); output = Regex.Replace(value, @"\</" + TagName + @"\>", ""); return output; } #endregion } class Tag { public string Name { get; set; } public Dictionary<string,string> Attributes { get; set; } public string Value { get; set; } } Please tell me how to fix this bug.
PS: I know about third-party libraries, so do not offer.