今天针对PIPE组对数据表的修改,对建立倒排索引做了系统的修改,由于表DOC、VEDIO、QUESTION(由QAPAIR修改为QUESTION)的属性并不完全相同,处理数据方法进行少量修改:
DOC表和VEDIO表具有的相同属性:title;
DOC独有属性:author,keywords;
QUESTION独有属性:question;
3个表最后得到的倒排索引结构是相同的,得到WORDLIST和对应ID;
以下功能整合到分词模块和更新倒排索引模块中
/// <summary>
/// Word segmentation: collects the index terms for the row the reader is
/// currently positioned on.
/// </summary>
/// <param name="type">
/// Source table selector: 0 = DOC (title, keywords, author),
/// 1 = VEDIO (title only), any other value = QUESTION (question text).
/// </param>
/// <param name="reader">Reader positioned on the row to process.</param>
/// <returns>The terms extracted from the row's indexed columns.</returns>
private static List<string> getWords(int type, SqlDataReader reader)
{
    List<string> listall;
    if (type == 0)
    {
        string title = reader[_Title].ToString();
        string keyword = reader[_KeyWords].ToString();
        string author = reader[_Author].ToString();

        // Titles go through the segmenter; keywords and authors are already
        // delimited, so they are simply split on their separators.
        // NOTE(review): word_segmentation is assumed to return List<string> - confirm.
        List<string> list1 = ChineseWordSegmentation.word_segmentation(title);
        List<string> list2 = keyword.Split(new char[2] { ' ', ':' }, StringSplitOptions.RemoveEmptyEntries).ToList();
        List<string> list3 = author.Split(new char[2] { ' ', '.' }, StringSplitOptions.RemoveEmptyEntries).ToList();

        // Union removes duplicate terms across the three sources.
        // The description column is not indexed.
        listall = list1.Union(list2).Union(list3).ToList();
    }
    else if (type == 1)
    {
        // Only the title is indexed for this table; the description column is not.
        string title = reader[_Title].ToString();
        listall = ChineseWordSegmentation.word_segmentation(title);
    }
    else
    {
        string question = reader[_Question].ToString();
        listall = ChineseWordSegmentation.word_segmentation(question);
    }
    return listall;
}

/// <summary>
/// Updates the inverted index: for every word, either inserts a new
/// (key, value) row or appends <paramref name="ID"/> to the comma-separated
/// ID list already stored for that word.
/// </summary>
/// <param name="words">Terms extracted from one source row.</param>
/// <param name="connection">An open connection to the index database.</param>
/// <param name="ID">Identifier of the source row the terms came from.</param>
private static void updateIndex(List<string> words, SqlConnection connection, string ID)
{
    // NOTE(review): the target table is hard-coded as index3 with columns
    // (key, value); confirm this matches the tables the caller creates.
    foreach (string word in words)
    {
        // Look the word up by its key column. KEY is a reserved word in
        // T-SQL, so the column names are bracketed. Values are passed as
        // parameters instead of being written literally into the SQL text.
        object val;
        using (SqlCommand select = new SqlCommand("SELECT [value] FROM index3 WHERE [key] = @word", connection))
        {
            select.Parameters.AddWithValue("@word", word);
            val = select.ExecuteScalar();
        }

        if (val == null)
        {
            // ExecuteScalar returns null (not DBNull) when no row matched:
            // the word is new, so add it to the inverted table.
            using (SqlCommand insert = new SqlCommand("INSERT INTO index3 VALUES(@word, @id)", connection))
            {
                insert.Parameters.AddWithValue("@word", word);
                insert.Parameters.AddWithValue("@id", ID);
                insert.ExecuteNonQuery();
            }
        }
        else
        {
            // The word already exists: append this ID to its stored list.
            // A row whose value is NULL or empty simply receives the ID.
            string existing = (val == System.DBNull.Value) ? "" : val.ToString();
            string newValue = (existing.Length == 0) ? ID : existing + "," + ID;
            using (SqlCommand update = new SqlCommand("UPDATE index3 SET [value] = @newValue WHERE [key] = @word", connection))
            {
                update.Parameters.AddWithValue("@newValue", newValue);
                update.Parameters.AddWithValue("@word", word);
                update.ExecuteNonQuery();
            }
        }
    }
}
主函数部分:
List<string> resultList = new List<string>();
string connectionString = GetConnectionString(); // SQL Server connection string
using (SqlConnection connection = new SqlConnection(connectionString))
{
    connection.Open();

    SqlCommand cmd = new SqlCommand();
    cmd.Connection = connection;

    // Create one inverted-index table per source table. KEY is a reserved
    // word in T-SQL, so the column name has to be bracketed.
    // NOTE(review): updateIndex writes to a table named index3, not to the
    // tables created here - confirm which one is intended.
    string[] indexTables = { "index_doc", "index_vedio", "index_question" };
    foreach (string indexTable in indexTables)
    {
        cmd.CommandText = "CREATE table " + indexTable + "([key] varchar(50) primary key, ID varchar(50))";
        cmd.ExecuteNonQuery();
    }

    for (int i = 0; i < 3; i++)
    {
        string table;
        if (i == 0)
        {
            table = _TableDoc;
        }
        else if (i == 1)
        {
            table = _TableVideo;
        }
        else
        {
            table = _TableQuestion;
        }

        // Read the source table. The space after FROM is required;
        // without it the table name is glued onto the keyword.
        cmd.CommandText = "SELECT * FROM " + table;

        // Buffer (ID, words) pairs first. updateIndex issues its own commands
        // on this connection, which is not allowed while a reader is still
        // open unless MARS is enabled, so the reader is closed before updating.
        List<KeyValuePair<string, List<string>>> pending = new List<KeyValuePair<string, List<string>>>();
        using (SqlDataReader reader = cmd.ExecuteReader())
        {
            while (reader.Read())
            {
                string ID = reader[_ID].ToString();
                // Word segmentation for the current row.
                List<string> words = getWords(i, reader);
                pending.Add(new KeyValuePair<string, List<string>>(ID, words));
            }
        }

        // Add the collected terms to the inverted index.
        foreach (KeyValuePair<string, List<string>> entry in pending)
        {
            updateIndex(entry.Value, connection, entry.Key);
        }
    }
    // (The excerpt ends here; the using block opened above is still open at this point.)