% Chapter in an edited collection: the chapter authors differ from the book
% editors, so the entry is typed as incollection and the containing book's
% title lives in booktitle (it was previously stored in series).
% NOTE(review): volume = 1 looks like an export default rather than a real
% series volume number -- confirm against the publisher's record.
@incollection{namata:tmbook09,
  author    = {Namata, Galileo and Sen, Prithviraj and Bilgic, Mustafa and Getoor, Lise},
  title     = {Collective Classification for Text Classification},
  booktitle = {Text Mining: Classification, Clustering, and Applications},
  editor    = {Sahami, Mehran and Srivastava, Ashok},
  chapter   = {3},
  pages     = {51--69},
  volume    = {1},
  edition   = {First},
  publisher = {Taylor and Francis Group},
  year      = {2009},
  abstract  = {Text classification, the classification of text documents according to categories or topics, is an important component of any text processing system. There is a large body of work which makes use of content{\textendash}the words appearing in the documents, the structure of the documents{\textendash}and external sources to build accurate document classifiers. In addition, there is a growing body of literature on methods which attempt to make use of the link structure among the documents in order to improve document classification performance. Text documents can be connected together in a variety of ways. The most common link structure is the citation graph: eg, papers cite other papers and webpages link to other webpages. But links among papers can be constructed from other relationships such as co-author, co-citation, appearance at a conference venue, and others. All of these can be combined together to create a interlinked collection of text documents. In these cases, we are often not interested in determining the topic of just a single document, but we have a collection of unlabeled (or partially labeled) documents, and we want to correctly infer values for all of the missing labels.},
}

% Chapter in an edited collection; same conventions as the entry above.
% NOTE(review): volume = 1 looks like an export default rather than a real
% series volume number -- confirm against the publisher's record.
@incollection{deshpande:mmudchapter09,
  author    = {Deshpande, Amol and Getoor, Lise and Sen, Prithviraj},
  title     = {Graphical Models for Uncertain Data},
  booktitle = {Managing and Mining Uncertain Data},
  editor    = {Aggarwal, Charu},
  chapter   = {1},
  pages     = {1--34},
  volume    = {1},
  edition   = {First},
  publisher = {Springer},
  year      = {2009},
  abstract  = {Graphical models are a popular and well-studied framework for compact representation of a joint probability distribution over a large number of interdependent variables, and for efficient reasoning about such a distribution. They have been proven useful in a wide range of domains from natural language processing to computer vision to bioinformatics. In this chapter, we present an approach to using graphical models for managing and querying large-scale uncertain databases. We present a unified framework based on the concepts from graphical models that can model not only tuple-level and attribute-level uncertainties, but can also handle arbitrary correlations that may be present among the data; our framework can also naturally capture shared correlations where the same uncertainties and correlations occur repeatedly in the data. We develop an efficient strategy for query evaluation over such probabilistic databases by casting the query processing problem as an inference problem in an appropriately constructed graphical model, and present optimizations specific to probabilistic databases that enable efficient query evaluation. We conclude the chapter with a discussion of related and future work on these topics.},
}