Structured prediction models have been found to learn effectively from a few large examples {\textemdash} sometimes even just one. Despite empirical evidence, canonical learning theory cannot guarantee generalization in this setting because the error bounds decrease as a function of the number of examples. We therefore propose new PAC-Bayesian generalization bounds for structured prediction that decrease as a function of both the number of examples and the size of each example. Our analysis hinges on the stability of joint inference and the smoothness of the data distribution. We apply our bounds to several common learning scenarios, including max-margin and soft-max training of Markov random fields. Under certain conditions, the resulting error bounds can be far more optimistic than previous results and can even guarantee generalization from a single large example.

}, keywords = {PAC-Bayes, generalization bounds, learning theory, structured prediction}, author = {Ben London and Bert Huang and Lise Getoor} } @conference {pujara:uai15, title = {Budgeted Online Collective Inference}, booktitle = {UAI}, year = {2015}, abstract = {Updating inference in response to new evidence is a fundamental challenge in artificial intelligence. Many real problems require large probabilistic graphical models, containing millions of interdependent variables. For such large models, jointly updating the most likely (i.e., MAP) configuration of the variables each time new evidence is encountered can be infeasible, even if inference is tractable. In this paper, we introduce budgeted online collective inference, in which the MAP configuration of a graphical model is updated efficiently by revising the assignments to a subset of the variables while holding others fixed. The goal is to selectively update certain variables without sacrificing quality with respect to full inference. To formalize the consequences of partially updating inference, we introduce the concept of inference regret. We derive inference regret bounds for a class of graphical models with strongly-convex free energies. These theoretical insights, combined with a thorough analysis of the optimization solver, motivate new approximate methods for efficiently updating the variable assignments under a budget constraint. In experiments, we demonstrate that our algorithms can reduce inference time by 65\% with accuracy comparable to full inference.

}, author = {Jay Pujara and Ben London and Lise Getoor} } @article {namata:tkdd15, title = {Collective Graph Identification}, journal = {TKDD}, volume = {10}, number = {3}, year = {2015}, chapter = {1--36}, abstract = {Data describing networks{\textemdash}such as communication networks, transaction networks, disease transmission networks, collaboration networks, etc.{\textemdash}are becoming increasingly available. While observational data can be useful, it often only hints at the actual underlying process that governs interactions and attributes. For example, an email communication network provides insight into its users and their relationships, but is not the same as the {\textquotedblleft}real{\textquotedblright} underlying social network. In this article, we introduce the problem of graph identification, i.e., discovering the latent graph structure underlying an observed network. We cast the problem as a probabilistic inference task, in which we must infer the nodes, edges, and node labels of a hidden graph, based on evidence. This entails solving several canonical problems in network analysis: entity resolution (determining when two observations correspond to the same entity), link prediction (inferring the existence of links), and node labeling (inferring hidden attributes). While each of these subproblems has been well studied in isolation, here we consider them as a single, collective task. We present a simple, yet novel, approach to address all three subproblems simultaneously. Our approach, which we refer to as C3, consists of a collection of Coupled Collective Classifiers that are applied iteratively to propagate inferred information among the subproblems. We consider variants of C3 using different learning and inference techniques and empirically demonstrate that C3 is superior, both in terms of predictive accuracy and running time, to state-of-the-art probabilistic approaches on four real problems.

}, author = {Galileo Namata and Ben London and Lise Getoor} } @conference {london:icml15, title = {The Benefits of Learning with Strongly Convex Approximate Inference}, booktitle = {ICML}, year = {2015}, abstract = {We explore the benefits of strongly convex free energies in variational inference, providing both theoretical motivation and a new meta-algorithm. Using the duality between strong convexity and stability, we prove a high-probability bound on the error of learned marginals that is inversely proportional to the modulus of convexity of the free energy, thereby motivating free energies whose moduli are constant with respect to the size of the graph. We identify sufficient conditions for Ω(1)-strong convexity in two popular variational techniques: tree-reweighted and counting number entropies. Our insights for the latter suggest a novel counting number optimization framework, which guarantees strong convexity for any given modulus. Our experiments demonstrate that learning with a strongly convex free energy, using our optimization framework to guarantee a given modulus, results in substantially more accurate marginal probabilities, thereby validating our theoretical claims and the effectiveness of our framework.

}, author = {Ben London and Bert Huang and Lise Getoor} } @conference {london:sptli13, title = {Collective Activity Detection using Hinge-loss Markov Random Fields}, booktitle = {CVPR Workshop on SPTLE}, year = {2013}, abstract = {We propose hinge-loss Markov random fields (HL-MRFs), a powerful class of continuous-valued graphical models, for high-level computer vision tasks. HL-MRFs are characterized by log-concave density functions, and are able to perform efficient, exact inference. Their templated hinge-loss potential functions naturally encode soft-valued logical rules. Using the declarative modeling language probabilistic soft logic, one can easily define HL-MRFs via familiar constructs from first-order logic. We apply HL-MRFs to the task of activity detection, using principles of collective classification. Our model is simple, intuitive and interpretable. We evaluate our model on two datasets and show that it achieves significant lift over the low-level detectors.

}, author = {Ben London and Sameh Khamis and Stephen Bach and Bert Huang and Lise Getoor and Larry Davis} } @book {london:book13, title = {Collective Classification of Network Data}, series = {Data Classification: Algorithms and Applications}, volume = {1}, year = {2013}, note = {May differ from the published version}, pages = {399--416}, publisher = {CRC Press}, organization = {CRC Press}, edition = {1}, chapter = {15}, author = {Ben London and Lise Getoor}, editor = {Charu Aggarwal} } @conference {london:icml13, title = {Collective Stability in Structured Prediction: Generalization from One Example}, booktitle = {ICML}, year = {2013}, abstract = {Structured predictors enable joint inference over multiple interdependent output variables. These models are often trained on a small number of examples with large internal structure. Existing distribution-free generalization bounds do not guarantee generalization in this setting, though this contradicts a large body of empirical evidence from computer vision, natural language processing, social networks and other fields. In this paper, we identify a set of natural conditions {\textendash} weak dependence, hypothesis complexity and a new measure, collective stability {\textendash} that are sufficient for generalization from even a single example, without imposing an explicit generative model of the data. We then demonstrate that the complexity and stability conditions are satisfied by a broad class of models, including marginal inference in templated graphical models. We thus obtain uniform convergence rates that can decrease significantly faster than previous bounds, particularly when each structured example is sufficiently large and the number of training examples is constant, even one.

}, author = {Ben London and Bert Huang and Benjamin Taskar and Lise Getoor} } @conference {huang:slg13, title = {Empirical Analysis of Collective Stability}, booktitle = {ICML Workshop on SLG}, year = {2013}, abstract = {When learning structured predictors, collective stability is an important factor for generalization. London et al. (2013) provide the first analysis of this effect, proving that collectively stable hypotheses produce less deviation between empirical risk and true risk, i.e., defect. We test this effect empirically using a collectively stable variant of maxmargin Markov networks. Our experiments on webpage classification validate that increasing the collective stability reduces the defect and can thus lead to lower overall test error

}, author = {Bert Huang and Ben London and Benjamin Taskar and Lise Getoor} } @conference {london:nips12spectral, title = {Multi-relational Weighted Tensor Decomposition}, booktitle = {NIPS Workshop on SL}, year = {2012}, author = {Ben London and Theodoros Rekatsinas and Bert Huang and Lise Getoor} } @conference {namata:mlg12-wkshp, title = {Query-driven Active Surveying for Collective Classification}, booktitle = {ICML Workshop on MLG}, year = {2012}, abstract = {In network classification problems such as those found in intelligence gathering, public health, and viral marketing, one is often only interested in inferring the labels of a subset of the nodes. We refer to this subset as the query set, and define the problem as query-driven collective classification. We study this problem in a practical active learning framework, in which the learning algorithm can survey non-query nodes to obtain their labels and network structure. We derive a surveying strategy aimed toward optimal inference on the query set. Considering both feature and structural smoothness, concepts that we formally define, we develop an algorithm which adaptively selects survey nodes by estimating which form of smoothness is most appropriate. We evaluate our algorithm on several network datasets and demonstrate its improvements over standard active learning methods.

}, author = {Galileo Namata and Ben London and Lise Getoor and Bert Huang} }