@article {353, title = {A Collective, Probabilistic Approach to Schema Mapping Using Diverse Noisy Evidence}, journal = {IEEE Transactions on Knowledge and Data Engineering (TKDE)}, volume = {31}, year = {2019}, pages = {1426--1439}, abstract = {We propose a probabilistic approach to the problem of schema mapping. Our approach is declarative, scalable, and extensible. It builds upon recent results in both schema mapping and probabilistic reasoning and contributes novel techniques in both fields. We introduce the problem of schema mapping selection, that is, choosing the best mapping from a space of potential mappings, given both metadata constraints and a data example. As selection has to reason holistically about the inputs and the dependencies between the chosen mappings, we define a new schema mapping optimization problem which captures interactions between mappings as well as inconsistencies and incompleteness in the input. We then introduce Collective Mapping Discovery (CMD), our solution to this problem using state-of-the-art probabilistic reasoning techniques. 
Our evaluation on a wide range of integration scenarios, including several real-world domains, demonstrates that CMD effectively combines data and metadata information to infer highly accurate mappings even with significant levels of noise.}, keywords = {Cognition, Complexity theory, Data engineering, Knowledge engineering, Metadata, Probabilistic logic, Schema mapping, Task analysis, collective mapping discovery, data integration, inference mechanisms, meta data, optimisation, optimization, potential mappings, probabilistic reasoning techniques, probability, schema mapping optimization problem, uncertainty handling}, doi = {10.1109/TKDE.2018.2865785}, author = {Angelika Kimmig and Alex Memory and Renee J Miller and Lise Getoor} } @article {331, title = {Collective Entity Resolution in Multi-Relational Familial Networks}, journal = {Knowledge and Information Systems (KAIS)}, volume = {61}, year = {2018}, pages = {1547--1581}, abstract = {

Entity resolution in settings with rich relational structure often introduces complex dependencies between co-references. Exploiting these dependencies is challenging -- it requires seamlessly combining statistical, relational, and logical dependencies. One task of particular interest is entity resolution in familial networks. In this setting, multiple partial representations of a family tree are provided, from the perspective of different family members, and the challenge is to reconstruct a family tree from these multiple, noisy, partial views. This reconstruction is crucial for applications such as understanding genetic inheritance, tracking disease contagion, and performing census surveys. Here, we design a model that incorporates statistical signals (such as name similarity), relational information (such as sibling overlap), logical constraints (such as transitivity and bijective matching), and predictions from other algorithms (such as logistic regression and support vector machines), in a collective model. We show how to integrate these features using probabilistic soft logic, a scalable probabilistic programming framework. In experiments on real-world data, our model significantly outperforms state-of-the-art classifiers that use relational features but are incapable of collective reasoning.

}, author = {Pigi Kouki and Jay Pujara and Christopher Marcum and Laura Koehly and Lise Getoor} } @conference {334, title = {The Impact of Environmental Stressors on Human Trafficking}, booktitle = {ICWSM Workshop on Beyond Online Data (BOD)}, year = {2018}, abstract = {

Severe environmental events have extreme effects on all segments of society, including criminal activity. Extreme weather events, such as tropical storms, fires, and floods create instability in communities, and can be exploited by criminal organizations. Here we investigate the potential impact of catastrophic storms on the criminal activity of human trafficking. We propose three theories of how these catastrophic storms might impact trafficking and provide evidence for each. Researching human trafficking is made difficult by its illicit nature and the obscurity of high-quality data. Here, we analyze online advertisements for services which can be collected at scale and provide insights into traffickers{\textquoteright} behavior. To successfully combine relevant heterogenous sources of information, as well as spatial and temporal structure, we propose a collective, probabilistic approach. We implement this approach with Probabilistic Soft Logic, a probabilistic programming framework which can flexibly model relational structure and for which inference of future locations is highly efficient. Furthermore, this framework can be used to model hidden structure, such as latent links between locations. Our proposed approach can model and predict how traffickers move. In addition, we propose a model which learns connections between locations. This model is then adapted to have knowledge of environmental events, and we demonstrate that incorporating knowledge of environmental events can improve prediction of future locations. While we have validated our models on the impact of severe weather on human trafficking, we believe our models can be generalized to a variety of other settings in which environmental events impact human behavior.

}, author = {Tomkins, Sabina and Golnoosh Farnadi and Brian Amantullah and Lise Getoor and Steven Minton} } @conference {338, title = {The Impact of Environmental Stressors on Human Trafficking}, booktitle = {International Conference on Data Mining (ICDM)}, year = {2018}, abstract = {

Severe environmental events have extreme effects on all segments of society, including criminal activity. Extreme weather events, such as tropical storms, fires, and floods create instability in communities, and can be exploited by criminal organizations. Here we investigate the potential impact of catastrophic storms on the criminal activity of human trafficking. We propose three theories of how these catastrophic storms might impact trafficking and provide evidence for each. Researching human trafficking is made difficult by its illicit nature and the obscurity of high-quality data. Here, we analyze online advertisements for services which can be collected at scale and provide insights into traffickers{\textquoteright} behavior. To successfully combine relevant heterogenous sources of information, as well as spatial and temporal structure, we propose a collective, probabilistic approach. We implement this approach with Probabilistic Soft Logic, a probabilistic programming framework which can flexibly model relational structure and for which inference of future locations is highly efficient. Furthermore, this framework can be used to model hidden structure, such as latent links between locations. Our proposed approach can model and predict how traffickers move. In addition, we propose a model which learns connections between locations. This model is then adapted to have knowledge of environmental events, and we demonstrate that incorporating knowledge of environmental events can improve prediction of future locations. While we have validated our models on the impact of severe weather on human trafficking, we believe our models can be generalized to a variety of other settings in which environmental events impact human behavior.

}, author = {Tomkins, Sabina and Golnoosh Farnadi and Brian Amantullah and Lise Getoor and Steven Minton} } @conference {kouki:icdm17, title = {Collective Entity Resolution in Familial Networks}, booktitle = {IEEE International Conference on Data Mining (ICDM)}, year = {2017}, note = {To Appear}, abstract = {

Entity resolution in settings with rich relational structure often introduces complex dependencies between coreferences. Exploiting these dependencies is challenging {\textendash} it requires seamlessly combining statistical, relational, and logical dependencies. One task of particular interest is entity resolution in familial networks. In this setting, multiple partial representations of a family tree are provided, from the perspective of different family members, and the challenge is to reconstruct a family tree from these multiple, noisy, partial views. This reconstruction is crucial for applications such as understanding genetic inheritance, tracking disease contagion, and performing census surveys. Here, we design a model that incorporates statistical signals, such as name similarity, relational information, such as sibling overlap, and logical constraints, such as transitivity and bijective matching, in a collective model. We show how to integrate these features using probabilistic soft logic, a scalable probabilistic programming framework. In experiments on real-world data, our model significantly outperforms state-of-the-art classifiers that use relational features but are incapable of collective reasoning.

}, url = {https://github.com/pkouki/icdm2017}, author = {Kouki, Pigi and Pujara, Jay and Marcum, Christopher and Koehly, Laura and Lise Getoor} } @conference {kimmig:icde17, title = {A Collective, Probabilistic Approach to Schema Mapping}, booktitle = {International Conference on Data Engineering (ICDE)}, year = {2017}, url = {https://github.com/alexmemory/kimmig-icde17/wiki}, author = {Kimmig, Angelika and Memory, Alex and Miller, Renee and Lise Getoor} } @article {farnadi:mlj17, title = {Soft quantification in statistical relational learning}, journal = {Machine Learning Journal}, year = {2017}, author = {Golnoosh Farnadi and Bach, Stephen H. and Moens, Marie-Francine and Lise Getoor and De Cock, Martine} } @article {muthiah:aimag16, title = {Capturing Planned Protests from Open Source Indicators}, journal = {AI Mag}, volume = {37}, number = {2}, year = {2016}, pages = {63{\textendash}75}, abstract = {

Civil unrest events (protests, strikes, and {\textquotedblleft}occupy{\textquotedblright} events) are common occurrences in both democracies and authoritarian regimes. The study of civil unrest is a key topic for political scientists as it helps capture an important mechanism by which citizenry express themselves. In countries where civil unrest is lawful, qualitative analysis has revealed that more than 75 percent of the protests are planned, organized, or announced in advance; therefore detecting references to future planned events in relevant news and social media is a direct way to develop a protest forecasting system. We report on a system for doing that in this article. It uses a combination of keyphrase learning to identify what to look for, probabilistic soft logic to reason about location occurrences in extracted results, and time normalization to resolve future time mentions. We illustrate the application of our system to 10 countries in Latin America: Argentina, Brazil, Chile, Colombia, Ecuador, El Salvador, Mexico, Paraguay, Uruguay, and Venezuela. Results demonstrate our successes in capturing significant societal unrest in these countries with an average lead time of 4.08 days. We also study the selective superiorities of news media versus social media (Twitter, Facebook) to identify relevant trade-offs.

}, author = {Sathappan Muthiah and Bert Huang and Jaime Arredondo and David Mares and Lise Getoor and Graham Katz and Naren Ramakrishnan} } @conference {kouki:mlg16, title = {Entity Resolution in Familial Networks}, booktitle = {MLG}, year = {2016}, month = {2016}, abstract = {

Entity resolution is an important graph mining problem. Entity resolution is particularly interesting and challenging when there is rich relational structure. In this paper, we study the problem of performing entity resolution in familial networks. In our setting, we are given partial views of a familial network as described from\  \  the point of view of different people in the network and our goal is to reconstruct the underlying familial network from these perspective partial views. The data and relations provided may be inaccurate, missing or incomplete. In our approach, we start by augmenting the known set of familial relations with additional ones that are either inversed or derived from the original set of relations by linkage heuristics. Additionally, we propose a set of measures that capture the similarity of persons in the familial network based on both personal and relational information. We present a supervised learning approach where we view entity resolution in familial networks as a classification problem. Our experiments on real-world data from multiple-informant pedigrees show that our approach works well and that we can improve performance by considering separate similarity scores for each relation type.

}, author = {Pigi Kouki and Christopher Marcum and Laura Koehly and Lise Getoor} } @conference {kumar:asonam16, title = {Unsupervised Models for Predicting Strategic Relations between Organizations}, booktitle = {ASONAM}, year = {2016}, abstract = {

Microblogging sites like Twitter provide a platform for sharing ideas and expressing opinions. The widespread popularity of these platforms and the complex social structure that arises within these communities provides a unique opportunity to understand the interactions between users. The political domain, especially in a multi-party system, presents compelling challenges, as political parties have different levels of alignment based on their political strategies. We use Twitter to understand the nuanced relationships between differing political entities in Latin America. Our model incorporates diverse signals from the content of tweets and social context from retweets, mentions and hashtag usage. Since direct communications between entities are relatively rare, we explore models based on the posts of users who interact with multiple political organizations. We present a quantitative and qualitative analysis of the results of models using different features, and demonstrate that a model capable of using sentiment strength, social context, and issue alignment has superior performance to less sophisticated baselines.

}, author = {Shachi Kumar and Jay Pujara and Lise Getoor and David Mares and Dipak Gupta and Ellen Riloff} } @article {rekatsinas:sam2016, title = {Forecasting Rare Disease Outbreaks Using Multiple Data Sources}, journal = {STAT ANAL DATA MIN}, year = {2015}, note = {Best of SDM 2015, Special Issue}, chapter = {379}, abstract = {

Rapidly increasing volumes of news feeds from diverse data sources, such as online newspapers, Twitter and online blogs are proving to be extremely valuable resources in helping anticipate, detect, and forecast outbreaks of rare diseases. This paper presents SourceSeer, a novel algorithmic framework that combines spatio-temporal topic models with sourcebased anomaly detection techniques to effectively forecast the emergence and progression of infectious rare diseases. SourceSeer is capable of discovering the location focus of each source allowing sources to be used as experts with varying degrees of authoritativeness. To fuse the individual source predictions into a final outbreak prediction we employ a multiplicative weights algorithm taking into account the accuracy of each source. We evaluate the performance of SourceSeer using incidence data for hantavirus syndromes in multiple countries of Latin America provided by HealthMap over a timespan of fifteen months. We demonstrate that SourceSeer makes predictions of increased accuracy compared to several baselines and is capable of forecasting disease outbreaks in a timely manner even when no outbreaks were previously reported.

}, author = {Theodoros Rekatsinas and Saurav Ghosh and Sumiko Mekaru and Elaine Nsoesie and John Brownstein and Lise Getoor and Naren Ramakrishnan} } @article {kimmig:mlj15, title = {Lifted graphical models: a survey}, journal = {Machine Learning Journal}, volume = {99}, number = {1}, year = {2015}, pages = {1{\textendash}45}, author = {Kimmig, Angelika and Mihalkova, Lilyana and Lise Getoor} } @conference {rekatsinas:sdm15, title = {SourceSeer: Forecasting Rare Disease Outbreaks Using Multiple Data Sources}, booktitle = {2015 SIAM International Conference on Data Mining (SDM15)}, year = {2015}, note = {Best Research Paper Award}, publisher = {SIAM}, organization = {SIAM}, author = {Rekatsinas, Theodoros and Ghosh, Saurav and Mekaru, Sumiko and Nsoesie, Elaine and Brownstein, John and Lise Getoor and Ramakrishnan, Naren} } @conference {farnadi:ilp15, title = {Statistical Relational Learning with Soft Quantifiers}, booktitle = {International Conference on Inductive Logic Programming (ILP)}, year = {2015}, note = {Winner of Best Student Paper award.}, author = {Golnoosh Farnadi and Bach, Stephen H. and Blondeel, Marjon and Moens, Marie-Francine and Lise Getoor and De Cock, Martine} } @article {pujara:aimag15, title = {Using Semantics \& Statistics to Turn Data into Knowledge}, journal = {AI Magazine}, volume = {36}, number = {1}, year = {2015}, pages = {65{\textendash}74}, author = {Pujara, Jay and Miao, Hui and Lise Getoor and Cohen, William} } @conference {ramakrishnan:kdd14, title = {{\textquoteleft}Beating the news{\textquoteright} with EMBERS: Forecasting Civil Unrest using Open Source Indicators}, booktitle = {ACM SIGKDD Conference on Knowledge Discovery and Data Mining}, year = {2014}, abstract = {

We describe the design, implementation, and evaluation of EMBERS, an automated, 24x7 continuous system for forecasting civil unrest across 10 countries of Latin America using open source indicators such as tweets, news sources, blogs, economic indicators, and other data sources. Unlike retrospective studies, EMBERS has been making forecasts into the future since Nov 2012 which have been (and continue to be) evaluated by an independent T\&E team (MITRE). Of note, EMBERS has successfully forecast the uptick and downtick of incidents during the June 2013 protests in Brazil. We outline the system architecture of EMBERS, individual models that leverage specific data sources, and a fusion and suppression engine that supports trading off specific evaluation criteria. EMBERS also provides an audit trail interface that enables the investigation of why specific predictions were made along with the data utilized for forecasting. Through numerous evaluations, we demonstrate the superiority of EMBERS over baserate methods and its capability to forecast significant societal happenings.\ 

}, author = {Ramakrishnan, Naren and Butler, Patrick and Self, Nathan and Khandpur, Rupinder and Saraf, Parang and Wang, Wei and Cadena, Jose and Vullikanti, Anil and Korkmaz, Gizem and Kuhlman, Christopher and Marathe, Achla and Zhao, Liang and Ting, Hua and Huang, Bert and Srinivasan, Aravind and Trinh, Khoa and Lise Getoor and Katz, Graham and Doyle, Andy and Ackermann, Chris and Zavorin, Ilya and Ford, Jim and Summers, Kristin and Fayed, Youssef and Arredondo, Jaime and Gupta, Dipak and Mares, David} } @conference {farnadi:starai14, title = {Extending PSL with Fuzzy Quantifiers}, booktitle = {International Workshop on Statistical Relational Artificial Intelligence (StaRAI)}, year = {2014}, author = {Golnoosh Farnadi and Bach, Stephen H. and Moens, Marie-Francine and Lise Getoor and De Cock, Martine} } @article {kimmig:machinelearning2004, title = {Lifted graphical models: a survey}, journal = {Machine Learning}, year = {2014}, pages = {1-45}, keywords = {First-order probabilistic models, Lifted inference and learning, Par-factor graphs, Probabilistic programming, Statistical relational learning, Templated graphical models}, author = {Kimmig, Angelika and Mihalkova, Lilyana and Lise Getoor} } @conference {moustafa:icde14, title = {Subgraph Pattern Matching over Uncertain Graphs with Identity Linkage Uncertainty}, booktitle = {International Conference on Data Engineering (ICDE)}, year = {2014}, author = {Moustafa, Walaa Eldin and Kimmig, Angelika and Deshpande, Amol and Lise Getoor} } @conference {getoor:kdd13, title = {Entity Resolution in Big Data}, booktitle = {KDD}, year = {2013}, note = {Slides: http://www.umiacs.umd.edu/~getoor/Tutorials/ER_KDD2013.pdf}, abstract = {

Entity resolution (ER), the problem of extracting, matching and resolving entity mentions in structured and unstructured data, is a long-standing challenge in database management, information retrieval, machine learning, natural language processing and statistics. Accurate and fast entity resolution has huge practical implications in a wide variety of commercial, scientific and security domains. Despite the long history of work on entity resolution, there is still a surprising diversity of approaches, and lack of guiding theory. Meanwhile, in the age of big data, the need for high quality entity resolution is growing, as we are inundated with more and more data, all of which needs to be integrated, aligned and matched, before further utility can be extracted. In this tutorial, we bring together perspectives on entity resolution from a variety of fields, including databases, information retrieval, natural language processing and machine learning, to provide, in one setting, a survey of a large body of work. We discuss both the practical aspects and theoretical underpinnings of ER. We describe existing solutions, current challenges and open research problems. In addition to giving attendees a thorough understanding of existing ER models, algorithms and evaluation methods, the tutorial will cover important research topics such as scalable ER, active and lightly supervised ER, and query-driven ER.

}, author = {Lise Getoor and Ashwin Machanavajjhala} } @conference {moustafa:sigmod13, title = {GrDB: A System for Declarative and Interactive Analysis of Noisy Information Networks}, booktitle = {SIGMOD}, year = {2013}, abstract = {

There is a growing interest in methods for analyzing data describing networks of all types, including biological, physical, social, and scientific collaboration networks. Typically the data describing these networks is observational, and thus noisy and incomplete; it is often at the wrong level of fidelity and abstraction for meaningful data analysis. This demonstration presents GrDB, a system that enables data analysts to write declarative programs to specify and combine different network data cleaning tasks, visualize the output, and engage in the process of decision review and correction if necessary. The declarative interface of GrDB makes it very easy to quickly write analysis tasks and execute them over data, while the visual component facilitates debugging the program and performing fine grained corrections.

}, author = {Walaa Moustafa and Hui Miao and Amol Deshpande and Lise Getoor} } @conference {miao:bigdata13, title = {A Hypergraph-Partitioned Vertex Programming Approach for Large-scale Consensus Optimization}, booktitle = {2013 IEEE International Conference on Big Data}, year = {2013}, author = {Miao, Hui and Liu, Xiangyang and Huang, Bert and Lise Getoor} } @conference {pujara:wtbudg13, title = {Joint Judgments with a Budget: Strategies for Reducing the Cost of Inference}, booktitle = {ICML Workshop on Machine Learning with Test-Time Budgets}, year = {2013}, author = {Pujara, Jay and Miao, Hui and Lise Getoor} } @conference {pujara:iswc13, title = {Knowledge Graph Identification}, booktitle = {International Semantic Web Conference (ISWC)}, year = {2013}, note = {Winner of Best Student Paper award}, author = {Pujara, Jay and Miao, Hui and Lise Getoor and Cohen, William} } @conference {pujara:slg13, title = {Large-Scale Knowledge Graph Identification using PSL}, booktitle = {ICML Workshop on Structured Learning (SLG)}, year = {2013}, author = {Pujara, Jay and Miao, Hui and Lise Getoor and Cohen, William} } @conference {pujara:sbd13, title = {Large-Scale Knowledge Graph Identification using PSL}, booktitle = {AAAI Fall Symposium on Semantics for Big Data}, year = {2013}, author = {Pujara, Jay and Miao, Hui and Lise Getoor and Cohen, William} } @conference {pujara:akbc13, title = {Ontology-Aware Partitioning for Knowledge Graph Identification}, booktitle = {CIKM Workshop on Automatic Knowledge Base Construction}, year = {2013}, author = {Pujara, Jay and Miao, Hui and Lise Getoor and Cohen, William} } @conference {moustafa:icde12, title = {Ego-centric Graph Pattern Census}, booktitle = {International Conference on Data Engineering (ICDE)}, year = {2012}, author = {Moustafa, Walaa Eldin and Deshpande, Amol and Lise Getoor} } @conference {getoor:vldb12, title = {Entity Resolution: Theory, Practice \& Open Challenges}, booktitle = {International Conference on Very Large 
Data Bases}, year = {2012}, note = {Slides: http://www.cs.umd.edu/~getoor/Tutorials/ER_VLDB2012.pdf}, author = {Lise Getoor and Machanavajjhala, Ashwin} } @conference {getoor:aaai12t, title = {Entity Resolution: Theory, Practice, and Open Challenges}, booktitle = {AAAI Conference on Artificial Intelligence}, year = {2012}, note = {URL: http://www.cs.umd.edu/projects/linqs/Tutorials/ER-AAAI12/Home.html}, author = {Lise Getoor and Machanavajjhala, Ashwin} } @conference {getoor:asonam12t, title = {Entity Resolution for Social Network Analysis and Mining}, booktitle = {IEEE ACM International Conference on Advances in Social Networks Analysis and Mining}, year = {2012}, note = {URL: http://www.cs.umd.edu/~getoor/Tutorials/ER_ASONAM2012.pdf}, publisher = {IEEE/ACM International Conference on Advances in Social Networks Analysis and Mining}, organization = {IEEE/ACM International Conference on Advances in Social Networks Analysis and Mining}, author = {Lise Getoor and Machanavajjhala, Ashwin} } @conference {memory:ursw12, title = {Graph Summarization in Annotated Data Using Probabilistic Soft Logic}, booktitle = {Proceedings of the International Workshop on Uncertainty Reasoning for the Semantic Web (URSW)}, year = {2012}, author = {Memory, Alex and Kimmig, Angelika and Bach, Stephen H. and Raschid, Louiqa and Lise Getoor} } @conference {chen:wpov11, title = {Active Inference for Retrieval in Camera Networks}, booktitle = {IEEE Workshop on Person-Oriented Vision}, year = {2011}, abstract = {

We address the problem of searching camera network videos to retrieve frames containing specified individuals. We show the benefit of utilizing a learned probabilistic model that captures dependencies among the cameras. In addition, we develop an active inference framework that can request human input at inference time, directing human attention to the portions of the videos whose correct annotation would provide the biggest performance improvements. Our primary contribution is to show that by mapping video frames in a camera network onto a graphical model, we can apply collective classification and active inference algorithms to significantly increase the performance of the retrieval system, while minimizing the number of human annotations required.

}, author = {Chen, Daozheng and Bilgic, Mustafa and Lise Getoor and Jacobs, David and Mihalkova, Lilyana and Yeh, Tom} } @conference {sharara:ijcai11, title = {Active Surveying: A Probabilistic Approach for Identifying Key Opinion Leaders}, booktitle = {IJCAI}, year = {2011}, abstract = {

Opinion leaders play an important role in influencing people{\textquoteright}s beliefs, actions and behaviors. Although a number of methods have been proposed for identifying influentials using secondary sources of information, the use of primary sources, such as surveys, is still favored in many domains. In this work we present a new surveying method which combines secondary data with partial knowledge from primary sources to guide the information gathering process. We apply our proposed active surveying method to the problem of identifying key opinion leaders in the medical field, and show how we are able to accurately identify the opinion leaders while minimizing the amount of primary data required, which results in significant cost reduction in data acquisition without sacrificing its integrity.

}, author = {Sharara, Hossam and Lise Getoor and Norton, Myra} } @conference {moustafa:gdm11, title = {Declarative Analysis of Noisy Information Networks}, booktitle = {ICDE Workshop on GDM}, year = {2011}, abstract = {

There is a growing interest in methods for analyzing data describing networks of all types, including information, biological, physical, and social networks. Typically the data describing these networks is observational, and thus noisy and incomplete; it is often at the wrong level of fidelity and abstraction for meaningful data analysis. This has resulted in a growing body of work on extracting, cleaning, and annotating network data. Unfortunately, much of this work is ad hoc and domain-specific. In this paper, we present the architecture of a data management system that enables efficient, declarative analysis of large-scale information networks. We identify a set of primitives to support the extraction and inference of a network from observational data, and describe a framework that enables a network analyst to easily implement and combine new extraction and analysis techniques, and efficiently apply them to large observation networks. The key insight behind our approach is to decouple, to the extent possible, (a) the operations that require traversing the graph structure (typically the computationally expensive step), from (b) the operations that do the modification and update of the extracted network. We present an analysis language based on Datalog, and show how to use it to cleanly achieve such decoupling. We briefly describe our prototype system that supports these abstractions. We include a preliminary performance evaluation of the system and show that our approach scales well and can efficiently handle a wide spectrum of data cleaning operations on network data.

}, author = {Moustafa, Walaa and Namata, Galileo and Deshpande, Amol and Lise Getoor} } @article {chen:pami11, title = {Dynamic Processing Allocation in Video}, journal = {PAMI}, volume = {33}, number = {11}, year = {2011}, pages = {2174-2187}, abstract = {

Large stores of digital video pose severe computational challenges to existing video analysis algorithms. In applying these algorithms, users must often trade off processing speed for accuracy, as many sophisticated and effective algorithms require large computational resources that make it impractical to apply them throughout long videos. One can save considerable effort by applying these expensive algorithms sparingly, directing their application using the results of more limited processing. We show how to do this for retrospective video analysis by modeling a video using a chain graphical model and performing inference both to analyze the video and to direct processing. We apply our method to problems in background subtraction and face detection, and show in experiments that this leads to significant improvements over baseline algorithms.

}, author = {Chen, Daozheng and Bilgic, Mustafa and Lise Getoor and Jacobs, David} } @article {getoor:sdm11tutorial, title = {Exploiting Statistical and Relational Information on the Web and in Social Media}, year = {2011}, keywords = {Statistical relational learning, social media, tutorial, web}, author = {Lise Getoor and Mihalkova, Lilyana} } @conference {minton:cmla11, title = {Improving Classifier Performance by Autonomously Collecting Background Knowledge from the Web}, booktitle = {Tenth International Conference on Machine Learning and Applications}, year = {2011}, author = {Minton, Steve and Michelson, Matthew and See, Kane and Macskassy, Sofus and Gazen, Bora C. and Lise Getoor} } @conference {mihalkova:wsdm-wkshop11, title = {Learning to Predict Web Collaborations}, booktitle = {WSDM Workshop on User Modeling for Web Applications}, year = {2011}, author = {Mihalkova, Lilyana and Moustafa, Walaa Eldin and Lise Getoor} } @unpublished {mihalkova:arxive11, title = {Lifted Graphical Models: A Survey}, year = {2011}, note = {Arxiv preprint arXiv:1107.4966v2}, author = {Mihalkova, Lilyana and Lise Getoor} } @article {sharara:snam10, title = {Understanding Actor Loyalty to Event-Based Groups in Affiliation Networks}, journal = {Journal of Advances in Social Networks Analysis and Mining}, volume = {1}, number = {2}, year = {2011}, month = apr, pages = {115{\textendash}126}, author = {Sharara, Hossam and Singh, Lisa and Lise Getoor and Mann, Janet} } @conference {bilgic:icml10, title = {Active Learning for Networked Data}, booktitle = {Proceedings of the 27th International Conference on Machine Learning (ICML-10)}, year = {2010}, author = {Bilgic, Mustafa and Mihalkova, Lilyana and Lise Getoor} } @conference {broecheler:uai10, title = {Probabilistic Similarity Logic}, booktitle = {Conference on Uncertainty in Artificial Intelligence}, year = {2010}, author = {Broecheler, Matthias and Mihalkova, Lilyana and Lise Getoor} } @conference {zheleva:www10, title = 
{Statistical Models of Music-listening Sessions in Social Media}, booktitle = {19th International World Wide Web Conference (WWW)}, year = {2010}, author = {Zheleva, Elena and Guiver, John and Mendes Rodrigues, Eduarda and Milic-Frayling, Natasa} } @article {Polymeropoulos:SchizRes09, title = {Common effect of antipsychotics on the biosynthesis and regulation of fatty acids and cholesterol supports a key role of lipid homeostasis in schizophrenia}, journal = {Schizophrenia Research}, year = {2009}, keywords = {bioinformatics, gene expression analysis, antipsychotic pharmacogenetics}, author = {Polymeropoulos, Mihael and Licamele, Louis and Volpi, Simona and Mack, Kendra and Mitkus, Shruti and Carstea, Eugene and Lise Getoor and Lavedan, Christian} } @inproceedings {sharara:asonam09, title = {The Dynamics of Actor Loyalty to Groups in Affiliation Networks}, booktitle = {International Conference on Advances in Social Networks Analysis and Mining}, year = {2009}, month = jul, author = {Sharara, Hossam and Singh, Lisa and Lise Getoor and Mann, Janet} } @incollection {islamaj:fga-book07, title = {A Feature Generation Algorithm with Applications to Biological Sequence Classification}, booktitle = {Computational Methods of Feature Selection}, volume = {1}, year = {2008}, pages = {355--376}, publisher = {Chapman and Hall/CRC Press}, edition = {1}, chapter = {18}, author = {Rezarta Islamaj and Lise Getoor and John Wilbur}, editor = {Huan Liu and Hiroshi Motoda} } @article {dietterich:ml08, title = {Structured machine learning: the next ten years}, journal = {Machine Learning}, volume = {73}, number = {1}, year = {2008}, note = {Full version is available at http://dx.doi.org/10.1007/s10994-008-5079-1}, pages = {3{\textendash}23}, author = {Dietterich, Thomas and Domingos, Pedro and Lise Getoor and Muggleton, Stephen and Tadepalli, Prasad} } @article {islamaj:bmc07, title = {Features generated for computational splice-site prediction 
correspond to functional elements}, journal = {BMC Bioinformatics}, volume = {8}, number = {410}, year = {2007}, note = {Electronic version is available at http://www.biomedcentral.com/1471-2105/8/410}, month = oct, keywords = {feature generation, functional biological signals, splice-site}, author = {Islamaj, Rezarta and Lise Getoor and Wilbur, W. John and Mount, Stephen} } @unpublished {udrea:iswc07, title = {{HOMER}: Ontology Alignment Visualization and Analysis}, year = {2007}, note = {Unpublished manuscript}, author = {Udrea, Octavian and Lise Getoor and Miller, Renee} } @inproceedings {udrea:iswc07-demo, title = {{HOMER}: Ontology visualization and analysis}, booktitle = {Demo Presentation at International Semantic Web Conference (ISWC)}, year = {2007}, author = {Udrea, Octavian and Miller, Renee and Lise Getoor} } @inproceedings {udrea:sigmod07, title = {Leveraging Data and Structure in Ontology Integration}, booktitle = {Proceedings of ACM-SIGMOD 2007 International Conference on Management of Data}, year = {2007}, pages = {449{\textendash}460}, author = {Udrea, Octavian and Lise Getoor and Miller, Renee} } @article {islamaj:nar07, title = {{SplicePort} - An interactive splice-site analysis tool}, journal = {Nucleic Acids Research}, year = {2007}, author = {Islamaj, Rezarta and Lise Getoor and Wilbur, W. John and Mount, Stephen} } @incollection {getoor:lbc-book-ch05, title = {Link-based Classification}, booktitle = {Advanced Methods for Knowledge Discovery from Complex Data}, volume = {1}, year = {2005}, pages = {189--207}, publisher = {Springer-Verlag}, edition = {1}, chapter = {7}, abstract = {

A key challenge for machine learning is the problem of mining richly structured data sets, where the objects are linked in some way due to either an explicit or implicit relationship that exists between the objects. Links among the objects demonstrate certain patterns, which can be helpful for many machine learning tasks and are usually hard to capture with traditional statistical models. Recently there has been a surge of interest in this area, fuelled largely by interest in web and hypertext mining, but also by interest in mining social networks, bibliographic citation data, epidemiological data and other domains best described using a linked or graph structure. In this chapter we propose a framework for modeling link distributions, a link-based model that supports discriminative models describing both the link distributions and the attributes of linked objects. We use a structured logistic regression model, capturing both content and links. We systematically evaluate several variants of our link-based model on a range of data sets including both web and citation collections. In all cases, the use of the link distribution improves classification performance.

}, author = {Lise Getoor}, editor = {Ujjwal Maulik and Lawrence Holder and Diane Cook} } @inproceedings {lerman:sigmod04, title = {Using the Structure of Web Sites for Automatic Segmentation of Tables}, booktitle = {Proceedings of ACM-SIGMOD 2004 International Conference on Management of Data}, year = {2004}, author = {Lerman, Kristina and Lise Getoor and Minton, Steve and Knoblock, Craig} }