%% This BibTeX bibliography file was created using BibDesk. %% http://bibdesk.sourceforge.net/ %% Created for Csaba at 2013-03-29 18:27:53 -0600 %% Saved with string encoding Unicode (UTF-8) @inproceedings{BiSzaSze07, Abstract = {When data is scarce or the alphabet is large, smoothing the probability estimates becomes inescapable when estimating n-gram models. In this paper we propose a method that implements a form of smoothing by exploiting similarity information of the alphabet elements. The idea is to view the log-conditional probability function as a smooth function defined over the similarity graph. The algorithm that we propose uses the eigenvectors of the similarity graph as the basis of the expansion of the log conditional probability function whose coefficients are found by solving a regularized logistic regression problem. The experimental results demonstrate the superiority of the method when the similarity graph contains relevant information, whilst the method still remains competitive with state-of-the-art smoothing methods even in the lack of such information.}, Author = {B{\'\i}r{\'o}, I. and Szamonek, Z. and and Szepesv{\'a}ri, {Cs}.}, Booktitle = {IJCAI}, Date-Added = {2013-03-29 18:12:09 -0600}, Date-Modified = {2013-03-29 18:19:39 -0600}, Keywords = {sequence prediction, natural language processing, spectral graph theory, smoothing, learning basis functions}, Pages = {1576--1581}, Pdf = {papers/sequence-ijcai07.pdf}, Title = {Sequence prediction exploiting similarity information}, Year = {2007}} @inproceedings{AfGyBoSze13, Abstract = {We consider the problem of simultaneously learning to linearly combine a very large number of kernels and learn a good predictor based on the learnt kernel. When the number of kernels d to be combined is very large, multiple kernel learning methods whose computational cost scales linearly in d are intractable. We propose a randomized version of the mirror descent algorithm to overcome this issue, under the objective of minimizing the group p-norm penalized empirical risk. The key to achieve the required exponential speed-up is the computationally efficient construction of low-variance estimates of the gradient. We propose importance sampling based estimates, and find that the ideal distribution samples a coordinate with a probability proportional to the magnitude of the corresponding gradient. We show the surprising result that in the case of learning the coefficients of a polynomial kernel, the combinatorial structure of the base kernels to be combined allows the implementation of sampling from this distribution to run in O(log(d)) time, making the total computational cost of the method to achieve an epsilon-optimal solution to be O(log(d)/epsilon^2), thereby allowing our method to operate for very large values of d. Experiments with simulated and real data confirm that the new algorithm is computationally more efficient than its state-of-the-art alternatives.}, Author = {Afkanpour, A. and Gy{\"o}rgy, A. and Bowling, M. H. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ICML}, Date = {2013-06}, Date-Added = {2013-03-11 21:19:26 -0600}, Date-Modified = {2013-03-11 21:29:23 -0600}, Keywords = {theory, RKHS, kernels, optimization, multikernel learning, stochastic gradient methods, coordinate descent, mirror descent}, Month = {June}, Pdf = {papers/mkl_icml2013.pdf}, Title = {A Randomized Mirror Descent Algorithm for Large Scale Multiple Kernel Learning}, Year = {2013}} @inproceedings{YuChSchSz13, Abstract = {The representer theorem assures that kernel methods retain optimality under penalized empirical risk minimization. While a sufficient condition on the form of the regularizer guaranteeing the representer theorem has been known since the initial development of kernel methods, necessary conditions have only been investigated recently. In this paper we completely characterize the necessary and sufficient conditions on the regularizer that ensure the representer theorem holds. The results are surprisingly simple yet broaden the conditions where the representer theorem is known to hold. Extension to the matrix domain is also addressed. }, Author = {Yu, Y.-L. and Chen, H. and Schuurmans, D. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ICML}, Date = {2013-06}, Date-Added = {2013-03-11 21:05:11 -0600}, Date-Modified = {2013-03-11 21:23:39 -0600}, Keywords = {theory, representer theorem, RKHS, kernels}, Month = {June}, Pdf = {papers/repthm_icml13.pdf}, Title = {Characterizing the Representer Theorem}, Year = {2013}} @incollection{NIPS2012_0424, Abstract = {The task of image auto-annotation, namely assigning a set of relevant tags to an image, is challenging due to the size and variability of tag vocabularies. Consequently, most existing algorithms focus on tag assignment and fix an often large number of hand-crafted features to describe image characteristics. In this paper we introduce a hierarchical model for learning representations of standard sized color images from the pixel level, removing the need for engineered feature representations and subsequent feature selection for annotation. We benchmark our model on the STL-10 recognition dataset, achieving state-of-the-art performance. When our features are combined with TagProp (Guillaumin et al.), we compete with or outperform existing annotation approaches that use over a dozen distinct handcrafted image descriptors. Furthermore, using 256-bit codes and Hamming distance for training TagProp, we exchange only a small reduction in performance for efficient storage and fast comparisons. Self-taught learning is used in all of our experiments and deeper architectures always outperform shallow ones.}, Annote = {The task of image auto-annotation, namely assigning a set of relevant tags to an image, is challenging due to the size and variability of tag vocabularies. Consequently, most existing algorithms focus on tag assignment and fix an often large number of hand-crafted features to describe image characteristics. In this paper we introduce a hierarchical model for learning representations of standard sized color images from the pixel level, removing the need for engineered feature representations and subsequent feature selection for annotation. We benchmark our model on the STL-10 recognition dataset, achieving state-of-the-art performance. When our features are combined with TagProp (Guillaumin et al.), we compete with or outperform existing annotation approaches that use over a dozen distinct handcrafted image descriptors. Furthermore, using 256-bit codes and Hamming distance for training TagProp, we exchange only a small reduction in performance for efficient storage and fast comparisons. Self-taught learning is used in all of our experiments and deeper architectures always outperform shallow ones.}, Author = {Kiros, R. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {NIPS}, Date = {2012-12}, Date-Added = {2012-12-04 09:46:13 -0800}, Date-Modified = {2013-03-11 21:32:43 -0600}, Keywords = {deep learning, image processing}, Month = {December}, Pages = {917--925}, Title = {Deep Representations and Codes for Image Auto-Annotation}, Url = {http://books.nips.cc/papers/files/nips25/NIPS2012_0424.pdf}, Year = {2012}, Bdsk-Url-1 = {http://books.nips.cc/papers/files/nips25/NIPS2012_0424.pdf}} @inproceedings{BarSze1206, Abstract = {We present a new anytime algorithm that achieves near-optimal regret for any instance of finite stochastic partial monitoring. In particular, the new algorithm achieves the minimax regret, within logarithmic factors, for both "easy" and "hard" problems. For easy problems, it additionally achieves logarithmic individual regret. Most importantly, the algorithm is adaptive in the sense that if the opponent strategy is in an "easy region" of the strategy space then the regret grows as if the problem was easy. As an implication, we show that under some reasonable additional assumptions, the algorithm enjoys an O(T^{1/2}) regret in Dynamic Pricing, proven to be hard by Bartok et al. (2011).}, Author = {Bart{\'o}k, G. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ALT}, Date = {2012-07}, Date-Added = {2012-07-11 12:44:18 +0200}, Date-Modified = {2012-12-04 09:51:13 -0800}, Keywords = {partial information, online learning, stochastic partial monitoring, theory}, Month = {October}, Pages = {305--319}, Pdf = {papers/adaptive_sideinfo.pdf}, Title = {Partial monitoring with side information}, Year = {2012}} @inproceedings{YaoSze12, Abstract = {In this paper we consider the problem of finding a good policy given some batch data. We propose a new approach, LAM-API, that first builds a so-called linear action model (LAM) from the data and then uses the learned model and the collected data in approximate policy iteration (API) to find a good policy. A natural choice for the policy evaluation step in this algorithm is to use least-squares temporal difference (LSTD) learning algorithm. Empirical results on three benchmark problems show that this particular instance of LAM-API performs competitively as compared with LSPI, both from the point of view of data and computational efficiency.}, Author = {Yao, H. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {AAAI-2012}, Date = {2012-07}, Date-Added = {2012-06-06 14:33:03 -0600}, Date-Modified = {2012-06-06 21:32:40 -0600}, Keywords = {reinforcement learning, Markov Decision Processes,function approximation, control, planning, control learning, temporal difference learning, LSTD}, Month = {July}, Pdf = {papers/lamapi.pdf}, Title = {Approximate Policy Iteration with Linear Action Models}, Year = {2012}} @incollection{Sze12, Abstract = {The problem of estimating the value function underlying a Markovian reward process is considered. As it is well known, the value function underlying a Markovian reward process satisfied a linear fixed point equation. One approach to learning the value function from finite data is to find a good approximation to the value function in a given (linear) subspace of the space of value functions. We review some of the issues that arise when following this approach, as well as some results that characterize the finite-sample performance of some of the algorithms.}, Address = {Oberwolfach}, Author = {Szepesv{\'a}ri, {Cs}.}, Booktitle = {Mini-Workshop: Mathematics of Machine Learning}, Date = {2011-08}, Date-Added = {2012-06-03 14:57:46 -0600}, Date-Modified = {2013-03-11 21:17:16 -0600}, Keywords = {reinforcement learning, linear prediction, theory, inverse problems, LSTD}, Number = {3}, Pages = {2385--2388}, Pdf = {papers/Oberwolfach-Report.pdf}, Publisher = {European Mathematical Society}, Series = {Oberwolfach Reports}, Title = {Least Squares Temporal Difference Learning and {G}alerkin's Method}, Volume = {8}, Year = {2011}} @inproceedings{PiSze1206, Abstract = {Motivated by value function estimation in reinforcement learning, we study statistical linear inverse problems, i.e., problems where the coefficients of a linear system to be solved are observed in noise. We consider penalized estimators, where performance is evaluated using a matrix-weighted two-norm of the defect of the estimator measured with respect to the true, unknown coefficients. Two objective functions are considered de- pending whether the error of the defect measured with respect to the noisy coefficients is squared or unsquared. We propose simple, yet novel and theoretically well-founded data-dependent choices for the regularization parameters for both cases that avoid data-splitting. A distinguishing feature of our analysis is that we derive deterministic error bounds in terms of the error of the coefficients, thus allowing the complete separation of the analysis of the stochastic properties of these errors. We show that our results lead to new insights and bounds for linear value function estimation in reinforcement learning.}, Author = {Pires, B.A. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ICML}, Date = {2012-06}, Date-Added = {2012-06-03 14:48:14 -0600}, Date-Modified = {2012-06-06 21:34:07 -0600}, Keywords = {theory, linear prediction, reinforcement learning, LSTD, performance bounds, theory, inverse problems}, Month = {June}, Pdf = {papers/linear_estimation_icml2012_extended.pdf}, Title = {Statistical linear estimation with penalized estimators: an application to reinforcement learning}, Year = {2012}} @inproceedings{YuSze1206, Abstract = {In real supervised learning scenarios, it is not uncommon that the training and test sample follow different probability distributions, thus rendering the necessity to correct the sampling bias. Focusing on a particular co- variate shift problem, we derive high probability confidence bounds for the kernel mean matching (KMM) estimator, whose convergence rate turns out to depend on some regularity measure of the regression function and also on some capacity measure of the kernel. By comparing KMM with the natural plug-in estimator, we establish the superiority of the former hence provide concrete evidence/understanding to the effectiveness of KMM under covariate shift.}, Author = {Yu, Y.-L. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ICML}, Date = {2012-06}, Date-Added = {2012-06-03 14:46:56 -0600}, Date-Modified = {2013-03-11 21:15:36 -0600}, Keywords = {theory, nonparametrics, covariate shift}, Month = {June}, Pdf = {papers/covshiftkmm.pdf}, Title = {Analysis of Kernel Mean Matching under Covariate Shift}, Year = {2012}} @inproceedings{BarZolSze1206, Abstract = {We present a new anytime algorithm that achieves near-optimal regret for any instance of finite stochastic partial monitoring. In particular, the new algorithm achieves the minimax regret, within logarithmic factors, for both "easy" and "hard" problems. For easy problems, it additionally achieves logarithmic individual regret. Most importantly, the algorithm is adaptive in the sense that if the opponent strategy is in an "easy region" of the strategy space then the regret grows as if the problem was easy. As an implication, we show that under some reasonable additional assumptions, the algorithm enjoys an O(T^{1/2}) regret in Dynamic Pricing, proven to be hard by Bartok et al. (2011).}, Author = {Bart{\'o}k, G. and Zolghadr, N. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ICML}, Date = {2012-06}, Date-Added = {2012-06-03 14:44:57 -0600}, Date-Modified = {2012-06-06 21:33:10 -0600}, Keywords = {partial information, online learning, stochastic partial monitoring, minimax bounds, theory}, Month = {June}, Pages = {1--20}, Pdf = {papers/adaptive_partmon_full.pdf}, Title = {An adaptive algorithm for finite stochastic partial monitoring (extended version)}, Year = {2012}} @article{AfGySzBo12, Abstract = {We consider the problem of learning a predictor by combining possibly infinitely many linear predictors whose weights are to be learned, too, an instance of multiple kernel learning. To control overfitting a group p-norm penalty is used to penalize the empirical loss. We consider a reformulation of the problem that lets us implement a randomized version of the proximal point algorithm. The key idea of the new algorithm is to use randomized computation to alleviate the problem of dealing with possibly uncountably many predictors. Finite-time performance bounds are derived that show that under mild conditions the method finds the optimum of the penalized criterion in an efficient manner. Experimental results confirm the effectiveness of the new algorithm.}, Author = {Afkanpour, A. and Gy{\"o}rgy, A. and Szepesv{\'a}ri, {Cs}. and Bowling, M. H.}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Date = {2012-05}, Date-Added = {2012-06-03 14:42:11 -0600}, Date-Modified = {2012-06-06 21:24:56 -0600}, Ee = {http://arxiv.org/abs/1205.0288}, Journal = {CoRR}, Keywords = {multikernel learning, optimization, theory}, Month = {May}, Pdf = {papers/randomized-mkl.pdf}, Title = {A Randomized Strategy for Learning to Combine Many Features}, Volume = {abs/1205.0288}, Year = {2012}} @article{GKSSSST12, Abstract = {The ancient oriental game of Go has long been considered a grand challenge for artificial intelligence. For decades, computer Go has defied the classical methods in game tree search that worked so successfully for chess and checkers. How- ever, recent play in computer Go has been transformed by a new paradigm for tree search based on Monte-Carlo methods. Programs based on Monte-Carlo tree search now play at human-master levels and are beginning to challenge top professional players. In this paper we describe the leading algorithms for Monte-Carlo tree search and explain how they have advanced the state of the art in computer Go.}, Author = {Gelly, S. and Kocsis, L. and Schoenauer, M. and Sebag, M. and Silver, D. and Szepesv{\'a}ri, {Cs}. and Teytaud, O.}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Date = {2012-03}, Date-Added = {2012-06-03 14:31:08 -0600}, Date-Modified = {2012-06-06 21:29:23 -0600}, Ee = {http://doi.acm.org/10.1145/2093548.2093574}, Journal = {Communications of the {ACM}}, Keywords = {Monte-Carlo tree search, UCT, Game of Go}, Number = {3}, Pages = {106--113}, Pdf = {papers/CACM-MCTS.pdf}, Title = {The grand challenge of computer {G}o: {M}onte {C}arlo tree search and extensions}, Volume = {55}, Year = {2012}} @inproceedings{AYPSze12, Abstract = {We introduce a novel technique, which we call online-to-confidence-set conversion. The technique allows us to construct high-probability confidence sets for linear prediction with correlated inputs given the predictions of any algorithm (e.g., online LASSO, exponentiated gradient algorithm, online least-squares, p-norm algorithm) targeting online learning with linear predictors and the quadratic loss. By construction, the size of the confidence set is directly governed by the regret of the online learning algorithm. Constructing tight confidence sets is interesting on its own, but the new technique is given extra weight by the fact having access tight confidence sets underlies a number of important problems. The advantage of our construction here is that progress in constructing better algorithms for online prediction problems directly translates into tighter confidence sets. In this paper, this is demonstrated in the case of linear stochastic bandits. In particular, we introduce the sparse variant of linear stochastic bandits and show that a recent online algorithm together with our online-to-confidence-set conversion allows one to derive algorithms that can exploit if the reward is a function of a sparse linear combination of the components of the chosen action.}, Author = {Abbasi-Yadkori, Y. and P{\'a}l, D. and Szepesv{\'a}ri, {Cs}.}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {AISTAT}, Date = {2012-04}, Date-Added = {2012-06-03 14:18:30 -0600}, Date-Modified = {2012-06-06 21:28:32 -0600}, Ee = {http://jmlr.csail.mit.edu/proceedings/papers/v22/abbasi-yadkori12/abbasi-yadkori12.pdf}, Keywords = {bandits, stochastic bandits, theory, online learning, linear bandits}, Pages = {1--9}, Pdf = {papers/online-to-confidenceset.pdf}, Title = {Online-to-Confidence-Set Conversions and Application to Sparse Stochastic Bandits}, Year = {2012}} @inproceedings{NeGySz12, Abstract = {We consider online learning in a special class of episodic Markovian decision processes, namely, loop-free stochastic shortest path problems. In this problem, an agent has to traverse through a finite directed acyclic graph with random transitions while maximizing the obtained rewards along the way. We assume that the reward function can change arbitrarily between consecutive episodes, and is entirely revealed to the agent at the end of each episode. Previous work was concerned with the case when the stochastic dynamics is known ahead of time, whereas the main novelty of this paper is that this assumption is lifted. We propose an algorithm called "follow the perturbed optimistic policy" that combines ideas from the "follow the perturbed leader" method for online learning of arbitrary sequences and "upper confidence reinforcement learning", an algorithm for regret minimization in Markovian decision processes (with a fixed reward function). We prove that the expected cumulative regret of our algorithm is of order L|X| |A| T^{1/2} up to logarithmic factors, where L is the length of the longest path in the graph, X is the state space, A is the action space and T is the number of episodes. To our knowledge this is the first algorithm that learns and controls stochastic and adversarial components in an online fashion at the same time.}, Author = {Neu, G. and Gy{\"o}rgy, A. and Szepesv{\'a}ri, {Cs}.}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {AISTAT}, Date = {2012-04}, Date-Added = {2012-06-03 14:16:50 -0600}, Date-Modified = {2012-06-06 21:26:47 -0600}, Ee = {http://jmlr.csail.mit.edu/proceedings/papers/v22/neu12.html}, Keywords = {online learning, adversarial setting, finite MDPs, shortest path problem, theory}, Month = {April}, Pages = {805--813}, Pdf = {papers/neu12.pdf}, Title = {The adversarial stochastic shortest path problem with unknown transition probabilities}, Year = {2012}} @article{AnBaSze11, Abstract = {We consider online learning in partial-monitoring games against an oblivious adversary. We show that when the number of actions available to the learner is two and the game is nontrivial then it is reducible to a bandit-like game and thus the minimax regret is Theta(T^{1/2}).}, Author = {Antos, A. and Bart{\'o}k, G. and Szepesv{\'a}ri, {Cs}.}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Date = {2012-06}, Date-Added = {2012-06-03 14:04:57 -0600}, Date-Modified = {2012-06-06 21:29:55 -0600}, Ee = {http://arxiv.org/abs/1108.4961}, Journal = {CoRR}, Keywords = {minimax bounds, bandits, partial information, online learning, theory}, Pdf = {papers/twoarmed.pdf}, Title = {Non-trivial two-armed partial-monitoring games are bandits}, Volume = {abs/1108.4961}, Year = {2011}} @proceedings{KiSzeUkZeu11, Address = {Espoo, Finland}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {Proceedings of Algorithmic Learning Theory - 22nd International Conference ({ALT} 2011)}, Date = {2012-06}, Date-Added = {2012-06-03 14:02:13 -0600}, Date-Modified = {2012-06-03 14:27:12 -0600}, Editor = {Kivinen, J. and {Sz}epesv{\'a}ri, {Cs}. and Ukkonen, E. and Zeugmann, T.}, Ee = {http://dx.doi.org/10.1007/978-3-642-24412-4}, Isbn = {978-3-642-24411-7}, Keywords = {learning theory}, Month = {October}, Publisher = {Springer}, Series = {Lecture Notes in Computer Science}, Title = {Proceedings of Algorithmic Learning Theory - 22nd International Conference ({ALT} 2011)}, Volume = {6925}, Year = {2011}} @inproceedings{AYPSz11, Abstract = {We improve the theoretical analysis and empirical performance of algorithms for the stochastic multi-armed bandit problem and the linear stochastic multi-armed bandit problem. In particular, we show that a simple modification of Auer's UCB algorithm (Auer, 2002) achieves with high probability constant regret. More importantly, we modify and, consequently, improve the analysis of the algorithm for the for linear stochastic bandit problem studied by Auer (2002), Dani et al. (2008), Rusmevichientong and Tsitsiklis (2010), Li et al. (2010). Our modification improves the regret bound by a logarithmic factor, though experiments show a vast improvement. In both cases, the improvement stems from the construction of smaller confidence sets. For their construction we use a novel tail inequality for vector-valued martingales.}, Author = {Abbasi-Yadkori, Y. and P{\'a}l, D. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {NIPS}, Date = {2011-09}, Date-Added = {2011-09-17 17:51:09 -0600}, Date-Modified = {2012-06-03 14:39:47 -0600}, Keywords = {bandits, stochastic bandits, theory, linear bandits, online learning}, Month = {December}, Pages = {2312--2320}, Pdf = {papers/linear-bandits-NIPS2011.pdf}, Title = {Improved Algorithms for Linear Stochastic Bandits (extended version)}, Year = {2011}} @article{FaSze11, Abstract = {We analyze the rate of convergence of the estimation error in regularized least-squares regression when the data is exponentially beta-mixing. The results are proven under the assumption that the metric entropy of the balls in the chosen function space grows at most polynomially. In order to prove our main result, we also derive a relative deviation concentration inequality for beta-mixing processes, which might be of independent interest. The other major techniques that we use are the independent-blocks technique and the peeling device. An interesting aspect of our analysis is that in order to obtain fast rates we have to make the block sizes dependent on the layer of peeling. With this approach, up to a logarithmic factor, we recover the optimal minimax rates available for the i.i.d. case. In particular, our rate asymptotically matches the optimal rate of convergence when the regression function belongs to a Sobolev space.}, Author = {Farahmand, A.{m}. and Szepesv{\'a}ri, {Cs}.}, Date = {2011-08}, Date-Added = {2011-08-09 16:51:54 -0600}, Date-Modified = {2012-06-03 14:14:18 -0600}, Journal = {Journal of Statistical Planning and Inference}, Keywords = {theory, nonparametrics, regression, mixing}, Month = {February}, Number = {2}, Pages = {493--505}, Pdf = {papers/RegularizedRegressionWithMixing.pdf}, Title = {Regularized Least-Squares Regression: Learning from a beta-mixing Sequence}, Volume = {142}, Year = {2012}, Bdsk-Url-1 = {http://dx.doi.org/10.1007/s10994-006-6888-8}} @article{anszemu:mlj07, Abstract = { In this paper we consider the problem of finding a near-optimal policy in a continuous space, discounted Markovian Decision Problem (MDP) by employing value-function-based methods when only a single trajectory of a fixed policy is available as the input. We study a policy-iteration algo- rithm where the iterates are obtained via empirical risk minimization with a risk function that penalizes high magnitudes of the Bellman-residual. Our main result is a finite-sample, high-probability bound on the performance of the computed policy that depends on the mixing rate of the trajectory, the capacity of the function set as measured by a novel capacity concept (the VC-crossing dimension), the approximation power of the function set and the controllability properties of the MDP. Moreover, we prove that when a linear parameterization is used the new algorithm is equivalent to Least-Squares Policy Iteration. To the best of our knowledge this is the first theoretical result for off-policy control learning over continuous state-spaces using a single trajectory.}, Author = {Antos, A. and Szepesv{\'a}ri, {Cs}. and Munos, R.}, Date-Added = {2011-07-26 09:54:32 -0600}, Date-Modified = {2011-07-26 10:03:59 -0600}, Doi = {10.1007/s10994-007-5038-2}, Editor = {H.U. Simon, G. Lugosi, A. Blum}, Journal = {Machine Learning}, Keywords = {reinforcement learning, nonparametrics, theory, function approximation, policy iteration}, Month = apr, Note = {Published Online First: 14 Nov, 2007}, Number = {1}, Pages = {89--129}, Pdf = {papers/sapi_mlj.pdf}, Title = {Learning near-optimal policies with {B}ellman-residual minimization based fitted policy iteration and a single sample path}, Url = {http://www.springerlink.com/content/h46m1152288681jt/}, Url2 = {http://springer.r.delivery.net/r/r?2.1.Ee.2Tp.1gRdFJ.BwJPwi..N.ErIo.2yDO.LBOEfA00}, Url3 = {http://www.szit.bme.hu/~antos/ps/anszmu_sapi_mlj.ps.gz}, Volume = {71}, Year = {2008}, Bdsk-Url-1 = {http://www.springerlink.com/content/h46m1152288681jt/}} @inproceedings{FaSzePi11, Abstract = {Bayesian priors offer a compact yet general means of incorporating domain knowledge into many learning tasks. The correctness of the Bayesian analysis and inference, however, largely depends on accuracy and correctness of these priors. PAC-Bayesian methods over- come this problem by providing bounds that hold regardless of the correctness of the prior distribution. This paper introduces the first PAC-Bayesian bound for the batch reinforce- ment learning problem with function approx- imation. We show how this bound can be used to perform model-selection in a trans- fer learning scenario. Our empirical results confirm that PAC-Bayesian policy evaluation is able to leverage prior distributions when they are informative and, unlike standard Bayesian RL approaches, ignore them when they are misleading.}, Author = {Fard, M.M. and Pineau, J. and Szepesv{\'a}ri, {Cs}.}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {UAI}, Crossref = {UAI2011}, Date = {2011-07}, Date-Added = {2011-07-03 21:16:14 -0600}, Date-Modified = {2012-06-03 14:11:41 -0600}, Keywords = {reinforcement learning, theory, function approximation, transfer learning}, Month = {July}, Pages = {195--202}, Pdf = {papers/PacBayesUAI.pdf}, Title = {{PAC}-Bayesian Policy Evaluation for Reinforcement Learning}, Year = {2011}} @article{farahmand2010, Abstract = {(This version is identical to the MLJ version except that in the proof of Theorem 2 a minor issue in the proof is corrected.) We consider the problem of model selection in the batch (offline, non-interactive) rein- forcement learning setting when the goal is to find an action-value function with the smallest Bellman error among a countable set of candidates functions. We propose a complexity regularization-based model selection algorithm, BErMin, and prove that it enjoys an oracle-like property: the estimator's error differs from that of an oracle, who selects the candidate with the minimum Bell- man error, by only a constant factor and a small remainder term that vanishes at a parametric rate as the number of samples increases. As an application, we consider a problem when the true action-value function belongs to an unknown member of a nested sequence of function spaces. We show that under some additional technical conditions BErMin leads to a procedure whose rate of convergence, up to a constant factor, matches that of an oracle who knows which of the nested function spaces the true action-value function belongs to, i.e., the procedure achieves adaptivity.}, Author = {Farahmand, A.{m}. and Szepesv{\'a}ri, {Cs}.}, Date = {2011-07}, Date-Added = {2011-07-03 21:08:30 -0600}, Date-Modified = {2012-06-03 14:11:12 -0600}, Doi = {10.1007/s10994-011-5254-7}, Journal = {Machine Learning Journal}, Keywords = {model selection, Markov Decision Processes, reinforcement learning, Bellman residuals, theory, nonparametrics, adaptivity}, Month = {December}, Number = {3}, Pages = {299--332}, Pdf = {papers/RLModelSelect.pdf}, Title = {Model Selection in Reinforcement Learning}, Volume = {85}, Year = {2011}, Bdsk-Url-1 = {http://dx.doi.org/10.1007/s10994-006-6888-8}} @article{AYPaSze11, Abstract = {The analysis of online least squares estimation is at the heart of many stochastic sequential decision making problems. We employ tools from the self-normalized processes to provide a simple and self-contained proof of a tail bound of a vector-valued martingale. We use the bound to construct a new tighter confidence sets for the least squares estimate. We apply the confidence sets to several online decision problems, such as the multi-armed and the linearly parametrized bandit problems. The confidence sets are potentially applicable to other problems such as sleeping bandits, generalized linear bandits, and other linear control problems. We improve the regret bound of the Upper Confidence Bound (UCB) algorithm of Auer et al. (2002) and show that its regret is with high-probability a problem dependent constant. In the case of linear bandits (Dani et al., 2008), we improve the problem dependent bound in the dimension and number of time steps. Furthermore, as opposed to the previous result, we prove that our bound holds for small sample sizes, and at the same time the worst case bound is improved by a logarithmic factor and the constant is improved. }, Author = {Abbasi-Yadkori, Y. and P{\'a}l, D. and Szepesv{\'a}ri, {Cs}.}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Date = {2011-07}, Date-Added = {2011-07-03 21:06:04 -0600}, Date-Modified = {2012-06-03 14:12:44 -0600}, Ee = {http://arxiv.org/abs/1102.2670}, Journal = {CoRR}, Keywords = {bandits, theory, tail inequalities, method of mixtures, least-squares methods}, Month = {July}, Title = {Online Least Squares Estimation with Self-Normalized Processes: An Application to Bandit Problems}, Volume = {abs/1102.2670}, Year = {2011}} @inproceedings{SziSze11, Abstract = {A popular approach in reinforcement learning is to use a model-based algorithm, i.e., an algorithm that utilizes a model learner to learn an approximate model to the environment. It has been shown such a model-based learner is efficient if the model learner is efficient in the so-called ``knows what it knows'' (KWIK) framework. A major limitation of the standard KWIK framework is that, by its very definition, it covers only the case when the (model) learner can represent the actual environment with no errors. In this paper, we introduce the agnostic KWIK learning model, where we relax this assumption by allowing nonzero approximation errors. We show that with the new definition that an efficient model learner still leads to an efficient reinforcement learning algorithm. At the same time, though, we find that learning within the new framework can be substantially slower as compared to the standard framework, even in the case of simple learning problems.}, Author = {Szita, I. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {COLT}, Date = {2011-07}, Date-Added = {2011-07-03 20:59:06 -0600}, Date-Modified = {2012-06-03 14:10:27 -0600}, Keywords = {reinforcement learning, model selection, complexity regularization, adaptivity, offline learning, off-policy learning, finite-sample bounds, theory}, Month = {July}, Pages = {739--772}, Pdf = {papers/agnosticKwik.pdf}, Title = {Agnostic {KWIK} learning and efficient approximate reinforcement learning}, Year = {2011}} @inproceedings{BaPaSze11, Abstract = {In a partial monitoring game, the learner repeatedly chooses an action, the environment responds with an outcome, and then the learner suffers a loss and receives a feedback signal, both of which are fixed functions of the action and the outcome. The goal of the learner is to minimize his regret, which is the difference between his total cumulative loss and the total loss of the best fixed action in hindsight. Assuming that the outcomes are generated in an i.i.d. fashion from an arbitrary and unknown probability distribution, we characterize the minimax regret of any partial monitoring game with finitely many actions and outcomes. It turns out that the minimax regret of any such game is either zero, Theta(T^{1/2}), Theta(T^{2/3}), or Theta(T). We provide a computationally efficient learning algorithm that achieves the minimax regret within logarithmic factor for any game.}, Author = {Bart{\'o}k, G. and P{\'a}l, D. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {COLT}, Date = {2011-07}, Date-Added = {2011-07-03 20:57:58 -0600}, Date-Modified = {2012-06-03 14:10:11 -0600}, Keywords = {online learning, partial information, theory, game theory}, Month = {July}, Pages = {133--154}, Pdf = {papers/partmon_colt_final.pdf}, Title = {Minimax Regret of Finite Partial-Monitoring Games in Stochastic Environments}, Year = {2011}} @inproceedings{AYSze11, Abstract = {We study the average cost Linear Quadratic (LQ) control problem with unknown model parameters, also known as the adaptive control problem in the control community. We design an algorithm and prove that apart from logarithmic factors its regret up to time T is O(T^{1/2}). Unlike previous approaches that use a forced-exploration scheme, we construct a high-probability confidence set around the model parameters and design an algorithm that plays optimistically with respect to this confidence set. The construction of the confidence set is based on the recent results from online least-squares estimation and leads to improved worst-case regret bound for the proposed algorithm. To the best of our knowledge this is the first time that a regret bound is derived for the LQ control problem.}, Author = {Abbasi-Yadkori, Y. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {COLT}, Date = {2011-07}, Date-Added = {2011-07-03 20:55:43 -0600}, Date-Modified = {2012-06-06 21:37:09 -0600}, Keywords = {online learning, continuous-space MDPs, continuous action space, reinforcement learning, linear dynamics, theory}, Month = {July}, Pages = {1--26}, Pdf = {papers/lqr_colt_final.pdf}, Title = {Regret Bounds for the Adaptive Control of Linear Quadratic Systems}, Year = {2011}} @inproceedings{PaZheSze11, Abstract = {We consider the problem of optimally assigning p sniffers to K channels to monitor the transmission activities in a multi-channel wireless network. The activity of users is initially unknown to the sniffers and is to be learned along with channel assignment decisions while maximizing the benifits of this assignment, resulting in the fundamental trade-off between exploration versus exploitation. We formulate it as the linear partial monitoring problem, a super-class of multi-armed bandits. As the number of arms (sniffer-channel assignments) is exponential, novel techniques are called for, to allow efficient learning. We use the linear bandit model to capture the dependency amongst the arms and develop two policies that take advantage of this dependency. Both policies enjoy logarithmic regret bound of time-slots with a term that is sub-linear in the number of arms.}, Author = {Pallavi, A. and Zheng, R. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {INFOCOM}, Date = {2011-04}, Date-Added = {2010-11-25 01:20:01 -0700}, Date-Modified = {2012-06-03 14:12:25 -0600}, Keywords = {theory, networking, wireless networks, bandits, stochastic bandits}, Month = {April}, Pages = {1152--1160}, Pdf = {papers/infocom11_final.pdf}, Title = {Sequential Learning for Optimal Monitoring of Multi-channel Wireless Networks}, Year = {2011}} @inproceedings{FaMuSz10, Abstract = {We address the question of how the approximation error/Bellman residual at each iteration of the Approximate Policy/Value Iteration algorithms influences the quality of the resulted policy. We quantify the performance loss as the L^p norm of the approximation error/Bellman residual at each iteration. Moreover, we show that the performance loss depends on the expectation of the squared Radon-Nikodym derivative of a certain distribution rather than its supremum -- as opposed to what has been suggested by the previous results. Also our results indicate that the contribution of the approximation/Bellman error to the performance loss is more prominent in the later iterations of API/AVI, and the effect of an error term in the earlier iterations decays exponentially fast.}, Author = {Farahmand, A.{m}. and Munos, R. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {NIPS}, Date = {2010-12}, Date-Added = {2010-11-06 19:04:43 -0600}, Date-Modified = {2010-11-25 00:51:19 -0700}, Keywords = {reinforcement learning, theory}, Month = {December}, Pdf = {papers/ErrorPropAPVI-NIPS09.pdf}, Title = {Error Propagation for Approximate Policy and Value Iteration (extended version)}, Year = {2010}} @inproceedings{PaPoSze10, Abstract = {We present simple and computationally efficient nonparametric estimators of R{\'e}nyi entropy and mutual information based on an i.i.d. sample drawn from an unknown, absolutely continuous distribution over R^d. The estimators are calculated as the sum of p-th powers of the Euclidean lengths of the edges of the `generalized nearest-neighbor' graph of the sample and the empirical copula of the sample respectively. For the first time, we prove the almost sure consistency of these estimators and upper bounds on their rates of convergence, the latter of which under the assumption that the density underlying the sample is Lipschitz continuous. Experiments demonstrate their usefulness in independent subspace analysis. }, Author = {P{\'a}l, D. and P{\'o}czos, B. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {NIPS}, Date = {2010-12}, Date-Added = {2010-11-06 19:04:30 -0600}, Date-Modified = {2010-11-25 00:51:16 -0700}, Keywords = {information theory, theory, mutual information, entropy}, Month = {December}, Pdf = {papers/Renyi-NIPS2010.pdf}, Title = {Estimation of {R}{\'e}nyi Entropy and Mutual Information Based on Generalized Nearest-Neighbor Graphs (extended version)}, Year = {2010}} @inproceedings{FiOlGaSze10, Abstract = { We consider structured multi-armed bandit problems based on the Generalized Linear Model (GLM) framework of statistics. For these bandits, we propose a new algorithm, called GLM-UCB. We derive finite time, high probability bounds on the regret of the algorithm, extending previous analyses developed for the linear bandits to the non-linear case. The analysis highlights a key difficulty in generalizing linear bandit algorithms to the non-linear case, which is solved in GLM-UCB by focusing on the reward space rather than on the parameter space. Moreover, as the actual effectiveness of current parameterized bandit algorithms is often poor in practice, we provide a tuning method based on asymptotic arguments, which leads to significantly better practical performance. We present two numerical experiments on real-world data that illustrate the potential of the GLM-UCB approach. }, Author = {Filippi, S. and Capp{\'e}, O. and Garivier, A. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {NIPS}, Date = {2010-12}, Date-Added = {2010-11-06 19:04:13 -0600}, Date-Modified = {2012-06-04 10:35:00 -0600}, Keywords = {bandits, stochastic bandits, theory}, Month = {December}, Pages = {586--594}, Pdf = {papers/GenLinBandits-NIPS2010.pdf}, Title = {Parametric Bandits: The Generalized Linear Case (extended version)}, Year = {2010}} @inproceedings{NeGyAnSze10, Abstract = {We consider online learning in finite stochastic Markovian environments where in each time step a new reward function is chosen by an oblivious adversary. The goal of the learning agent is to compete with the best stationary policy in terms of the total reward received. In each time step the agent observes the current state and the reward associated with the last transition, however, the agent does not observe the rewards associated with other state-action pairs. The agent is assumed to know the transition probabilities. The state of the art result for this setting is a no-regret algorithm. In this paper we propose a new learning algorithm and, assuming that stationary policies mix uniformly fast, we show that after T time steps, the expected regret of the new algorithm is O( T^{2/3} (ln T)^(1/3), giving the first rigorously proved regret bound for the problem. }, Author = {Neu, G. and Gy{\"o}rgy, A. and Antos, A. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {NIPS}, Date = {2010-12}, Date-Added = {2010-11-06 19:03:56 -0600}, Date-Modified = {2010-11-25 00:51:28 -0700}, Keywords = {online learning, Markov Decision Processes, bandits}, Month = {December}, Pdf = {papers/OnlineMDP-NIPS2010.pdf}, Title = {Online {M}arkov Decision Processes under Bandit Feedback (extended version)}, Year = {2010}} @inproceedings{balogh2000a, Abstract = {FlexVoice, an integrated text-to-speech (TTS) system is presented in this paper. Its most distinctive feature is its low memory and CPU load while preserving the high quality of leading TTS systems. FlexVoice uses a hybrid approach that combines diphone concatenation with LPC-based parametric synthesis. Major improvements of speech quality are achieved by the careful design of each module at all synthesis levels (such as selection of training data for the various machine learning methods and that of the basic synthesis units for the parametric synthesiser). FlexVoice currently supports US English with two male and two female voices. }, Author = {Balogh, {Gy}. and Dobler, E. and Gr{\Ho}bler, T. and Smodics, B. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {Text, Speech and Dialogue}, Date-Added = {2010-09-04 12:17:25 -0600}, Date-Modified = {2010-09-04 12:19:26 -0600}, Doi = {10.1007/3-540-45323-7_32}, Keywords = {application, text-to-speech synthesis}, Pages = {119--172}, Pdf = {papers/flexvoice-tsd.pdf}, Publisher = {Springer}, Title = {{F}lex{V}oice: a Parametric Approach to High-quality Speech-synthesis}, Volume = {1902}, Year = {2000}, Bdsk-Url-1 = {http://dx.doi.org/10.1007/3-540-45323-7_32}} @article{sor2000, Abstract = {Objective: Development of a fully automated computer application for detection and classification of clustered microcalcifications using neural nets. Material and Methods: Mammographic films with clustered microcalcifications of known histology were digitized. All clusters were rated by two radiologists on a 3 point scale: benign, indeterminate and malignant. Automated detected clustered microcalcifications were clustered. Features derived from those clusters were used as input to 2 artificial neural nets: one was trained to identify the indeterminate clusters, whereas the second ANN classified the remaining clusters in benign or malignant ones. Performance evaluation followed the patient-based receiver operator characteristic analysis. Results: For identification of patients with indeterminate clusters a an Az-value of 0.8741 could be achieved. For the remaining patients their clusters could be classified as benign or malignant at an Az-value of 0.8749, a sensitivity of 0.977 and specificity of 0.471. Conclusions: A fully automated computer system for detection and classification of clustered microcalcifications was developed. The system is able to identify patients with indeterminate clusters, where additional investigations are recommended, and produces a reliable estimation of the biologic dignity for the remaining ones.}, Author = {Sorantin, E. and Schmidt, F. and Mayer, H. and Becker, M. and Szepesv{\'a}ri, {Cs}. and Graif, E. and Winkler, P.}, Date-Added = {2010-09-02 10:15:27 -0600}, Date-Modified = {2010-09-02 13:09:59 -0600}, Journal = {Journal of Computing and Information Technology}, Keywords = {application, neural networks, image processing, health informatics, clinical decision support}, Number = {2}, Pdf = {papers/JCIT2000.pdf}, Title = {Computer Aided Diagnosis of Clustered Microcalcifications Using Artificial Neural Nets}, Url = {http://cit.srce.hr/index.php/CIT/article/view/1415}, Volume = {8}, Year = {2000}, Bdsk-Url-1 = {http://cit.srce.hr/index.php/CIT/article/view/1415}} @article{Schmidt99, Abstract = {We investigated a method for a fully automatic identification and interpretation process for clustered microcalcifications in mammograms. Mammographic films of 100 patients containing microcalcifications with known histology were digitized and preprocessed using standard techniques. Microcalcifications detected by an artificial neural network (ANN) were clustered and some cluster features served as the input of another ANN trained to differentiate between typical and atypical clusters, while others were fed into an ANN trained on typical clusters to evaluate these lesions. The measured sensitivity for the detection of grouped microcalcifications was 0.98. For the task of differentiation between typical and atypical clusters an Az value of 0.87 was computed, while for the diagnosis an Az value of 0.87 with a sensitivity of 0.97 and a specificity of 0.47 was obtained. The results show that a fully automatic computer system was developed for the identification and interpretation of clustered microcalcifications in mammograms with the ability to differentiate most benign lesions from malignant ones in an automatically selected subset of cases.}, Author = {Schmidt, F. and Sorantin, E. and Szepesv{\'a}ri, {Cs}. and Graif, E. and Becker, M. and Mayer, H. and Hartwagner, K.}, Date-Added = {2010-09-02 09:29:13 -0600}, Date-Modified = {2010-09-02 13:09:15 -0600}, Journal = {Physics in Medicine and Biology}, Keywords = {application, neural networks, image processing, health informatics, clinical decision support}, Number = {5}, Pages = {1231--1243}, Pdf = {papers/schmidt-99.pdf}, Title = {An Automatic Method for the Identification and Interpretation of Clustered Microcalcifications in Mammograms}, Url = {http://stacks.iop.org/0031-9155/44/i=5/a=011}, Volume = {44}, Year = {1999}, Bdsk-Url-1 = {http://stacks.iop.org/0031-9155/44/i=5/a=011}} @inproceedings{antos2007, Abstract = {We consider continuous state, continuous action batch reinforcement learning where the goal is to learn a good policy from a sufficiently rich trajectory generated by some policy. We study a variant of fitted Q-iteration, where the greedy action selection is replaced by searching for a policy in a restricted set of candidate policies by maximizing the average action values. We provide a rigorous analysis of this algorithm, proving what we believe is the first finite-time bound for value-function based algorithms for continuous state and action problems. Note: In retrospect, it would have been better to call this algorithm an actor-critic algorithm. The algorithm that we considers updates a policy and a value function (action-value function in this case).}, Author = {Antos, A. and Munos, R. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {NIPS}, Crossref = {NIPS20}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:50:53 -0700}, Keywords = {batch learning, reinforcement learning, function approximation, performance bounds, actor-critic methods, nonparametrics}, Pages = {9--16}, Pdf = {papers/rlca.pdf}, Title = {Fitted {Q}-iteration in Continuous Action-space {MDP}s}, Year = {2007}} @inproceedings{bartok2010, Abstract = {In a finite partial-monitoring game against Nature, the Learner repeatedly chooses one of finitely many actions, the Nature responds with one of finitely many outcomes, the Learner suffers a loss and receives feedback signal, both of which are fixed functions of the action and the outcome. The goal of the Learner is to minimize its total cumulative loss. We make progress towards classification of these games based on their minimax expected regret. Namely, we classify almost all games with two outcomes: We show that their minimax expected regret is either zero, $\Theta(T^{1/2})$, $\Theta(T^{2/3})$, or $\Theta(T)$ and we give a simple and efficiently computable classification of these four classes of games. Our hope is that the result can serve as a stepping stone toward classifying all finite partial-monitoring games.}, Author = {Bart{\'o}k, G. and P{\'a}l, D. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ALT}, Crossref = {ALT2010}, Date = {2010-10}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:53:04 -0700}, Doi = {10.1007/978-3-642-16108-7_20}, Editor = {Hutter, M. and Stephan, F. and Vovk, V. and Zeugmann, T.}, Keywords = {game theory, online learning, adversarial setting, theory, partial information}, Month = {October}, Pages = {224--238}, Pdf = {papers/partial-monitoring.pdf}, Title = {Toward a Classification of Finite Partial-Monitoring Games}, Year = {2010}, Bdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-642-16108-7_20}} @inproceedings{bubeck2008, Abstract = {We consider a generalization of stochastic bandit problems where the set of arms, X, is allowed to be a generic topological space and the mean-payoff function is ``locally Lipschitz'' with respect to a dissimilarity function that is known to the decision maker. Under this condition we construct an arm selection policy whose regret improves upon previous results for a large class of problems. In particular, our results imply that if X is the unit hypercube in a Euclidean space and the mean-payoff function has a finite number of global maxima around which the behavior of the function is locally H{\"o}lder with a known exponent, then the expected regret is bounded up to a logarithmic factor by sqrt(n), i.e., the rate of the growth of the regret is independent of the dimension of the space. We also prove the minimax optimality of our algorithm for the class of problems considered.}, Author = {Bubeck, S. and Munos, R. and Stoltz, G. and Szepesv{\'a}ri, {Cs}.}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {NIPS}, Crossref = {NIPS21}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2012-01-21 16:46:10 -0700}, Editor = {Koller, D. and Schuurmans, D. and Bengio, Y. and Bottou, L.}, Ee = {http://books.nips.cc/papers/files/nips21/NIPS2008_0553.pdf}, Keywords = {bandits, multi-armed bandits, large action space, stochastic bandits, theory, minimax bounds}, Pages = {201--208}, Pdf = {papers/HOO-NIPS08.pdf}, Publisher = {MIT Press}, Title = {Online Optimization in {X}-armed Bandits}, Year = {2008}} @inproceedings{farahmand2008, Abstract = {We consider planning in a Markovian decision problem, i.e., the problem of finding a good policy given access to a generative model of the environment. We propose to use fitted Q-iteration with penalized (or regularized) least-squares regression as the regression subroutine to address the problem of controlling model-complexity. The algorithm is presented in detail for the case when the function space is a reproducing kernel Hilbert space underlying a user-chosen kernel function. We derive bounds on the quality of the solution and argue that data-dependent penalties can lead to almost optimal performance. A simple example is used to illustrate the benefits of using a penalized procedure.}, Author = {Farahmand, A.{m}. and Ghavamzadeh, M. and Szepesv{\'a}ri, {Cs}. and Mannor, S.}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {EWRL}, Crossref = {EWRL2008}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:52:54 -0700}, Doi = {10.1007/978-3-540-89722-4_5}, Keywords = {reinforcement learning, planning, regularization, nonparametrics, theory, function approximation, value iteration}, Pages = {55--68}, Pdf = {papers/RegFQI-Plan-EWRL08.pdf}, Title = {Regularized Fitted {Q}-Iteration: Application to Planning}, Year = {2008}, Bdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-540-89722-4_5}} @inproceedings{farahmand2008a, Abstract = {In this paper we consider approximate policy-iteration-based reinforcement learning algorithms. In order to implement a flexible function approximation scheme we propose the use of non-parametric methods with regularization, providing a convenient way to control the complexity of the function approximator. We propose two novel regularized policy iteration algorithms by adding L2-regularization to two widely-used policy evaluation methods: Bellman residual minimization (BRM) and least-squares temporal difference learning (LSTD). We derive efficient implementation for our algorithms when the approximate value-functions belong to a reproducing kernel Hilbert space. We also provide finite-sample performance bounds for our algorithms and show that they are able to achieve optimal rates of convergence under the studied conditions.}, Author = {Farahmand, A.{m}. and Ghavamzadeh, M. and Szepesv{\'a}ri, {Cs}. and Mannor, S.}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {NIPS}, Crossref = {NIPS21}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:51:02 -0700}, Ee = {http://books.nips.cc/papers/files/nips21/NIPS2008_0871.pdf}, Keywords = {reinforcement learning, regularization, nonparametrics, theory, function approximation, policy iteration}, Pages = {441--448}, Pdf = {papers/nips08-regrl.pdf}, Title = {Regularized Policy Iteration}, Year = {2008}} @inproceedings{farahmand2007, Abstract = {Intuitively, learning should be easier when the data points lie on a low-dimensional submanifold of the input space. Recently there has been a growing interest in algorithms that aim to exploit such geometrical properties of the data. Oftentimes these algorithms require estimating the dimension of the manifold first. In this paper we propose an algorithm for dimension estimation and study its finite-sample behaviour. The algorithm estimates the dimension locally around the data points using nearest neighbor techniques and then combines these local estimates. We show that the rate of convergence of the resulting estimate is independent of the dimension of the input space and hence the algorithm is ``manifold-adaptive''. Thus, when the manifold supporting the data is low dimensional, the algorithm can be exponentially more efficient than its counterparts that are not exploiting this property. Our computer experiments confirm the obtained theoretical results.}, Author = {Farahmand, A.{m}. and Szepesv{\'a}ri, {Cs}. and Audibert, J.-Y.}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {ICML}, Crossref = {ICML2007}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:49:51 -0700}, Ee = {http://doi.acm.org/10.1145/1273496.1273530}, Keywords = {unsupervised learning, dimension estimation, theory, manifold learning}, Pages = {265--272}, Pdf = {papers/dimicml.pdf}, Title = {Manifold-adaptive Dimension Estimation}, Year = {2007}} @inproceedings{kocsis2006a, Abstract = {We consider batch reinforcement learning problems in continuous space, expected total discounted-reward Markovian Decision Problems. As opposed to previous theoretical work, we consider the case when the training data consists of a single sample path (trajectory) of some behaviour policy.In particular, we do not assume access to a generative model of the environment.The algorithm studied is fitted Q-iteration where in successive iterations the $Q$-functions of the intermediate policies are obtained by means of minimizing a novel Bellman-residual type error.PAC-style polynomial bounds are derived on the number of samples needed to guarantee near-optimal performance where the bound depends on the mixing rate of the trajectory, the smoothness properties of the underlying Markovian Decision Problem, the approximation power and capacity of the function set used.}, Author = {Kocsis, L. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ECML}, Crossref = {ECML06}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:57:10 -0700}, Keywords = {reinforcement learning, learning in games, Monte-Carlo methods, Monte-Carlo tree search, UCT, bandits}, Pages = {282--293}, Pdf = {papers/ecml06.pdf}, Title = {Bandit based {M}onte-{C}arlo Planning}, Year = {2006}} @inproceedings{maei2010, Abstract = {We present the first temporal-difference learning algorithm for off-policy control with unrestricted linear function approximation whose per-time-step complexity is linear in the number of features. Our algorithm, Greedy-GQ, is an extension of recent work on gradient temporal-difference learning, which has hitherto been restricted to a prediction (policy evaluation) setting, to a control setting in which the target policy is greedy with respect to a linear approximation to the optimal action-value function. A limitation of our control setting is that we require the behavior policy to be stationary. We call this setting latent learning because the optimal policy, though learned, is not manifest in behavior. Popular off-policy algorithms such as Q-learning are known to be unstable in this setting when used with linear function approximation.}, Author = {Maei, H.R. and Szepesv{\'a}ri, {Cs}. and Bhatnagar, S. and Sutton, R.S.}, Booktitle = {ICML}, Crossref = {ICML2010}, Date = {2010-06}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:50:14 -0700}, Editor = {F{\"u}rnkranz, J. and Joachims, T.}, Keywords = {reinforcement learning, control learning, online learning, gradient algorithm, stochastic approximation, theory, function approximation, nonlinear function approximation, neural networks}, Month = {June}, Pages = {719--726}, Pdf = {papers/ICML10_controlGQ.pdf}, Publisher = {Omnipress}, Title = {Toward Off-Policy Learning Control with Function Approximation}, Year = {2010}} @inproceedings{mnih2008, Abstract = {Sampling is a popular way of scaling up machine learning algorithms to large datasets. The question often is how many samples are needed. Adaptive stopping algorithms monitor the performance in an online fashion and they can stop early, saving valuable resources. We consider problems where probabilistic guarantees are desired and demonstrate how recently-introduced empirical Bernstein bounds can be used to design stopping rules that are efficient. We provide upper bounds on the sample complexity of the new rules, as well as empirical results on model selection and boosting in the filtering setting.}, Author = {Mnih, V. and Szepesv{\'a}ri, {Cs}. and Audibert, J.-Y.}, Booktitle = {ICML}, Crossref = {ICML2008}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:49:54 -0700}, Doi = {10.1145/1390156.1390241}, Keywords = {Bernstein's inequality, sequential algorithms, stopping problem, racing problem, pick the winner, theory}, Pages = {672--679}, Pdf = {papers/bernstein-stopping.pdf}, Title = {Empirical {B}ernstein stopping}, Year = {2008}, Bdsk-Url-1 = {http://dx.doi.org/10.1145/1390156.1390241}} @inproceedings{sutton2008a, Abstract = {We consider the problem of efficiently learning optimal control policies and value functions over large state spaces in an online setting in which estimates must be available after each interaction with the world. This paper develops an explicitly model-based approach extending the Dyna architecture to linear function approximation. Dyna-style planning proceeds by generating imaginary experience from the world model and then applying model-free reinforcement learning algorithms to the imagined state transitions. Our main results are to prove that linear Dyna-style planning converges to a unique solution independent of the generating distribution, under natural conditions. In the policy evaluation setting, we prove that the limit point is the least-squares (LSTD) solution. An implication of our results is that prioritized-sweeping can be soundly extended to the linear approximation case, backing up to preceding features rather than to preceding states. We introduce two versions of prioritized sweeping with linear Dyna and briefly illustrate their performance empirically on the Mountain Car and Boyan Chain problems.}, Author = {Sutton, R.S. and Szepesv{\'a}ri, {Cs}. and Geramifard, A. and Bowling, M. H.}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {UAI}, Crossref = {UAI2008}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:53:46 -0700}, Ee = {http://uai2008.cs.helsinki.fi/UAI_camera_ready/sutton.pdf}, Keywords = {reinforcement learning, planning, theory, stochastic approximation, asymptotic convergence, function approximation}, Pages = {528--536}, Pdf = {papers/linearDyna.pdf}, Title = {Dyna-Style Planning with Linear Function Approximation and Prioritized Sweeping}, Year = {2008}} @inproceedings{sutton2008, Abstract = {We introduce the first temporal-difference learning algorithm that is stable with linear function approximation and off-policy training, for any finite Markov decision process, behavior policy, and target policy, and whose complexity scales linearly in the number of parameters. We consider an i.i.d. policy-evaluation setting in which the data need not come from on-policy experience. The gradient temporal-difference (GTD) algorithm estimates the expected update vector of the TD(0) algorithm and performs stochastic gradient descent on its L2 norm. We prove that this algorithm is stable and convergent under the usual stochastic approximation conditions to the same least-squares solution as found by the LSTD, but without LSTD's quadratic computational complexity. GTD is online and incremental, and does not involve multiplying by products of likelihood ratios as in importance-sampling methods.}, Author = {Sutton, R.S. and Szepesv{\'a}ri, {Cs}. and Maei, H.R.}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {NIPS}, Crossref = {NIPS21}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:50:58 -0700}, Ee = {http://books.nips.cc/papers/files/nips21/NIPS2008_0421.pdf}, Keywords = {reinforcement learning, prediction, online learning, gradient algorithm, stochastic approximation, theory, function approximation, GTD}, Pages = {1609--1616}, Pdf = {papers/gtdnips08.pdf}, Title = {A Convergent {O}(n) Algorithm for Off-policy Temporal-difference Learning with Linear Function Approximation}, Year = {2008}} @inproceedings{szepesvari1997b, Abstract = {In this paper we show that for discounted MDPs with discount factor $\gamma>1/2$ the asymptotic rate of convergence of Q-learning is O($1/t^{R(1-\gamma$)}) if $R(1-\gamma)<1/2$ and O($\sqrt{\log\log t/ t}$) otherwise provided that the state-action pairs are sampled from a fixed probability distribution. Here $R=p_{min}/p_{max}$ is the ratio of the minimum and maximum state-action occupation frequencies. The results extend to convergent on-line learning provided that $p_{min}>0$, where $p_{min}$ and $p_{max}$ now become the minimum and maximum state-action occupation frequencies corresponding to the stationary distribution.}, Author = {Szepesv{\'a}ri, {Cs}.}, Booktitle = {NIPS}, Crossref = {NIPS10}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:50:27 -0700}, Keywords = {reinforcement learning, control learning, finite MDPs, online learning, stochastic approximation, theory}, Pages = {1064--1070}, Pdf = {papers/nips97.ps.pdf}, Title = {The Asymptotic Convergence-Rate of {Q}-learning}, Year = {1997}} @inproceedings{szepesvari2004, Abstract = {We consider a variant of Q-learning in continuous state spaces under the total expected discounted cost criterion combined with local function approximation methods. Provided that the function approximator satisfies certain interpolation properties, the resulting algorithm is shown to converge with probability one. The limit function is shown to satisfy a fixed point equation of the Bellman type, where the fixed point operator depends on the stationary distribution of the exploration policy and approximation properties of the function approximation method. The basic algorithm is extended in several ways. In particular, a variant of the algorithm is obtained that is shown to converge in probability to the optimal Q function. Preliminary computer simulations confirm the validity of the approach.}, Author = {Szepesv{\'a}ri, {Cs}. and Smart, W. D.}, Booktitle = {ICML}, Crossref = {ICML2004}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:49:43 -0700}, Keywords = {reinforcement learning, control learning, online learning, stochastic approximation, theory, function approximation}, Pages = {791--798}, Pdf = {papers/szws_icml2004_rlfapp.pdf}, Title = {Interpolation-based {Q}-learning}, Year = {2004}} @inproceedings{szita2010, Abstract = {A strong selling point of using a model in reinforcement learning is that model-based algorithms can propagate the obtained experience more quickly, and are able to direct exploration better. As a consequence, fewer exploratory actions are enough to learn a good policy. Strangely enough, current theoretical results for model-based algorithms do not support this claim: In a Markov decision process with N states, the best bounds on the number of exploratory steps necessary are of order $O(N^2 \log N)$, in contrast to the $O(N \log N)$ bound available for the model-free, delayed Q-learning algorithm. In this paper we show that a modified version of the Rmax algorithm needs to make at most $O(N \log N)$ exploratory steps. This matches the lower bound up to logarithmic factors, as well as the upper bound of the state-of-the-art model-free algorithm, while our new bound improves the dependence on the discount factor gamma.}, Author = {Szita, I. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ICML}, Crossref = {ICML2010}, Date = {2010-06}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:50:11 -0700}, Editor = {F{\"u}rnkranz, J. and Joachims, T.}, Ee = {http://www.icml2010.org/papers/546.pdf}, Keywords = {reinforcement learning, PAC-learning, theory, exploration vs. exploitation, sequential algorithms}, Month = {June}, Pages = {1031--1038}, Pdf = {papers/ICML10_rmax_improved.pdf}, Publisher = {Omnipress}, Title = {Model-based Reinforcement Learning with Nearly Tight Exploration Complexity Bounds}, Year = {2010}} @article{a.antos2010, Abstract = {We consider the problem of actively learning the mean values of distributions associated with a finite number of options. The decision maker can select which option to generate the next observation from, the goal being to produce estimates with equally good precision for all the options. If sample means are used to estimate the unknown values then the optimal solution, assuming that the distributions are known up to a shift, is to sample from each distribution proportional to its variance. No information other than the distributions' variances is needed to calculate the optimal solution. In this paper we propose an incremental algorithm that asymptotically achieves the same loss as an optimal rule. We prove that the excess loss su ered by this algorithm, apart from logarithmic factors, scales as $n^{(-3/2)}$, which we conjecture to be the optimal rate. The performance of the algorithm is illustrated on a simple problem.}, Author = {Antos, A. and Grover, V. and Szepesv{\'a}ri, {Cs}.}, Date = {2010-06}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:42:00 -0700}, Doi = {10.1016/j.tcs.2010.04.007}, Issn = {0304-3975}, Journal = {Theoretical Computer Science}, Keywords = {active learning, regression, sequential algorithms, theory}, Month = {June}, Number = {29--30}, Pages = {2712--2728}, Pdf = {papers/Allocation-TCS10.pdf}, Title = {Active Learning in Heteroscedastic Noise}, Volume = {411}, Year = {2010}, Bdsk-Url-1 = {http://dx.doi.org/10.1016/j.tcs.2010.04.007}} @inproceedings{a.farhangfar2009, Abstract = {We address the task of actively learning a segmentation system: given a large number of unsegmented images, and access to an oracle that can segment a given image, decide which images to provide, to quickly produce a segmenter (here, a discriminative random field) that is accurate over this distribution of images. We extend the standard models for active learner to define a system for this task that first selects the image whose expected label will reduce the uncertainty of the other unlabeled images the most, and then after greedily selects, from the pool of unsegmented images, the most informative image. The results of our experiments, over two real-world datasets (segmenting brain tumors within magnetic resonance images; and segmenting the sky in real images) show that training on very few informative images (here, as few as 2) can produce a segmenter that is as good as training on the entire dataset.}, Author = {Farhangfar, A. and Greiner, R. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ICML}, Date-Modified = {2010-11-25 00:50:08 -0700}, Keywords = {active learning, supervised learning, structured prediction}, Pages = {1-- 2}, Pdf = {papers/LearningToSegment-ICML09.pdf}, Timestamp = {2010.08.31}, Title = {Learning to Segment from a Few Well-Selected Training Images}, Year = {2009}} @inproceedings{a.isaza2008, Abstract = {In this paper, we consider planning in stochastic shortest path problems, a subclass of Markov Decision Problems (MDP). We focus on medium-size problems whose state space can be fully enumerated. This problem has numerous important applications, such as navigation and planning under uncertainty. We propose a new approach for constructing a multi-level hierarchy of progressively simpler abstractions of the original problem. Once computed, the hierarchy can be used to speed up planning by first finding a policy for the most abstract level and then recursively refining it into a solution to the original problem. This approach is fully automated and delivers a speed-up of two orders of magnitude over a state-of-the-art MDP solver on sample problems while returning near-optimal solutions.}, Author = {Isaza, A. and Szepesv{\'a}ri, {Cs}. and Bulitko, V. and Greiner, R.}, Booktitle = {UAI}, Date-Modified = {2010-11-25 00:53:43 -0700}, Keywords = {planning, finite MDPs, abstraction, macro learning, shortest path problem, options}, Owner = {Beata}, Pages = {306--314}, Pdf = {papers/prmdp.pdf}, Timestamp = {2010.08.29}, Title = {Speeding Up Planning in {M}arkov Decision Processes via Automatically Constructed Abstractions}, Year = {2008}} @inproceedings{a.kocsor2004, Abstract = {We propose a new feature extraction method called Margin Maximizing Discriminant Analysis (MMDA) which seeks to extract features suitable for classification tasks. MMDA is based on the principle that an ideal feature should convey the maximum information about the class labels and it should depend only on the geometry of the optimal decision boundary and not on those parts of the distribution of the input data that do not participate in shaping this boundary. Further, distinct feature components should convey unrelated information about the data. Two feature extraction methods are proposed for calculating the parameters of such a projection that are shown to yield equivalent results. The kernel mapping idea is used to derive non-linear versions. Experiments with several real-world, publicly available data sets demonstrate that the new method yields competitive results. }, Author = {Kocsor, A. and Korn{\'e}l, K. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ECML/PKDD-2004}, Date-Modified = {2010-09-02 13:09:16 -0600}, Keywords = {classification, supervised learning, supervised feature extraction, kernels}, Owner = {Beata}, Pages = {1-- 2}, Pdf = {papers/mmda-ecml2004.pdf}, Timestamp = {2010.08.31}, Title = {Margin Maximizing Discriminant Analysis}, Year = {2004}} @article{a.lorincz2001, Abstract = {A functional model of the basal ganglia-thalamocortical (BTC) loops is described. In our modeling effort, we try to minimize the complexity of our starting hypotheses. For that reason, we call this type of modeling Ockham's razor modeling. We have the additional constraint that the starting assumptions should not contradict experimental findings about the brain. First assumption: The brain lacks direct representation of paths but represents directions (called speed fields in control theory). Then control should be concerned with speed-field tracking (SFT). Second assumption: Control signals are delivered upon differencing in competing parallel channels of the BTC loops. This is modeled by extending SFT with differencing that gives rise to the robust Static and Dynamic State (SDS) feedback-controlling scheme. Third assumption: Control signals are expressed in terms of a gelatinous medium surrounding the limbs. This is modeled by expressing parameters of motion in parameters of the external space. We show that corollaries of the model fit properties of the BTC loops. The SDS provides proper identification of motion related neuronal groups of the putamen. Local minima arise during the controlling process that works in external space. The model explains the presence of parallel channels as the means to avoiding such local minima. Stability conditions of the SDS predict that the initial phase of learning is mostly concerned with selection of sign for the inverse dynamics. The model provides a scalable controller. State description in external space instead of configurational space reduces the dimensionality problem. Falsifying experiment is suggested. Computer experiments demonstrate the feasibility of the approach. We argue that the resulting scheme has a straightforward connectionist representation exhibiting population coding and Hebbian learning properties. }, Author = {L{\"o}rincz, A. and H{\'e}vizi, Gy. and Szepesv{\'a}ri, {Cs}.}, Date-Modified = {2010-09-02 13:09:16 -0600}, Journal = {International Journal of Neural Systems}, Keywords = {biological modeling, neural networks}, Owner = {Beata}, Pages = {1-- 2}, Timestamp = {2010.08.31}, Title = {Ockham's Razor Modeling of the Matrisome Channels of the Basal Ganglia Thalamocortical Loop}, Url = {http://ejournals.wspc.com.sg/ijns/11/1102/S0129065701000412.html}, Volume = {11}, Year = {2001}, Bdsk-Url-1 = {http://ejournals.wspc.com.sg/ijns/11/1102/S0129065701000412.html}} @inproceedings{antos2008, Abstract = {In this paper we consider the problem of actively learning the mean values of distributions associated with a finite number of options (arms). The algorithms can select which option to generate the next sample from in order to produce estimates with equally good precision for all the distributions. When an algorithm uses sample means to estimate the unknown values then the optimal solution, assuming full knowledge of the distributions, is to sample each option proportional to its variance. In this paper we propose an incremental algorithm that asymptotically achieves the same loss as an optimal rule. We prove that the excess loss suffered by this algorithm, apart from logarithmic factors, scales as $1/n^{(3/2)}$, which we conjecture to be the optimal rate. The performance of the algorithm is illustrated in a simple problem.}, Author = {Antos, A. and Grover, V. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ALT}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:55:25 -0700}, Keywords = {active learning, regression, sequential algorithms, theory}, Note = {See \cite{a.antos2010} for an extended version}, Pages = {287--302}, Pdf = {papers/Allocation.pdf}, Publisher = {Springer-Verlag}, Series = {Lecture Notes in Computer Science 5254}, Title = {Active learning in Multi-armed bandits}, Year = {2008}} @inproceedings{antos2007a, Abstract = {We consider batch reinforcement learning problems in continuous space, expected total discounted-reward Markovian Decision Problems when the training data is composed of the trajectory of some fixed behaviour policy. The algorithm studied is policy iteration where in successive iterations the action-value functions of the intermediate policies are obtained by means of approximate value iteration. PAC-style polynomial bounds are derived on the number of samples needed to guarantee near optimal performance. The bounds depend on the mixing rate of the trajectory, the smoothness properties of the underlying Markovian Decision Problem, the approximation power and capacity of the function set used. One of the main novelties of the paper is that new smoothness constraints are introduced thereby significantly extending the scope of previous results.}, Author = {Antos, A. and Szepesv{\'a}ri, {Cs}. and Munos, R.}, Booktitle = {2007 {IEEE} Symposium on Approximate Dynamic Programming and Reinforcement Learning ({ADPRL} 2007)}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-05 00:56:00 -0600}, Keywords = {reinforcement learning, nonparametrics, theory, function approximation, policy iteration}, Note = {(Honolulu, Hawaii, Apr 1--5, 2007.)}, Pages = {330--337}, Pdf = {papers/sapi_adprl4aa.pdf}, Publisher = {IEEE}, Title = {Value-iteration Based Fitted Policy Iteration: Learning with a Single Trajectory}, Year = {2007}} @inproceedings{antos2006, Abstract = {We consider batch reinforcement learning problems in continuous space, expected total discounted-reward Markovian Decision Problems. As opposed to previous theoretical work, we consider the case when the training data consists of a single sample path (trajectory) of some behaviour policy.In particular, we do not assume access to a generative model of the environment.The algorithm studied is fitted Q-iteration where in successive iterations the $Q$-functions of the intermediate policies are obtained by means of minimizing a novel Bellman-residual type error.PAC-style polynomial bounds are derived on the number of samples needed to guarantee near-optimal performance where the bound depends on the mixing rate of the trajectory, the smoothness properties of the underlying Markovian Decision Problem, the approximation power and capacity of the function set used.}, Author = {Antos, A. and Szepesv{\'a}ri, {Cs}. and Munos, R.}, Booktitle = {COLT}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2011-07-26 10:02:37 -0600}, Keywords = {reinforcement learning, nonparametrics, theory, function approximation, policy iteration}, Pages = {574--588}, Pdf = {papers/colt2006.pdf}, Title = {Learning Near-optimal Policies with {B}ellman-residual Minimization based Fitted Policy Iteration and a Single Sample Path}, Year = {2006}} @article{audibert2009, Abstract = {Algorithms based on upper confidence bounds for balancing exploration and exploitation are gaining popularity since they are easy to implement, efficient and effective. This paper considers a variant of the basic algorithm for the stochastic, multi-armed bandit problem that takes into account the empirical variance of the different arms. In earlier experimental works, such algorithms were found to outperform the competing algorithms. We provide the first analysis of the expected regret for such algorithms. As expected, our results show that the algorithm that uses the variance estimates has a major advantage over its alternatives that do not use such estimates provided that the variances of the payoffs of the suboptimal arms are low. We also prove that the regret concentrates only at a polynomial rate. This holds for all the upper confidence bound based algorithms and for all bandit problems except those special ones where with probability one the payoff obtained by pulling the optimal arm is larger than the expected payoff for the second best arm. Hence, although upper confidence bound bandit algorithms achieve logarithmic expected regret rates, they might not be suitable for a risk-averse decision maker. We illustrate some of the results by computer simulations.}, Author = {Audibert, J.-Y. and Munos, R. and Szepesv{\'a}ri, {Cs}.}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Ee = {http://dx.doi.org/10.1016/j.tcs.2009.01.016}, Journal = {Theoretical Computer Science}, Keywords = {multi-armed bandits, sequential algorithms, stochastic bandits, Bernstein's inequality, theory}, Number = {19}, Pages = {1876--1902}, Pdf = {papers/ucbtuned-journal.pdf}, Title = {Exploration-exploitation Tradeoff using Variance Estimates in Multi-armed Bandits}, Volume = {410}, Year = {2009}} @inproceedings{audibert2007, Abstract = {Algorithms based on upper-confidence bounds for balancing exploration and exploitation are gaining popularity since they are easy to implement, efficient and effective. In this paper we consider a variant of the basic algorithm for the stochastic, multi-armed bandit problem that takes into account the empirical variance of the different arms. In earlier experimental works, such algorithms were found to outperform the competing algorithms. The purpose of this paper is to provide a theoretical explanation of these findings and provide theoretical guidelines for the tuning of the parameters of these algorithms. For this we analyze the expected regret and for the first time the concentration of the regret. The analysis of the expected regret shows that variance estimates can be especially advantageous when the payoffs of suboptimal arms have low variance. The risk analysis, rather unexpectedly, reveals that except some very special bandit problems, for upper confidence bound based algorithms with standard bias sequences, the regret concentrates only at a polynomial rate. Hence, although these algorithms achieve logarithmic expected regret rates, they seem less attractive when the risk of achieving much worse than logarithmic cumulative regret is also taken into account.}, Author = {Audibert, J.-Y. and Munos, R. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ALT}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2012-06-06 21:39:14 -0600}, Keywords = {multi-armed bandits, sequential algorithms, stochastic bandits, Bernstein's inequality, theory}, Note = {See \cite{audibert2009} for a longer, updated version}, Pages = {150--165}, Pdf = {papers/ucb_alt.pdf}, Ppt = {talks/ALT07-UCBTuned-Talk.ppt}, Publisher = {Springer}, Title = {Tuning Bandit Algorithms in Stochastic Environments}, Year = {2007}} @inproceedings{auer2007, Abstract = {Considering one-dimensional continuum-armed bandit problems, we propose an improvement of an algorithm of Kleinberg and a new set of conditions which give rise to improved rates. In particular, we introduce a novel assumption that is complementary to the previous smoothness conditions, while at the same time smoothness of the mean payoff function is required only at the maxima. Under these new assumptions new bounds on the expected regret are derived. In particular, we show that apart from logarithmic factors, the expected regret scales with the square-root of the number of trials, provided that the mean payoff function has finitely many maxima and its second derivatives are continuous and non-vanishing at the maxima. This improves a previous result of Cope by weakening the assumptions on the function. We also derive matching lower bounds. To complement the bounds on the expected regret, we provide high probability bounds which exhibit similar scaling.}, Author = {Auer, P. and Ortner, R. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {COLT}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:57:36 -0700}, Keywords = {bandits, multi-armed bandits, large action space, stochastic bandits, theory, minimax bounds}, Pages = {454--468}, Pdf = {papers/ContinuousBandits.pdf}, Title = {Improved Rates for the Stochastic Continuum-Armed Bandit Problem}, Year = {2007}} @inproceedings{b.poczos2010, Abstract = {Most learning algorithms assume that a data set is given initially. We address the common situation where data is not available initially, but can be obtained, at a cost. We focus on learning Bayesian belief networks (BNs) over discrete variables. As such BNs are models of probabilistic distributions, we consider the ``generative'' challenge of learning the parameters, for a fixed structure, that best match the true distribution. We focus on the budgeted learning setting, where there is a known fixed cost $c_i$ for acquiring the value of the i-th feature for any specified instance, and a known total cost to spend acquiring all information. After formally defining this problem from a Bayesian perspective, we first consider allocation algorithms that must decide, before seeing any results, which features of which instances to probe. We show this is NP-hard, even if all variables are independent, then prove that the greedy allocation algorithm IGA is optimal when the costs are uniform and the features are independent, but can otherwise be sub-optimal. We then show that general (non-allocation) policies perform better, and explore the challenges of learning the parameters for general belief networks in this setting, describing conditions for when the obvious round-robin algorithm will, versus will not work optimally. We also explore the effectiveness of this and various other heuristic algorithms.}, Author = {Li, L. and P{\'o}czos, B. and Greiner, R. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ICML}, Date = {2010-06}, Date-Modified = {2010-11-25 00:50:18 -0700}, Editor = {F{\"u}rnkranz, J. and Joachims, T.}, Ee = {http://www.icml2010.org/papers/406.pdf}, Keywords = {budgeted learning, sequential algorithms, unsupervised learning, density estimation}, Month = {June}, Owner = {Beata}, Pages = {879--886}, Pdf = {papers/BDL_ICML2010.pdf}, Publisher = {Omnipress}, Timestamp = {2010.08.31}, Title = {Budgeted Distribution Learning of Belief Net Parameters}, Year = {2010}} @inproceedings{balogh2000, Abstract = {The TTS system described in this paper is based on the analysis and resynthesis of a given speaker's voice. FlexVoice provides large flexibility in changing voice properties independently from the vocal tract parameters. This flexibility can be demonstrated by a number of voice conversions including female-to-male and female-to-child conversions. FlexVoice only uses a fraction of the resources of a PC and its quality is comparable to that of the leading TTS systems.}, Address = {London}, Author = {Balogh, {Gy}. and Dobler, E. and Gr{\Ho}bler, T. and Smodics, B. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {IEE Seminar on State-of-the-Art in Speech Synthesis}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-04 12:21:09 -0600}, Keywords = {application, text-to-speech synthesis}, Month = {april}, Pages = {15/1--15/6}, Pdf = {papers/flexvoice-iee.ps.pdf}, Publisher = {IEE Electronics \& Communications}, Title = {{F}lex{V}oice: a Parametric Approach to High-quality Speech-synthesis}, Year = {2000}} @inproceedings{bartok2008, Abstract = {The question investigated in this paper is to what extent an input representation influences the success of learning, in particular from the point of view of analyzing agents that can interact with their environment. We investigate learning environments that have a group structure. We introduce a learning model in different variants and study under which circumstances group structures can be learned efficiently from experimenting with group generators (actions). Negative results are presented, even without efficiency constraints, for rather general classes of groups showing that even with group structure, learning an environment from partial information is far from trivial. However, positive results for special subclasses of Abelian groups turn out to be a good starting point for the design of efficient learning algorithms based on structured representations.}, Author = {Bart{\'o}k, G. and Szepesv{\'a}ri, {Cs}. and Zilles, S.}, Booktitle = {ALT}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:55:31 -0700}, Doi = {10.1007/978-3-540-87987-9_28}, Keywords = {active learning, sequential algorithms, theory}, Note = {See \cite{bartok2010a} for a longer, updated version}, Pages = {329--343}, Pdf = {papers/bartokSZ08group.pdf}, Publisher = {Springer-Verlag}, Series = {Lecture Notes in Computer Science 5254}, Title = {Active Learning of Group-Structured Environments}, Year = {2008}, Bdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-540-87987-9_28}} @article{bartok2010a, Abstract = {We investigate the problem of learning the transition dynamics of deterministic, discrete-state environments. We assume that an agent exploring such an environment is able to perform actions (from a finite set of actions) in the environment and to sense the state changes. The question investigated is whether the agent can learn the dynamics without visiting all states. Such a goal is unrealistic in general, hence we assume that the environment has structural properties an agent might exploit. In particular, we assume that the set of all action sequences forms an algebraic group. We introduce a learning model in different variants and study under which circumstances the corresponding ``group structured environments'' can be learned efficiently by experimenting with group generators (actions). It turns out that for some classes of such environments the choice of actions given to the agent determines if efficient learning is possible. Negative results are presented, even without efficiency constraints, for rather general classes of groups, showing that even with group structure, learning an environment from partial information is far from trivial. However, positive results for special subclasses of Abelian groups turn out to be a good starting point for the design of efficient learning algorithms based on structured representations.}, Author = {Bart{\'o}k, G. and Szepesv{\'a}ri, {Cs}. and Zilles, S.}, Date = {2010-04}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:42:38 -0700}, Doi = {10.1016/j.ic.2009.09.001}, Journal = {Information and Computation}, Keywords = {active learning, sequential algorithms, theory}, Month = {April}, Pages = {364--384}, Pdf = {papers/bartokSZ09groups.pdf}, Title = {Models of Active Learning in Group-structured State Spaces}, Volume = {208}, Year = {2010}, Bdsk-Url-1 = {http://dx.doi.org/10.1016/j.ic.2009.09.001}} @techreport{beleznay1999, Abstract = {We compare scaling properties of several value-function estimation algorithms. In particular, we prove that Q-learning can scale exponentially slowly with the number of states. We identify the reasons of the slow convergence and show that both TD($\lambda$) and learning with a fixed learning-rate enjoy rather fast convergence, just like the model-based method.}, Address = {Budapest 1121, Konkoly Th. M. u. 29-33, Hungary}, Author = {Beleznay, F. and Gr{\Ho}bler, T. and Szepesv{\'a}ri, {Cs}.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:15 -0600}, Institution = {Mindmaker Ltd.}, Keywords = {theory, rate of convergence, reinforcement learning}, Number = {TR-99-02}, Pdf = {papers/slowql-tr99-02.ps.pdf}, Title = {Comparing Value-Function Estimation Algorithms in Undiscounted Problems}, Year = {1999}} @article{bubeck2010, Abstract = {We consider a generalization of stochastic bandits where the set of arms, X, is allowed to be a generic measurable space and the mean-payoff function is "locally Lipschitz" with respect to a dissimilarity function that is known to the decision maker. Under this condition we construct an arm selection policy, called HOO (hierarchical optimistic optimization), with improved regret bounds compared to previous results for a large class of problems. In particular, our results imply that if X is the unit hypercube in a Euclidean space and the mean-payoff function has a finite number of global maxima around which the behavior of the function is locally continuous with a known smoothness degree, then the expected regret of HOO is bounded up to a logarithmic factor by √n, that is, the rate of growth of the regret is independent of the dimension of the space. We also prove the minimax optimality of our algorithm when the dissimilarity is a metric. Our basic strategy has quadratic computational complexity as a function of the number of time steps and does not rely on the doubling trick. We also introduce a modified strategy, which relies on the doubling trick but runs in linearithmic time. Both results are improvements with respect to previous approaches.}, Author = {Bubeck, S. and Munos, R. and Stoltz, G. and Szepesv{\'a}ri, {Cs}.}, Date = {2011-06}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2011-07-04 11:16:14 -0600}, Ee = {http://www.jmlr.org/papers/volume12/bubeck11a/bubeck11a.pdf}, Journal = {Journal of Machine Learning Research}, Keywords = {bandits, multi-armed bandits, large action space, stochastic bandits, theory, minimax bounds}, Month = {June}, Note = {Submitted on 21/1/2010}, Pages = {1655--1695}, Pdf = {papers/BMSS10.pdf}, Title = {X-Armed Bandits}, Volume = {12}, Year = {2011}} @inproceedings{cs.szepesvari2004, Abstract = {In this paper we consider two novel kernel machine based feature extraction algorithms in a regression settings. The first method is derived based on the principles underlying the recently introduced Maximum Margin Discrimination Analysis (MMDA) algorithm. However, here it is shown that the orthogonalization principle employed by the original MMDA algorithm can be motivated using the well-known ambiguity decomposition, thus providing a firm ground for the good performance of the algorithm. The second algorithm combines kernel machines with average derivative estimation and is derived from the assumption that the true regressor function depends only on a subspace of the original input space. The proposed algorithms are evaluated in preliminary experiments conducted with artificial and real datasets. }, Author = {Szepesv{\'a}ri, {Cs}. and Korn{\'e}l, K. and Kocsor, A.}, Booktitle = {ECAI}, Date-Modified = {2010-11-25 00:59:12 -0700}, Keywords = {supervised feature extraction, supervised learning, regression}, Owner = {Beata}, Pages = {1-- 2}, Pdf = {papers/ecai-mmda.pdf}, Timestamp = {2010.08.31}, Title = {Kernel Machine Based Feature Extraction Algorithm for Regression Problems}, Year = {2004}} @inproceedings{farahmand2009, Abstract = {Reinforcement learning with linear and non-linear function approximation has been studied extensively in the last decade. However, as opposed to other fields of machine learning such as supervised learning, the effect of finite sample has not been thoroughly addressed within the reinforcement learning framework. In this paper we propose to use $L^2$ regularization to control the complexity of the value function in reinforcement learning and planning problems. We consider the Regularized Fitted Q-Iteration algorithm and provide generalization bounds that account for small sample sizes. Finally, a realistic visual-servoing problem is used to illustrate the benefits of using the regularization procedure.}, Author = {Farahmand, A.{m}. and Ghavamzadeh, M. and Szepesv{\'a}ri, {Cs}. and Mannor, S.}, Booktitle = {ACC}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:56:08 -0700}, Keywords = {reinforcement learning, planning, regularization, nonparametrics, theory, function approximation, value iteration}, Pages = {725--730}, Pdf = {papers/RegPlan-ACC09.pdf}, Title = {Regularized Fitted {Q}-iteration for Planning in Continuous-Space Markovian Decision Problems}, Year = {2009}} @inproceedings{farahmand2009a, Abstract = {To address the difficulty of designing a controller for complex visual-servoing tasks, two learning-based uncalibrated approaches are introduced. The first method starts by building an estimated model for the visual-motor forward kinematic of the vision-robot system by a locally linear regression method. Afterwards, it uses a reinforcement learning method named Regularized Fitted Q-Iteration to find a controller (i.e. policy) for the system (model-based RL). The second method directly uses samples coming from the robot without building any intermediate model (model-free RL). The simulation results show that both methods perform comparably well despite not having any a priori knowledge about the robot.}, Author = {Farahmand, A.{m}. and Shademan, A. and J{\"a}gersand, M. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ICRA}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Keywords = {robotics, application, vision, reinforcement learning}, Pages = {2917--2924}, Pdf = {papers/MBRL4VisualServoing.pdf}, Title = {Model-based and Model-free Reinforcement Learning for Visual Servoing}, Year = {2009}} @inproceedings{fomin1994, Abstract = {Self-organizing neural network solutions to control problems are described. Competitive networks create spatial filters and geometry connections in a self-organizing fashion. The goal position, the obstacle and the object under control all create neural activities through the filters. Spreading activation that discriminates between the controlled object, the goal position and the obstacles is utilized on the internal representation. Local self-training method and Hebbian learning develops the self-organizing control connections. The algorithm provides maneouvering capability in unseen scenes.}, Address = {Orlando, Florida}, Author = {Fomin, T. and Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Booktitle = {Proc. of IEEE WCCI ICNN'94}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Keywords = {neural networks, control}, Pages = {2777--2780}, Pdf = {papers/fomin.neucont.ps.pdf}, Publisher = {IEEE Inc.}, Title = {Self-Organizing Neurocontrol}, Volume = {5}, Year = {1994}} @article{french2000, Abstract = {We consider systems satisfying a matching condition which are functionally known up to weighted L2 and L1 measures of uncertainty. A modified LQ measure of control and state transient performance is given, and the performance of a class of approximate model based adaptive controllers is studied. An upper performance bound is derived in terms of the uncertainty models (stability and the state transient bounds require only the L2 uncertainty model; control effort bounds require both L2 and L1 uncertainty models), and various structural properties of the model basis. Sufficient conditions are given to ensure that the performance is bounded independently of the model basis size.}, Author = {French, M.C. and Szepesv{\'a}ri, {Cs}. and Rogers, E.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Journal = {IEEE Transactions on Automatic Control}, Keywords = {adaptive control, performance bounds, Lyapunov design, strict feedback, chain of integrators, matched uncertainty, theory, control, nonparametrics, stabilization}, Number = {2}, Pages = {353--358}, Pdf = {papers/fsr97.ps.pdf}, Title = {Uncertainty, Performance, and Model Dependency in Approximate Adaptive Nonlinear Control}, Volume = {45}, Year = {2000}} @article{french2000a, Abstract = {We consider functionally uncertain systems which can be written in an output feedback form, where the nonlinearities are functions of the output only. The uncertainty is described by a weighted L 2 norm about a nominal system, and an approximate adaptive design is given which ensures output practical stability. The main result requires knowledge of the weighted L 2 uncertainty level. An upper bound on the LQ performance of the output transient and the control input is derived, where the cost penalises the output transient and the control effort on the time interval where the output lies outside the prescribed neighbourhood of zero to which we achieve convergence.}, Author = {French, M.C. and Szepesv{\'a}ri, {Cs}. and Rogers, E.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Journal = {Automatica}, Keywords = {adaptive control, performance bounds, Lyapunov design, output feedback, theory, control, nonparametrics, output tracking, output stabilization}, Pdf = {http://eprints.ecs.soton.ac.uk/9066/1/mcfcsetar_aut2002.pdf}, Title = {LQ performance Bounds for Adaptive Output Feedback Controllers for Functionally Uncertain Systems}, Year = {2000}} @article{french2000b, Abstract = {We consider the adaptive tracking problem for a chain of integrators, where the uncertainty is static and functional. The uncertainty is specified by L2/L1 or weighted L2/L1 norm bounds. We analyse a standard Lyapunov based adaptive design which utilizes a function approximator to induce a parametric uncertainty, on which the adaptive design is completed. Performance is measured by a modified LQ cost functional, penalising both the tracking error transient and the control effort. With such a cost functional, it is shown that a standard control design has divergent performance when the resolution of a `mono-resolution' approximator is increased. The class of `mono-resolution' approximators includes models popular in applications. A general construction of a class of approximators and their associated controllers which have a uniformly bounded performance independent of the resolution of the approximator is given.}, Author = {French, M.C. and Szepesv{\'a}ri, {Cs}. and Rogers, E.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-05 01:18:16 -0600}, Journal = {Mathematics of Control, Signals and Systems}, Keywords = {adaptive control, performance bounds, Lyapunov design, chain of integrators, tracking, multiresolution function approximation, theory, nonparametrics}, Pages = {1--2}, Pdf = {http://eprints.ecs.soton.ac.uk/6643/1/csmcfetar_mcss2002.pdf}, Title = {An Asymptotic Scaling Analysis of {LQ} performance for an Approximate Adaptive Control Design}, Volume = {15}, Year = {2000}} @inproceedings{french2000c, Author = {French, M.C. and Szepesv{\'a}ri, {Cs}. and Rogers, E.}, Booktitle = {Proceedings of the International Symposium on the Mathematical Theory of Networks and Systems (MTNS 2000)}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Keywords = {adaptive control, performance bounds, Lyapunov design, chain of integrators, tracking, multiresolution function approximation, theory, nonparametrics}, Title = {Scaling of {LQ} Performance in Approximate Adaptive Designs}, Year = {2000}} @inproceedings{french1998, Abstract = {We consider nonlinear systems in an output feedback form which are functionally known up to a L2 measure of uncertainty. The control task is to drive the output of the system to some neighbourhood of the origin. A modified L2 measure of transient performance (penalising both state and control effort) is given, and the performance of a class of model based adaptive controllers is studied. An upper performance bound is derived.}, Address = {Tampa, Florida}, Author = {French, M.C. and Szepesv{\'a}ri, {Cs}. and Rogers, E.}, Booktitle = {CDC}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:56:19 -0700}, Keywords = {adaptive control, performance bounds, Lyapunov design, output feedback, theory, control, nonparametrics, output tracking, output stabilization}, Month = {December}, Pages = {4515--4520}, Pdf = {papers/cdc98.ps.pdf}, Publisher = {IEEE}, Title = {Uncertainty and Performance of Adaptive Controllers for Functionally Uncertain Output Feedback Systems}, Year = {1998}} @book{french2003, Author = {French, M.C. and Szepesv{\'a}ri, {Cs}. and Rogers, E.}, Booktitle = {Performance of Nonlinear Approximate Adaptive Controllers}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Keywords = {control, theory, adaptive control, Lyapunov design, function approximation}, Publisher = {Wiley}, Title = {Performance of Nonlinear Approximate Adaptive Controllers}, Year = {2003}} @inproceedings{g.neu2010, Abstract = {We consider a stochastic extension of the loop-free shortest path problem with adversarial rewards. In this episodic Markov decision problem an agent traverses through an acyclic graph with random transitions: at each step of an episode the agent chooses an action, receives some reward, and arrives at a random next state, where the reward and the distribution of the next state depend on the actual state and the chosen action. We consider the bandit situation when only the reward of the just visited state-action pair is revealed to the agent. For this problem we develop algorithms that perform asymptotically as well as the best stationary policy in hindsight. Assuming that all states are reachable with probability $\alpha > 0$ under all policies, we give an algorithm and prove that its regret is $O(L^2 \sqrt(T|A|)/\alpha)$, where $T$ is the number of episodes, $A$ denotes the (finite) set of actions, and $L$ is the length of the longest path in the graph. Variants of the algorithm are given that improve the dependence on the transition probabilities under specific conditions. The results are also extended to variations of the problem, including the case when the agent competes with time varying policies.}, Author = {Neu, G. and Gy{\"o}rgy, A. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {COLT}, Date = {2010-06}, Date-Added = {2010-08-28 20:45:00 -0600}, Date-Modified = {2010-11-25 00:57:40 -0700}, Keywords = {online learning, adversarial setting, finite MDPs, shortest path problem}, Month = {June}, Pages = {231--243}, Pdf = {papers/ssp_col_final.pdf}, Title = {The Online Loop-free Stochastic Shortest-Path Problem}, Year = {2010}} @inproceedings{gyorgy2007, Abstract = {In this paper we consider an extension of the multi-armed bandit problem. In this generalized setting, the decision maker receives some side information, performs an action chosen from a finite set and then receives a reward. Unlike in the standard bandit settings, performing an action takes a random period of time. The environment is assumed to be stationary, stochastic and memoryless. The goal is to maximize the average reward received in one unit time, that is, to maximize the average rate of return. We consider the on-line learning problem where the decision maker initially does not know anything about the environment but must learn about it by trial and error. We propose an ``upper confidence bound''-style algorithm that exploits the structure of the problem. We show that the regret of this algorithm relative to the optimal algorithm that has perfect knowledge about the problem grows at the optimal logarithmic rate in the number of decisions and scales polynomially with the parameters of the problem.}, Author = {Gy{\"o}rgy, A. and Kocsis, L. and Szab{\'o}, I. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {IJCAI}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:53:39 -0700}, Keywords = {bandits, multi-armed bandits, semi-Markov decision processes, average payoff, theory}, Pages = {830--835}, Pdf = {papers/cbandit-ijcai07.pdf}, Title = {Continuous Time Associative Bandit Problems}, Year = {2007}} @techreport{gabor1998, Abstract = {We consider multi-criteria sequential decision making problems where the vector-valued evaluations are compared by a given, fixed total ordering. Conditions for the optimality of stationary policies and the Bellman optimality equation are given for a special, but important class of problems when the evaluation of policies can be computed for the criteria independently of each other. The analysis requires special care as the topology introduced by pointwise convergence and the order-topology introduced by the preference order are in general incompatible. Reinforcement learning algorithms are proposed and analyzed. Preliminary computer experiments confirm the validity of the derived algorithms. These type of multi-criteria problems are most useful when there are several optimal solutions to a problem and one wants to choose the one among these which is optimal according to another fixed criterion. Possible application in robotics and repeated games are outlined.}, Address = {Szeged, HU-6700}, Author = {G{\'a}bor, Z. and Kalm{\'a}r, {Zs}. and Szepesv{\'a}ri, {Cs}.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Institution = {``Attila J{\'o}zsef'' University, Research Group on Artificial Intelligence, JATE-MTA}, Keywords = {game playing, constrained MDPs, reinforcement learning, theory}, Note = {Revised on 05/04/2004}, Number = {TR-98-115}, Pdf = {papers/multi-rep97.ps.pdf}, Title = {Multi-criteria Reinforcement Learning}, Year = {1998}} @techreport{j.murvai1997, Abstract = {We propose a neural net solution for the recognition of the domain types of proteins, which is a hard and important problem in biology. We have found that using a clever preprocessing technique relatively small neural networks perform surprisingly well. The performances of the neural nets were measured by cross-validation and Hoeffding's inequality was utilized for the estimation of a confidence interval of the estimates.}, Address = {Szeged, HU-6700}, Author = {Murvai, J. and Szepesv{\'a}ri, {Cs}. and Bachrati, Cs. and Pongor, S.}, Date-Modified = {2010-09-02 13:09:16 -0600}, Institution = {``Attila J{\'o}zsef'' University, Research Group on Artificial Intelligence}, Keywords = {bioinformatics, domain prediction, neural networks, application}, Number = {TR-98-117}, Owner = {Beata}, Pdf = {papers/RGAI-98-117.ps.pdf}, Timestamp = {2010.08.31}, Title = {Prediction of Protein Domain-Types by Backpropagation}, Year = {1997}} @article{j.murvai1999, Author = {Murvai, J. and Barta, E. and Vlahovicek, K. and Szepesv{\'a}ri, {Cs}. and Acatrinei, C. and Pongor, S.}, Date-Modified = {2010-09-02 13:09:16 -0600}, Journal = {Nucleic Acids Research}, Keywords = {bioinformatics, application}, Number = {1}, Owner = {Beata}, Pages = {257--259}, Timestamp = {2010.08.31}, Title = {The {SBASE} Protein Domain Sequence Library Release 6.0.}, Volume = {27}, Year = {1999}} @article{j.murvai2001, Abstract = {An artificial neural network (ANN) solution is described for the recognition of domains in protein sequences. A query sequence is first compared to a reference database of domain sequences by use of and the output data, encoded in the form of six parameters, are forwarded to feed-forward artificial neural networks with six input and six hidden units with sigmoidal transfer function. The recognition is based on the distribution of scores precomputed for the known domain groups in a database versus database comparison. Applications to the prediction of function are discussed.}, Author = {Murvai, J. and Vlahovicek, K. and Szepesv{\'a}ri, {Cs}. and Pongor, S.}, Date-Modified = {2010-09-02 13:09:16 -0600}, Html = {http://www.genome.org/cgi/reprint/11/8/1410}, Journal = {Genome Research}, Keywords = {bioinformatics, domain prediction, neural networks, application}, Number = {8}, Owner = {Beata}, Pages = {1410--1417}, Pdf = {papers/protdompred.pdf}, Timestamp = {2010.08.31}, Title = {Prediction of Protein Functional Domains from Sequences Using Artificial Neural Networks}, Url = {http://www.genome.org/cgi/reprint/11/8/1410.pdf}, Volume = {11}, Year = {2001}, Bdsk-Url-1 = {http://www.genome.org/cgi/reprint/11/8/1410.pdf}} @inproceedings{k.kornel2005, Abstract = {Face recognition is a highly non-trivial classification problem since the input is high-dimensional and there are many classes with just a few examples per class. In this paper we propose using a recent algorithm -- Maximum Margin Discriminant Analysis (MMDA) -- to solve face recognition problems. MMDA is a feature extraction method that is derived from a set of sound principles: (i) each feature should maximize information transmission about the classification labels, (ii) only the decision boundary should determine the features and (iii) features should reveal independent information about the class labels. Previously, MMDA was shown to yield good performance scores on a number of standard benchmark problems. Here we show that MMDA is capable of finding good features in face recognition and performs very well provided it is preceded by an appropriate preprocessing phase. }, Author = {Korn{\'e}l, K. and Kocsor, A. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {HACIPPR-2005 (Joint Hungarian-Austrian Conference on Image Processing and Pattern Recognition)}, Date-Modified = {2010-09-02 13:09:16 -0600}, Keywords = {vision, application, supervised feature extraction, supervised learning, classification}, Owner = {Beata}, Pages = {1-- 2}, Pdf = {papers/mmda-hacippr2005.pdf}, Timestamp = {2010.08.31}, Title = {Maximum Margin Discriminant Analysis based Face Recognition}, Year = {2005}} @article{kalmar1999, Abstract = {A massively parallel neural architecture is suggested for the approximate computation of the skeleton of a planar shape. Numerical examples demonstrate the robustness of the method. The architecture is constructed from self-organizing elements that allow the extension of the concept of skeletonization to areas remote to image processing.}, Author = {Kalm{\'a}r, {Zs}. and Marczell, {Zs}. and Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-04 12:23:43 -0600}, Doi = {10.1016/S0893-6080(98)00119-1}, Journal = {Neural Networks}, Keywords = {neural networks, image processing, application}, Pages = {163--173}, Pdf = {papers/marczell_SKELETON.pdf}, Title = {Parallel and Robust Skeletonization Built on Self-organizing Elements}, Volume = {12}, Year = {1999}, Bdsk-Url-1 = {http://dx.doi.org/10.1016/S0893-6080(98)00119-1}} @inproceedings{kalmar1997, Abstract = {The behaviour of reinforcement learning (RL) algorithms is best understood in completely observable, finite state- and action-space, discrete-time controlled Markov-chains. Robot-learning domains, on the other hand, are inherently infinite both in time and space, and moreover they are only partially observable. In this article we suggest a systematic method whose motivation comes from the desire to transform the task-to-be-solved into a finite-state, discrete-time, ``approximately'' Markovian task, which is completely observable too. The key idea is to break up the problem into subtasks and design controllers for each of the subtasks. Then operating conditions are attached to the controllers (together the controllers and their operating conditions which are called modules) and possible additional features are designed to facilitate observability. A new discrete time-counter is introduced at the ``module-level'' that clicks only when a change in the value of one of the features is observed. The approach was tried out on a real-life robot. Several RL algorithms were compared and it was found that a model-based approach worked best. The learnt switching strategy performed equally well as a handcrafted version. Moreover, the learnt strategy seemed to exploit certain properties of the environment which could not have been seen in advance, which predicted the promising possibility that a learnt controller might outperform a handcrafted switching strategy in the future.}, Author = {Kalm{\'a}r, {Zs}. and Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Booktitle = {Proceedings of the 6th European Workshop on Learning Robots}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Keywords = {robotics, application, hierarchical reinforcement learning, reinforcement learning, macro learning, theory}, Pages = {22--32}, Pdf = {papers/ewlr97.ps.pdf}, Title = {Module Based Reinforcement Learning for a Real Robot}, Year = {1997}} @article{kalmar1995a, Abstract = {A model of adaptive autonomous agents, that (i) builds internal representation of events an d event relations, (ii) utilizes activation spreading for building dynamic concepts and (iii) makes use of the winner-take-all paradigm to come to a decision is extended by introducing generalization into the model. The generalization reduces memory requirements and improves performance in unseen scenes as it is indicated by computer simulations.}, Author = {Kalm{\'a}r, {Zs}. and Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Journal = {Neural Network World}, Keywords = {reinforcement learning, theory}, Pages = {353--360}, Pdf = {papers/kalmar.dcmg.ps.pdf}, Title = {Generalized {D}ynamic {C}oncept {M}odel as a Route to Construct Adaptive Autonomous Agents}, Volume = {3}, Year = {1995}} @inproceedings{kalmar1994, Abstract = {In this article we present an extension of a previously defined model [8]. This model was introduced to govern an agent in a goal-oriented fashion in a previously unknown environment. The extension allows generalization in the input space, which reduces memory requirements as well as the time requirements of the algorithm.}, Address = {Orlando, Florida}, Author = {Kalm{\'a}r, {Zs}. and Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Booktitle = {Proc. of IEEE WCCI ICNN'94}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-04 14:51:53 -0600}, Keywords = {agent architecture, reinforcement learning}, Month = {June}, Pages = {1815--1817}, Pdf = {papers/kalmar.gen.pdf}, Publisher = {IEEE Inc.}, Title = {Generalization in an Autonomous Agent}, Url = {http://ieeexplore.ieee.org/iel2/3013/8558/00374432.pdf?arnumber=374432}, Volume = {3}, Year = {1994}, Bdsk-Url-1 = {http://ieeexplore.ieee.org/iel2/3013/8558/00374432.pdf?arnumber=374432}} @techreport{kalmarsze1999, Abstract = {It is known that a well-chosen set of macros makes it possible to considerably speed-up the solution of planning problems. Recently, macros have been considered in the planning framework, built on Markovian decision problem. However, so far no systematic approach was put forth to investigate the utility of macros within this framework. In this article we begin to systematically study this problem by introducing the concept of multi-task MDPs defined with a distribution over the tasks. We propose an evaluation criterion for macro-sets that is based on the expected planning speed-up due to the usage of a macro-set, where the expectation is taken over the set of tasks. The consistency of the empirical speed-up maximization algorithm is shown in the finite case. For acyclic systems, the expected planning speed-up is shown to be proportional to the amount of ``time-compression'' due to the macros. Based on these observations a heuristic algorithm for learning of macros is proposed. The algorithm is shown to return macros identical with those that one would like to design by hand in the case of a particular navigation like multi-task MDP. Some related questions, in particular the problem of breaking up MDPs into multiple tasks, factorizing MDPs and learning generalizations over actions to enhance the amount of transfer are also considered in brief at the end of the paper.}, Address = {Budapest 1121, Konkoly Th. M. u. 29-33, HUNGARY}, Author = {Kalm{\'a}r, {Zs}. and Szepesv{\'a}ri, {Cs}.}, Date-Modified = {2010-09-02 13:09:15 -0600}, Institution = {Mindmaker Ltd.}, Keywords = {macro learning, reinforcement learning, lifelong learning, multitask learning}, Number = {TR-99-01}, Owner = {Beata}, Pdf = {papers/macro-tr99-01.ps.pdf}, Timestamp = {2010.08.30}, Title = {An Evaluation Criterion for Macro-Learning and Some Results}, Year = {1999}} @inproceedings{kocsis2005, Abstract = {A natural way to compare learning methods in non-stationary environments is to compare their regret. In this paper we consider the regret of algorithms in adversarial multi-armed bandit problems. We propose several methods to improve the performance of the baseline exponentially weighted average forecaster by changing the payoff-estimation methods. We argue that improved performance can be achieved by constructing payoff estimation methods that produce estimates with low variance. Our arguments are backed up by both theoretical and empirical results. In fact, our empirical results show that significant performance gains are possible over the baseline algorithm.}, Author = {Kocsis, L. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {Proceedings of the ECML-2005 Workshop on Reinforcement Learning in Non-Stationary Environments}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Keywords = {online learning, adversarial setting, bandits}, Pdf = {papers/kocsis-ecml2005-ext.pdf}, Slide-Handout = {talks/ecml2005-rlw-handout.pdf}, Slides = {talks/ECML2005-rlw-slides.pdf}, Title = {Reduced-Variance Payoff Estimation in Adversarial Bandit Problems}, Year = {2005}} @article{kocsis2006, Abstract = {[..] The goal of this paper is twofold: (i) to introduce SPSA for the game programming community by putting it into a game-programming perspective, and (ii) to propose and discuss several methods that can be used to enhance the performance of SPSA. These methods include using common random numbers and antithetic variables, a combination of SPSA with RPROP, and the reuse of samples of previous performance evaluations. SPSA with the proposed enhancements was tested in some large-scale experiments on tuning the parameters of an opponent model, a policy and an evaluation function in our poker program, McRAISE. Whilst SPSA with no enhancements failed to make progress using the allocated resources, SPSA with the enhancements proved to be competitive with other methods, including TD-learning; increasing the average payor per game by as large as 0.19 times the size of the amount of the small bet. From the experimental study, we conclude that the use of an appropriately enhanced variant of SPSA for the optimisation of game program parameters is a viable approach, especially if no good alternative exist for the types of parameters considered.}, Author = {Kocsis, L. and Szepesv{\'a}ri, {Cs}.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Doi = {10.1007/s10994-006-6888-8}, Journal = {Machine Learning Journal}, Keywords = {SPSA, game playing, poker, Monte-Carlo methods, application, theory}, Pages = {249--286}, Pdf = {papers/mlj2005.pdf}, Title = {Universal Parameter Optimisation in Games Based on {SPSA}}, Volume = {63}, Year = {2006}, Bdsk-Url-1 = {http://dx.doi.org/10.1007/s10994-006-6888-8}} @inproceedings{l.gerencser2005, Abstract = {P. Algoet and T. Cover characterized log-optimal portfolios in a stationary market without friction. There is no analogous result for markets with friction, of which a currency market is a typical example. In this paper we restrict ourselves to simple static strategies. The problem is then reduced to the analysis of products of random matrices, the top-Lyapunov exponent giving the growth rate. New insights to products of random matrices will be given and an algorithm for optimizing top-Lyapunov exponents will be presented together with some key steps of its analysis. Simulation results will also be given. [..]}, Author = {Gerencs{\'e}r, L. and R{\'a}sonyi, M. and Szepesv{\'a}ri, {Cs}. and V{\'a}g{\'o}, Zs.}, Booktitle = {CDC}, Date-Modified = {2010-11-25 00:56:15 -0700}, Keywords = {finance, control, gradient algorithm}, Owner = {Beata}, Pages = {1764--1769}, Pdf = {papers/cdc2005.pdf}, Timestamp = {2010.08.30}, Title = {Log-optimal Currency Portfolios and Control {L}yapunov Exponents}, Year = {2005}} @inproceedings{l.kocsis2006, Abstract = {Most game programs have a large number of parameters that are crucial for their performance. While tuning these parameters by hand is rather difficult, successful applications of automatic optimisation algorithms in game programs are known only for parameters that belong to certain components (e.g. evaluation-function parameters). The SPSA (Simultaneous Perturbation Stochastic Approximation) algorithm is an attractive choice for optimising any kind of parameters of a game program, both for its generality and its simplicity. It's disadvantage is that it can be very slow. In this article we propose several methods to speed up SPSA, in particular, the combination with RPROP, using common random numbers, antithetic variables and averaging. We test the resulting algorithm for tuning various types of parameters in two domains, poker and LOA. From the experimental study, we conclude that using SPSA is a viable approach for optimisation in game programs, especially if no good alternative exists for the types of parameters considered.}, Author = {Kocsis, L. and Szepesv{\'a}ri, {Cs}. and Winands, M.H.M.}, Booktitle = {ACG}, Date-Modified = {2010-11-25 00:59:25 -0700}, Keywords = {SPSA, game playing, poker, Monte-Carlo methods, application, theory}, Owner = {Beata}, Pages = {1-- 2}, Pdf = {papers/rspsa_acg.pdf}, Timestamp = {2010.08.30}, Title = {{RSPSA}: Enhanced Parameter Optimisation in Games}, Year = {2006}} @inproceedings{li2009, Abstract = {Options are important instruments in modern finance. In this paper, we investigate reinforcement learning (RL) methods---in particular, least-squares policy iteration (LSPI)---for the problem of learning exercise policies for American options. We develop finite-time bounds on the performance of the policy obtained with LSPI and compare LSPI and the fitted Q-iteration algorithm (FQI) with the Longstaff-Schwartz method (LSM), the standard least-squares Monte Carlo algorithm from the finance community. Our empirical results show that the exercise policies discovered by LSPI and FQI gain larger payoffs than those discovered by LSM, on both real and synthetic data. Furthermore, we find that for all methods the policies learned from real data generally gain similar payoffs to the policies learned from simulated data. Our work shows that solution methods developed in machine learning can advance the state-of-the-art in an important and challenging application area, while demonstrating that computational finance remains a promising area for future applications of machine learning methods.}, Author = {Li, Y. and Szepesv{\'a}ri, {Cs}. and Schuurmans, D.}, Booktitle = {AISTAT}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:54:06 -0700}, Keywords = {finance, reinforcement learning, theory, application}, Pages = {352--359}, Pdf = {http://jmlr.csail.mit.edu/proceedings/papers/v5/li09d/li09d.pdf}, Title = {Learning Exercise Policies for {A}merican Options}, Url = {http://www.ics.uci.edu/~aistats/}, Volume = {5}, Year = {2009}, Bdsk-Url-1 = {http://www.ics.uci.edu/~aistats/}} @inproceedings{littman1996, Author = {Littman, M.L. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ICML}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:15 -0600}, Keywords = {reinforcement learning, theory, asymptotic convergence, finite MDPs, stochastic approximation}, Pages = {310--318}, Pdf = {papers/ml96.ps.pdf}, Title = {A Generalized Reinforcement Learning Model: Convergence and Applications}, Year = {1996}} @inproceedings{m.french1997a, Abstract = {We consider systems satisfying a matching condition which are functionally known up to a L2 measure of uncertainty. A modified L2 performance measure is given, and the performance of a class of model based adaptive controllers is studied. An upper performance bound is derived in terms of the uncertainty measure and measures of the approximation error of the model. Asymptotic analyses of the bounds under increasing model size are undertaken, and sufficient conditions are given on the model that ensure the performance bounds are bounded independent of the model size.}, Address = {San Diego, California}, Author = {French, M.C. and Szepesv{\'a}ri, {Cs}. and Rogers, E.}, Booktitle = {CDC}, Date-Modified = {2010-11-25 00:56:23 -0700}, Keywords = {adaptive control, performance bounds, Lyapunov design, strict feedback, chain of integrators, matched uncertainty, theory, control, nonparametrics, stabilization}, Owner = {Beata}, Pages = {3046 - 3051}, Pdf = {papers/cdc97.ps.pdf}, Timestamp = {2010.08.31}, Title = {Uncertainty, Performance and Model Dependency in Approximate Adaptive Nonlinear Control}, Volume = {3}, Year = {1997}} @inproceedings{maei2009, Abstract = {We introduce the first temporal-difference learning algorithms that converge with smooth value function approximators, such as neural networks. Conventional temporal-difference (TD) methods, such as TD(lambda), Q-learning and Sarsa have been used successfully with function approximation in many applications. However, it is well known that off-policy sampling, as well as non-linear function approximation, can cause these algorithms to become unstable (i.e., the parameters of the approximator may diverge). Sutton et al (2009a,b) solved the problem of off-policy learning with linear TD algorithms by introducing a new objective function, related to the Bellman-error, and algorithms that perform stochastic gradient-descent on this function. In this paper, we generalize their work to non-linear function approximation. We present a Bellman error objective function and two gradient-descent TD algorithms that optimize it. We prove the asymptotic almost-sure convergence of both algorithms for any finite Markov decision process and any smooth value function approximator, under usual stochastic approximation conditions. The computational complexity per iteration scales linearly with the number of parameters of the approximator. The algorithms are incremental and are guaranteed to converge to locally optimal solutions.}, Author = {Maei, H.R. and Szepesv{\'a}ri, {Cs}. and Bhatnagar, S. and Silver, D. and Precup, D. and Sutton, R.S.}, Booktitle = {NIPS}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:51:13 -0700}, Keywords = {reinforcement learning, prediction, online learning, gradient algorithm, stochastic approximation, theory, neural networks, nonlinear function approximation, GTD}, Pages = {1204--1212}, Pdf = {papers/nonlin_gtdnips09-2.pdf}, Title = {Convergent Temporal-Difference Learning with Arbitrary Smooth Function Approximation}, Year = {2009}} @inproceedings{munos2005, Abstract = {In this paper we consider sampling based fitted value iteration for discounted, large (possibly infinite) state space, finite action Markovian Decision Problems where only a generative model of the transition probabilities and rewards is available. At each step the image of the current estimate of the optimal value function under a Monte-Carlo approximation to the Bellman-operator is projected onto some function space. PAC-style bounds on the weighted $L^p$-norm approximation error are obtained as a function of the covering number and the approximation power of the function space, the iteration number and the sample size.}, Author = {Munos, R. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ICML}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:49:47 -0700}, Keywords = {batch learning, reinforcement learning, theory, performance bounds, nonparametrics}, Pages = {881---886}, Pdf = {papers/savi_icml2005.pdf}, Presentation = {talks/icml2005_talk.pdf}, Title = {Finite Time Bounds for Sampling Based Fitted Value Iteration}, Year = {2005}} @article{munos2008, Abstract = {This is the longer version of the ICML'2005 paper with all the proofs and some extra material. In this paper we develop a theoretical analysis of the performance of sampling-based fitted value iteration (FVI) to solve infinite state-space, discounted-reward Markovian decision processes (MDPs) under the assumption that a generative model of the environment is available. Our main results come in the form of finite-time bounds on the performance of two versions of sampling-based FVI.The convergence rate results obtained allow us to show that both versions of FVI are well behaving in the sense that by using a sufficiently large number of samples for a large class of MDPs, arbitrary good performance can be achieved with high probability.An important feature of our proof technique is that it permits the study of weighted $L^p$-norm performance bounds. As a result, our technique applies to a large class of function-approximation methods (e.g., neural networks, adaptive regression trees, kernel machines, locally weighted learning), and our bounds scale well with the effective horizon of the MDP. The bounds show a dependence on the stochastic stability properties of the MDP: they scale with the discounted-average concentrability of the future-state distributions. They also depend on a new measure of the approximation power of the function space, the inherent Bellman residual, which reflects how well the function space is ``aligned'' with the dynamics and rewards of the MDP.The conditions of the main result, as well as the concepts introduced in the analysis, are extensively discussed and compared to previous theoretical results.Numerical experiments are used to substantiate the theoretical findings.}, Author = {Munos, R. and Szepesv{\'a}ri, {Cs}.}, Date-Modified = {2010-09-02 13:09:16 -0600}, Journal = {JMLR}, Keywords = {batch learning, reinforcement learning, theory, performance bounds, nonparametrics}, Owner = {Beata}, Pages = {815--857}, Pdf = {papers/munos08a.pdf}, Timestamp = {2010.08.30}, Title = {Finite Time Bounds for Fitted Value Iteration}, Url = {http://www.jmlr.org/papers/volume9/munos08a/munos08a.pdf}, Volume = {9}, Year = {2008}, Bdsk-Url-1 = {http://www.jmlr.org/papers/volume9/munos08a/munos08a.pdf}} @inproceedings{neu2007, Abstract = {In this paper we propose a novel gradient algorithm to learn a policy from an expert's observed behavior assuming that the expert behaves optimally with respect to some unknown reward function of a Markovian Decision Problem. The algorithm's aim is to find a reward function such that the resulting optimal policy matches well the expert's observed behavior. The main difficulty is that the mapping from the parameters to policies is both nonsmooth and highly redundant. Resorting to subdifferentials solves the first difficulty, while the second one is overcome by computing natural gradients. We tested the proposed method in two artificial domains and found it to be more reliable and efficient than some previous methods.}, Author = {Neu, G. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {UAI}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:54:55 -0700}, Keywords = {theory, application, reinforcement learning, apprenticeship learning, natural gradient, inverse reinforcement learning}, Pages = {295--302}, Pdf = {papers/uai2007-irl.pdf}, Title = {Apprenticeship Learning using Inverse Reinforcement Learning and Gradient Methods}, Year = {2007}} @article{neu2009, Abstract = {One major idea in structured prediction is to assume that the predictor computes its output by finding the maximum of a score function. The training of such a predictor can then be cast as the problem of finding weights of the score function so that the output of the predictor on the inputs matches the corresponding structured labels on the training set. A similar problem is studied in inverse reinforcement learning (IRL) where one is given an environment and a set of trajectories and the problem is to find a reward function such that an agent acting optimally with respect to the reward function would follow trajectories that match those in the training set. In this paper we show how IRL algorithms can be applied to structured prediction, in particular to parser training. We present a number of recent incremental IRL algorithms in a unified framework and map them to parser training algorithms. This allows us to recover some existing parser training algorithms, as well as to obtain a new one. The resulting algorithms are compared in terms of their sensitivity to the choice of various parameters and generalization ability on the Penn Treebank WSJ corpus.}, Author = {Neu, G. and Szepesv{\'a}ri, {Cs}.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Journal = {Machine Learning}, Keywords = {theory, application, reinforcement learning, apprenticeship learning, natural gradient, structured prediction, inverse reinforcement learning, survey}, Pages = {303--337}, Pdf = {papers/MLJ-SISP-09.pdf}, Read = {1}, Title = {Training Parsers by Inverse Reinforcement Learning}, Volume = {77}, Year = {2009}} @inproceedings{olah1994, Abstract = {Nowadays artificial neural networks (ANNs) receive an increasing attention. However recent computer architectures do not allow yet the implementation of large ANNs. Thus it is an important question to examine how the learning time of ANNs scales respect to their size (and/or with the size of the tasks). Judd has introduced a computational framework for the learning problem (J. Judd, ``Neural Network Design and the Complexity of Learning.'' A Bradford Book, MIT Press, Cambridge, 1990) and proved, that learning in neural networks in general is too hard, i.e. in the worst case learning in neural networks is NP-complete. However, in his proof he restricts the domain of neural network architectures and tasks in such a way, that ``everyday'' neural network architectures, such as the one of the back-propagation algorithm, are excluded. Consequently Judd's proof does not tell anything for these types of networks. First we outline a thorough framework for loading. The framework enables to differentiate between loading problems at a finer level. Two theorems are presented about the complexity of learning for ``everyday'' ANN architectures. The first theorem says, that for extended binary tasks and in the worst-case, the loading problem is NP-complete, while the second says, that for binary tasks and basis LUF there exists a polynomial time algorithm. From these results it seems that the loading problem for ``everyday'' neural network is interesting from the mathematical point of view as it lies on the boundary of efficiently solvable problems.}, Address = {Orlando, Florida}, Author = {Ol{\'a}h, B. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {Proceedings of IEEE WCCI ICNN'94}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:15 -0600}, Keywords = {complexity analysis, NP-completeness, neural networks, loading problem, theory}, Month = {June}, Pages = {61--65}, Pdf = {papers/icnn94.ps.pdf}, Title = {Complexity of Learning: the Case of Everyday Neural Networks}, Volume = {1}, Year = {1994}} @inproceedings{poczos2009, Abstract = {An anytime algorithm is capable of returning a response to the given task at essentially any time; typically the quality of the response improves as the time increases. Here, we consider the challenge of learning when we should terminate such algorithms on each of a sequence of iid tasks, to optimize the expected average reward per unit time. We provide a system for addressing this challenge, which combines the global optimizer Cross- Entropy method with local gradient ascent. This paper theoretically investigates how far the estimated gradient is from the true gradient, then empirically demonstrates that this system is effective by applying it to a toy problem, as well as on a real-world face detection task.}, Author = {P{\'o}czos, B. and Abbasi-Yadkori, Y. and Szepesv{\'a}ri, {Cs}. and Greiner, R. and Sturtevant, N.}, Booktitle = {ICML}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:50:01 -0700}, Keywords = {reinforcement learning, application, pondering, gradient algorithm, REINFORCE, Cross-Entropy search}, Pages = {825--832}, Pdf = {papers/time_is_money-ICML09.pdf}, Title = {Learning When to Stop Thinking and Do Something!}, Year = {2009}} @inproceedings{poczos2010, Abstract = {We propose a new method for a non-parametric estimation of R{\'e}nyi and Shannon information for a multivariate distribution using a corresponding copula, a multivariate distribution over normalized ranks of the data. As the information of the distribution is the same as the negative entropy of its copula, our method estimates this information by solving a Euclidean graph optimization problem on the empirical estimate of the distribution's copula. Owing to the properties of the copula, we show that the resulting estimator of R{\'e}nyi information is strongly consistent and robust. Further, we demonstrate its applicability in image registration in addition to simulated experiments.}, Author = {P{\'o}czos, B. and Kirshner, S. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {AISTAT}, Date = {2010-05}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:53:26 -0700}, Keywords = {information theory, theory, mutual information}, Month = {May}, Pages = {852--859}, Pdf = {papers/information_AIstat_v4.pdf}, Title = {{REGO}: Rank-based Estimation of {R}{\'e}nyi Information using {E}uclidean Graph Optimization}, Volume = {9}, Year = {2010}} @inproceedings{ribeiro1996, Address = {Tehran, Iran}, Author = {Ribeiro, C. H. C. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {Proceedings of ISRF-IEE International Conference: Intelligent and Cognitive Systems, Neural Networks Symposium}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Keywords = {reinforcement learning, theory, asymptotic convergence}, Pages = {32--36}, Title = {{Q}-learning Combined with Spreading: Convergence and Results}, Year = {1996}} @article{singh2000, Abstract = {An important application of reinforcement learning (RL) is to finite-state control problems and one of the most difficult problems in learning for control is balancing the exploration/exploitation tradeoff. Existing theoretical results for RL give very little guidance on reasonable ways to perform exploration. In this paper, we examine the convergence of single-step on-policy RL algorithms for control. On-policy algorithms cannot separate exploration from learning and therefore must confront the exploration problem directly. We prove convergence results for several related on-policy algorithms with both decaying exploration and persistent exploration. We also provide examples of exploration strategies that can be followed during learning that result in convergence to both optimal values and optimal policies.}, Author = {Singh, S. P. and Jaakkola, T. and Littman, M.L. and Szepesv{\'a}ri, {Cs}.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:15 -0600}, Doi = {10.1023/A:1007678930559}, Journal = {Machine Learning}, Keywords = {theory, reinforcement learning, asymptotic convergence, finite MDPs, stochastic approximation}, Number = {3}, Pages = {287--308}, Pdf = {papers/singh98convergence.pdf}, Title = {Convergence Results for Single-Step On-Policy Reinforcement-Learning Algorithms}, Volume = {38}, Year = {2000}, Bdsk-Url-1 = {http://dx.doi.org/10.1023/A:1007678930559}} @inproceedings{sorantin1998, Abstract = {The goal of this study was to develop a fully automated computer application for detection and classification of clustered mc using artificial neural nets (ANN's). Cases where additional investigations are necessary should be identified automatically, too.}, Author = {Sorantin, E. and Schmidt, F. and Mayer, H. and Winkler, P. and Szepesv{\'a}ri, {Cs}. and Graif, E. and Schuetz, E.}, Booktitle = {4th International Workshop on Digital Mammography}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:22 -0600}, Keywords = {application, neural networks, image processing, health informatics, clinical decision support}, Title = {Automated Detection and Classification of Microcalcifications in Mammograms using Artificial Neural Nets}, Url = {http://www.azn.nl/rrng/xray/digmam/iwdm98}, Year = {1998}, Bdsk-Url-1 = {http://www.azn.nl/rrng/xray/digmam/iwdm98}} @inproceedings{sutton2009, Abstract = {Sutton, Szepesvari and Maei (2009) recently introduced the first temporal-difference learning algorithm compatible with both linear function approximation and off-policy training, and whose complexity scales only linearly in the size of the function approximator. Although their gradient temporal difference (GTD) algorithm converges reliably, it can be very slow compared to conventional linear TD (on on-policy problems where TD is convergent), calling into question its practical utility. In this paper we introduce two new related algorithms with better convergence rates. The first algorithm, GTD2, is derived and proved convergent just as GTD was, but uses a different objective function and converges significantly faster (but still not as fast as conventional TD). The second new algorithm, linear TD with gradient correction, or TDC, uses the same update rule as conventional TD except for an additional term which is initially zero. In our experiments on small test problems and in a Computer Go application with a million features, the learning rate of this algorithm was comparable to that of conventional TD. This algorithm appears to extend linear TD to off-policy learning with no penalty in performance while only doubling computational requirements.}, Author = {Sutton, R.S. and Maei, H.R. and Precup, D. and Bhatnagar, S. and Silver, D. and Szepesv{\'a}ri, {Cs}. and Wiewiora, E.}, Booktitle = {ICML}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:50:04 -0700}, Keywords = {reinforcement learning, prediction, online learning, gradient algorithm, stochastic approximation, theory, function approximation, GTD2, TDC}, Pages = {993--1000}, Pdf = {papers/GTD-ICML09.pdf}, Title = {Fast Gradient-Descent Methods for Temporal-Difference Learning with Linear Function Approximation}, Year = {2009}} @techreport{szepesvari1996h, Abstract = {Reinforcement learning is the process by which an autonomous agent uses its experience interacting with an environment to improve its behavior. The Markov decision process (MDP) model is a popular way of formalizing the reinforcement-learning problem, but it is by no means the only way. In this paper, we show how many of the important theoretical results concerning reinforcement learning in MDPs extend to a generalized MDP model that includes MDPs, two-player games and MDPs under a worst-case optimality criterion as special cases. The basis of this extension is a stochastic-approximation theorem that reduces asynchronous convergence to synchronous convergence.}, Address = {Providence, RI}, Author = {Szepesv{\'a}ri, {Cs}. and Littman, M.L.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Institution = {Brown University, Department of Computer Science}, Keywords = {reinforcement learning, theory, asymptotic convergence, finite MDPs, stochastic approximation}, Month = {November}, Number = {CS-96-11}, Pdf = {papers/gmdp.ps.pdf}, Title = {Generalized {M}arkov Decision Processes: Dynamic-programming and reinforcement-learning algorithms}, Year = {1996}} @inproceedings{szepesvari1998c, Abstract = {We consider reinforcement learning methods for the solution of complex sequential optimization problems. In particular, the soundness of two methods proposed for the solution of partially observable problems will be shown. The first method suggests a state-estimation scheme and requires mild {\em a priori} knowledge, while the second method assumes that a significant amount of abstract knowledge is available about the decision problem and uses this knowledge to setup a macro-hierarchy to turn the partially observable problem into another one which can already be handled using methods worked out for observable problems. This second method is also illustrated with some experiments on a real-robot.}, Author = {Szepesv{\'a}ri, {Cs}.}, Booktitle = {Proceedings of the 2nd Slovak Conference on Artificial Neural Networks (SCANN'98)}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Editor = {Hrehus, M.}, Keywords = {reinforcement learning, partial information, theory, MDP transformations}, Pages = {29--39}, Pdf = {papers/scann98.ps.pdf}, Title = {Reinforcement Learning: Theory and Practice}, Year = {1998}} @article{szepesvari2001, Abstract = {Monte-Carlo planning algorithms for planning in continuous state-space, discounted Markovian Decision Problems (MDPs) having a smooth transition law and a finite action space are considered. We prove various polynomial complexity results for the considered algorithms, improving upon several known bounds.}, Author = {Szepesv{\'a}ri, {Cs}.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Journal = {{AI} Communications}, Keywords = {continuous-space MDPs, Lipschitz-continuous MDPs, Monte-Carlo methods, performance bounds, theory, curse of dimensionality, complexity analysis}, Number = {3}, Pages = {163--176}, Pdf = {papers/aicom.pdf}, Title = {Efficient Approximate Planning in Continuous Space {M}arkovian Decision Problems}, Volume = {13}, Year = {2001}} @inproceedings{szepesvari1995b, Abstract = {In this article we propose a general framework for sequential decision making. The framework is based on the observation that the derivation of the optimal behaviour under various decision criteria follows the same pattern: the cost of policies can be decomposed into the successive application of an operator that defines the related dynamic programming algorithm and this operator describes completely the structure of the decision problem. We take this mapping (the so called one step lookahead (OLA) cost mapping) as our starting point. This enables the unified treatment of various decision criteria (e.g. the expected value criterion or the worst-case criterion). The main result of this article says that under minimal conditions optimal stationary policies are greedy w.r.t. the optimal cost function and vice versa. Based on this result we feel that former results on reinforcement learning can be transferred to other decision criteria provided that the decision criterion is decomposable by an appropriate mapping.}, Author = {Szepesv{\'a}ri, {Cs}.}, Booktitle = {ICANN}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:58:26 -0700}, Keywords = {reinforcement learning, theory}, Pages = {165--170}, Pdf = {papers/szepes.greinf.ps.pdf}, Title = {General Framework for Reinforcement Learning}, Volume = {2}, Year = {1995}} @article{szepesvari1997c, Abstract = {A common technique in neurocontrol is that of controlling a plant by static state feedback using the plant's inverse dynamics, which is approximated through a learning process. It is well known that in this control mode even small approximation errors or, which is the same, small perturbations of the plant may lead to instability. Here, a novel approach is proposed to overcome the problem of instability by using the inverse dynamics both for the Static and for the error compensating Dynamic State feedback control. This scheme is termed SDS Feedback Control. It is shown that as long as the error of the inverse dynamics model is ``signproper'' the SDS Feedback Control is stable, i.e., the error of tracking may be kept small. The proof is based on a modification of Liapunov's second method. The problem of on-line learning of the inverse dynamics when using the controller simultaneously for both forward control and for dynamic feedback is dealt with, as are questions related to noise sensitivity and robust control of robotic manipulators. Simulations of a simplified sensorimotor loop serve to illustrate the approach.}, Author = {Szepesv{\'a}ri, {Cs}. and Cimmer, {Sz}. and L{\"o}rincz, A.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Journal = {Neural Networks}, Keywords = {neural networks, theory, Lyapunov stability, adaptive control, control}, Pages = {1691--1708}, Pdf = {papers/sds-nn98.ps.pdf}, Title = {Neurocontroller using Dynamic State Feedback for Compensatory Control}, Volume = {10}, Year = {1997}} @incollection{szepesvari1997, Abstract = {It is rigorously shown that inverse-dynamics models can be used to stabilize plants of any order provided that the inverse-dynamic model is used in a mixed mode fashion, in that of a `Static and Dynamic' State-feedback (SDS) mode. When the resulting controller is used for tracking increasing the gain of the dynamic feedback decreases the tracking error. Yet another attractive feature of the SDS scheme is that the inverse-dynamics model can be tuned on-line by {\em any} adaptation mechanism without cancelling stability if the conditions of the non-adaptive stability theorem hold at any time instant. Computer simulations of the control of a chaotic bioreactor and a `realistic' robotic manipulator demonstrate the robustness of the approach. It is shown that SDS control will yield zero asymptotic error when controlling the bioreactor using an inverse-dynamics model which when used in a traditional mode would yield intolerably large errors. In the case of the robotic arm simulations the effects of perturbation and sampling frequency are investigated and the SDS control is compared with the non-adaptive computed torque method. A fully self-organizing associative neural network architecture that can be used to approximate the inverse-dynamics in the form of a Position-and-Direction-to-Action (PDA) map is also described. Similarities between the basal ganglia - thalamocortical loops and the SDS scheme are discussed and it is argued that the SDS scheme could be viewed as a model of higher order motor functions of these areas.}, Author = {Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Booktitle = {Applications of Neural Adaptive Control Technology}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:15 -0600}, Editor = {Kalkkuhl, J. and Hunt, K.J. and Zbikowski, R. and Dzieli{\'n}sky, A.}, Keywords = {control, theory, adaptive control, manipulator control, bioreactor control, neural networks}, Pages = {151--197}, Pdf = {papers/nact97.ps.pdf}, Publisher = {World Scientific, Singapore}, Title = {Approximate Inverse-Dynamics based Robust Control using Static and Dynamic State Feedback}, Year = {1997}} @inproceedings{szepesvari1996b, Abstract = {It is proposed that controllers that approximate the inverse dynamics of the controlled plant can be used for on-line compensation of changes in the plant's dynamics. The idea is to use the very same controller in two modes at the same time: both for static and dynamic feedback. Implications for the learning of neurocontrollers are discussed. The proposed control mode relaxes the demand of precision and as a consequence, controllers that utilise direct associative learning by means of local function approximators may become more tractable in higher dimensional spaces.}, Author = {Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Booktitle = {ICANN}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:58:31 -0700}, Keywords = {control, theory, adaptive control, manipulator control, bioreactor control, neural networks}, Pages = {697--702}, Pdf = {papers/szepes.icann96-fbc.ps.pdf}, Title = {Inverse Dynamics Controllers for Robust Control: Consequences for Neurocontrollers}, Year = {1996}} @inproceedings{szepesvari2004a, Abstract = {In this paper we introduce and study Shortest Path Discovery (SPD) problems, a generalization of shortest path problems: In SPD one is given a directed edge-weighted graph and the task is to find a the shortest path for fixed source and target nodes such that initially the edge-weights are unknown, but they can be queried. Querying the cost of an edge is expensive and hence the goal is to minimize the total number of edge cost queries executed. In this article we characterize some common properties of sound SPD algorithms, propose a particular algorithm that is shown to be sound and effective. Experimental results on real-world OCR task demonstrate the usefulness of the approach whereas the proposed algorithm is shown to yield a substantial speed-up of the recognition process.}, Author = {Szepesv{\'a}ri, {Cs}.}, Booktitle = {AAAI}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 01:01:09 -0700}, Keywords = {online learning, theory, search, application, image processing, shortest path problem}, Pages = {550--555}, Pdf = {papers/szepes-aaai2004.pdf}, Title = {Shortest Path Discovery Problems: A Framework, Algorithms and Experimental Results}, Year = {2004}} @incollection{szepesvari2010, Author = {Szepesv{\'a}ri, {Cs}.}, Booktitle = {Wiley Encyclopedia of Operations Research}, Date = {2010-09}, Date-Added = {2010-08-28 20:43:59 -0600}, Date-Modified = {2010-11-25 00:42:56 -0700}, Keywords = {survey, reinforcement learning, temporal difference learning, stochastic approximation, two-timescale stochastic approximation, Monte-Carlo methods, simulation optimization, function approximation, stochastic gradient methods, least-squares methods, overfitting, bias-variance tradeoff, online learning, active learning, planning, simulation, PAC-learning, Q-learning, actor-critic methods, policy gradient, natural gradient}, Month = {September}, Publisher = {Wiley}, Title = {Reinforcement Learning Algorithms for MDPs}, Year = {2010}} @book{szepesvari2010c, Abstract = {Reinforcement learning is a learning paradigm concerned with learning to control a system so as to maximize a numerical performance measure that expresses a long-term objective. What distinguishes reinforcement learning from supervised learning is that only partial feedback is given to the learner about the learner's predictions. Further, the predictions may have long term effects through influencing the future state of the controlled system. Thus, time plays a special role. The goal in reinforcement learning is to develop efficient learning algorithms, as well as to understand the algorithms' merits and limitations. Reinforcement learning is of great interest because of the large number of practical applications that it can be used to address, ranging from problems in artificial intelligence to operations research or control engineering. In this book we focus on those algorithms of reinforcement learning which build on the powerful theory of dynamic programming. We give a fairly comprehensive catalog of learning problems, describe the core ideas, a large number of state of the art algorithms, followed by the discussion of their theoretical properties and limitations.}, Author = {Szepesv{\'a}ri, {Cs}.}, Date = {2010-07}, Date-Added = {2010-08-28 20:41:24 -0600}, Date-Modified = {2012-06-03 14:07:36 -0600}, Doi = {10.2200/S00268ED1V01Y201005AIM009}, Keywords = {reinforcement learning, temporal difference learning, stochastic approximation, two-timescale stochastic approximation, Monte-Carlo methods, simulation optimization, function approximation, stochastic gradient methods, least-squares methods, overfitting, bias-variance tradeoff, online learning, active learning, planning, simulation, PAC-learning, Q-learning, actor-critic methods, policy gradient, natural gradient}, Month = {July}, Pdf = {papers/RLAlgsInMDPs-lecture.pdf}, Publisher = {Morgan and Claypool}, Title = {Algorithms for Reinforcement Learning}, Url = {http://www.ualberta.ca/~szepesva/RLBook.html}, Year = {2010}, Bdsk-Url-1 = {http://www.ualberta.ca/~szepesva/RLBook.html}} @inproceedings{szepesvari1997h, Abstract = {We show that adaptive real time dynamic programming extended with the action selection strategy which chooses the best action according to the latest estimate of the cost function yields asymptotically optimal policies within finite time under the minimax optimality criterion. From this it follows that learning and exploitation do not conflict under this special optimality criterion. We relate this result to learning optimal strategies in repeated two-player zero-sum deterministic games.}, Author = {Szepesv{\'a}ri, {Cs}.}, Booktitle = {ECML}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:57:05 -0700}, Editor = {Someren, M.{van} and Widmer, G.}, Keywords = {exploration vs. exploitation, reinforcement learning, theory}, Pages = {242--249}, Pdf = {papers/ecml97.ps.pdf}, Publisher = {Springer, Berlin}, Series = {Lecture Notes in Artificial Intelligence}, Title = {Learning and Exploitation do not Conflict Under Minimax Optimality}, Volume = {1224}, Year = {1997}} @misc{szepesvari2010b, Author = {Szepesv{\'a}ri, {Cs}.}, Date = {2010-03}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:43:42 -0700}, Keywords = {survey, multi-armed bandits, exploration vs. exploitation}, Month = {March}, Title = {Some Recent Algorithmic Results about the Exploration-vs-exploitation Dilemma}, Url = {http://rstb.royalsocietypublishing.org/content/362/1481/933.abstract/reply#royptb_el_55}, Year = {2010}, Bdsk-Url-1 = {http://rstb.royalsocietypublishing.org/content/362/1481/933.abstract/reply#royptb_el_55}} @techreport{szepesvari2009, Abstract = {This article presents a survey of reinforcement learning algorithms for Markov Decision Processes (MDP). In the first half of the article, the problem of value estimation is considered. Here we start by describing the idea of bootstrapping and temporal difference learning. Next, we compare incremental and batch algorithmic variants and discuss the impact of the choice of the function approximation method on the success of learning. In the second half, we describe methods that target the problem of learning to control an MDP. Here online and active learning are discussed first, followed by a description of direct and actor-critic methods.}, Author = {Szepesv{\'a}ri, {Cs}.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-03 00:43:23 -0600}, Institution = {Department of Computing Science, University of Alberta}, Keywords = {reinforcement learning, temporal difference learning, stochastic approximation, two-timescale stochastic approximation, Monte-Carlo methods, simulation optimization, function approximation, stochastic gradient methods, least-squares methods, overfitting, bias-variance tradeoff, online learning, active learning, planning, simulation, PAC-learning, Q-learning, actor-critic methods, policy gradient, natural gradient}, Number = {TR09-13}, Pdf = {http://www.cs.ualberta.ca/system/files/tech_report/2009/TR09-13.pdf}, Title = {Reinforcement Learning Algorithms for {MDP}s -- A Survey}, Year = {2009}} @phdthesis{szepesvari1998, Abstract = {Foreword: In this thesis the theory of optimal sequential decisions having a general recursive structure is investigated via an operator theoretical approach, the recursive structure (of both of the dynamics and the optimality criterion) being encoded into the so-called cost propagation operator. Decision problems like Markovian Decision Problems with expected or worst-case total discounted/undiscounted cost criterion; repeated zero-sum games such as Markov-games; or alternating Markov-games all admit such a recursive structure. Our setup has the advantage that it emphasizes their common properties as well as it points out some differences. The thesis consists of two parts, in the first part the model is assumed to be known while in the second one the models are to be explored. The setup of Part I is rather abstract but enables a unified treatment of a large class of sequential decision problems, namely the class when the total cost of decision policies is defined recursively by a so called cost propagation operator. Under natural monotonicity and continuity conditions the greedy policies w.r.t. the optimal cost-to-go function turn out to be optimal, due to the recursive structure. Part II considers the case when the models are unknown, and have to be explored and learnt. The price of considering unknown models is that here we have to restrict ourselves to models with an additive cost structure in order to obtain tractable learning situations. The almost sure convergence of the most frequently used algorithms proposed in the reinforcement learning community is proved. These algorithms are treated as multidimensional asynchronous stochastic approximation schemes and their convergence is deduced from the main theorem of the second part. The key of the method here is the so called rescaling property of certain homogeneous processes. A practical and verifiable sufficient condition for the convergence of on-line learning policies to an optimal policy is formulated and a convergence rate is established.}, Address = {Szeged, Aradi vrt. tere 1, HUNGARY, 6720}, Author = {Szepesv{\'a}ri, {Cs}.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Keywords = {reinforcement learning, theory, asymptotic convergence}, Month = {September}, Pdf = {papers/thesis.ps.pdf}, School = {Bolyai Institute of Mathematics, University of Szeged}, Title = {Static and Dynamic Aspects of Optimal Sequential Decision Making}, Year = {1998}} @article{szepesvari1998a, Abstract = {In this article we prove the validity of the Bellman Optimality Equation and related results for sequential decision problems with a general recursive structure. The characteristic feature of our approach is that also non-Markovian policies are taken into account. The theory is motivated by some experiments with a learning robot.}, Author = {Szepesv{\'a}ri, {Cs}.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Journal = {Acta Cybernetica}, Keywords = {sequential decision making, theory}, Number = {3}, Pages = {305--318}, Pdf = {papers/accyb97.ps.pdf}, Title = {Non-{M}arkovian Policies in Sequential Decision Problems}, Volume = {13}, Year = {1998}} @techreport{szepesvari1996e, Abstract = {We show that cascading Hebbian learning with any other convergent algorithm (called the forward algorithm) results in the convergence of the Hebbian weights to a stationary point where the Hebbian algorithm would converge if the weights of the forward algorithm had already converged. Further, it is shown that the convergence rate of the composite algorithm does not deteriorate because of the cascading. This result is a consequence of a more general theorem which is also stated and proved here, the proofs being based on a global Lipschitzian assumption. The theory is illustrated by a composite PCA-Hebbian architecture introduced by Micheals (Michaels, 1995).}, Address = {Szeged 6720, Aradi vrt tere 1., HUNGARY}, Author = {Szepesv{\'a}ri, {Cs}.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Institution = {Research Group on Artificial Intelligence, JATE-MTA}, Keywords = {stochastic approximation, two-timescale stochastic approximation, neural networks, PCA}, Month = {August}, Note = {e-mail: szepes@math.u-szeged.hu}, Number = {96-102}, Pdf = {papers/TR96-102.pdf}, Title = {Synthesis of Neural Networks: the Case of Cascaded {H}ebbians}, Year = {1996}} @inproceedings{szepesvari1994c, Abstract = {Reinforcement learning is a flourishing field of neural methods. It has a firm theoretical basis and has been proven powerful in many applications. A brain model based alternative to RL has been introduced in the literature: It integrates artificial neural networks (ANN) and knowledge based (KB) systems into one unit or agent for goal oriented problem solving. The agent may possess inherited and learnt ANN and KB subsystems. The agent has and develops ANN cues to the environment for dimensionality reduction in order to ease the problem of combinatorial explosion. A dynamic concept model was forwarded that builds cue-models of the phenomena in the world, designs action sets (concepts) and make them compete in a neural stage to come to a decision. The competition was implemented in the form of activation spreading (AS) and a winner-take-all mechanism. The efficiency of the algorithm has been demonstrated for several examples, however, the optimality of the algorithm have not yet been proven in general. Here, a restriction to Markov decision problems (MDP) shall be treated making possible to show the equivalence of a special AS and RL. The equivalence in this special case means, that DCM has all the advantages of RL, moreover it keeps track of more distinctions allowing faster convergence and generalization.}, Address = {Orlando, Florida}, Author = {Szepesv{\'a}ri, {Cs}.}, Booktitle = {Proc. of IEEE WCCI ICNN'94}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-04 14:50:55 -0600}, Keywords = {reinforcement learning, theory}, Pages = {1738--1742}, Pdf = {papers/szepes.dcmopt.ps.pdf}, Publisher = {IEEE Inc.}, Title = {{D}ynamic {C}oncept {M}odel Learns Optimal Policies}, Volume = {3}, Year = {1994}} @mastersthesis{szepesvari1993d, Address = {Szeged, Hungary}, Author = {Szepesv{\'a}ri, {Cs}.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Keywords = {manifold learning, neural networks, theory}, Note = {in Hungarian}, School = {Attila J{\'o}zsef University of Szeged}, Title = {The Role of Local Connections in Competitive Neural Networks: Collective Learning and Representation of Geometry}, Year = {1993}} @article{szepesvari1994d, Abstract = {It is shown that local, extended objects of a metrical topological space shape the receptive fields of competitive neurons to local filters. Self-organized topology learning is then solved with the help of Hebbian learning together with extended objects that provide unique information about neighborhood relations. A topographical map is deduced and is used to speed up further adaptation in a changing environment with the help of Kohonen type learning that teaches the neighbors of winning neurons as well.}, Author = {Szepesv{\'a}ri, {Cs}. and Bal{\'a}zs, L. and L{\"o}rincz, A.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Journal = {Neural Computation}, Keywords = {manifold learning, neural networks}, Number = {3}, Pages = {441--458}, Pdf = {papers/toplearn94.ps.pdf}, Title = {Topology Learning Solved by Extended Objects: a Neural Network Model}, Volume = {6}, Year = {1994}} @article{szepesvari1999, Abstract = {Reinforcement learning is the problem of generating optimal behavior in a sequential decision-making environment given the opportunity of interacting with it. Many algorithms for solving reinforcement-learning problems work by computing improved estimates of the optimal value function. We extend prior analyses of reinforcement-learning algorithms and present a powerful new theorem that can provide a unified analysis of value-function-based reinforcement-learning algorithms. The usefulness of the theorem lies in how it allows the asynchronous convergence of a complex reinforcement-learning algorithm to be proven by verifying that a simpler synchronous algorithm converges. We illustrate the application of the theorem by analyzing the convergence of Q-learning, model-based reinforcement learning, Q-learning with multi-state updates, Q-learning for Markov games, and risk-sensitive reinforcement learning.}, Author = {Szepesv{\'a}ri, {Cs}. and Littman, M.L.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:15 -0600}, Journal = {Neural Computation}, Keywords = {reinforcement learning, theory, asymptotic convergence, finite MDPs, stochastic approximation}, Pages = {2017--2059}, Pdf = {papers/nc-97-gmdp.ps.pdf}, Title = {A Unified Analysis of Value-Function-Based Reinforcement-Learning Algorithms}, Volume = {11}, Year = {1999}} @inproceedings{szepesvari1993a, Address = {Amsterdam, The Netherlands}, Author = {Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Booktitle = {ICANN}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:58:19 -0700}, Editor = {Gielen, S. and Kappen, B.}, Keywords = {agent architecture, reinforcement learning}, Month = {September}, Publisher = {Springer-Verlag, London}, Title = {Integration of {A}rtificial {N}eural {N}etworks and {D}ynamic {C}oncepts to an Adaptive and Self-organizing Agent}, Year = {1993}} @inproceedings{szepesvari1993c, Address = {Budapest, Hungary}, Author = {Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Booktitle = {3rd Conf. on Artificial Intelligence}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Editor = {Koch, P.}, Keywords = {agent architecture, reinforcement learning}, Month = {April 6--8}, Pages = {231--237}, Publisher = {John von Neumann Society for Computer Sciences}, Title = {Integration of {ANN} Cues, Dynamic {AI} Concepts and ANN Decision System into an Adaptive Self-Organizing Agent}, Year = {1993}} @inproceedings{szepesvari1994a, Abstract = {The geometric learning capabilities of a competitive neural network are studied. It is shown that the appropriate selection of a neural activity function enables the learning of the 3D geometry of a world, from two of the 2D projections of 3D extended objects}, Address = {Sorrento, Italy}, Author = {Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Booktitle = {ICANN}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:58:22 -0700}, Editor = {Marinaro, M. and Morasso, P.G.}, Keywords = {manifold learning, theory, neural networks}, Pages = {671--674}, Pdf = {papers/icann94.ps.pdf}, Publisher = {IEEE}, Title = {Self-organized Learning of 3 Dimensions}, Volume = {2}, Year = {1994}} @article{szepesvari1997d, Abstract = {We consider the problem of learning to control a plant with non-linear control characteristics and solving the path planning problem at the same time. The solution is based on a path planning model that designates a speed field to be tracked, the speed field being the gradient of the stationary solution of a diffusion process. Diffusion is simulated on an artificial neural network by spreading activation. Interneurons between neighboring discretizing neurons detect the strength of the activity flow and emit control signals to control neurons via modifiable connections. The proposed method may be used for learning redundant control problems. The architecture integrates reactive path planning and continuous motion control in a natural fashion.}, Author = {Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Journal = {Journal of Robotic Systems}, Keywords = {control, neural networks}, Number = {1}, Pages = {1--15}, Pdf = {papers/jrs98.ps.pdf}, Title = {Integrated Architecture for Motion-control and Path-planning}, Volume = {15}, Year = {1997}} @article{szepesvari1997f, Author = {Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Journal = {Nonlinear Analysis}, Keywords = {application, control, neural networks}, Number = {3}, Pages = {1669--1676}, Title = {High Precision Neurocontrol of a Chaotic Bioreactor}, Volume = {30}, Year = {1997}} @article{szepesvari1996c, Abstract = {The problems of controlling a plant while avoiding obstacles and experiencing perturbations in the plants dynamics are considered. It is assumed that the plant's dynamics is not known in advance. To solve this problem a self-organizing artificial neural network (ANN) solution is advanced here. The ANN consists of various parts. The first part discretizes the state space of the plant and also learns the geometry of the state space. The learnt geometrical relations are represented by lateral connections. These connections are utilized for planning a speed field, allowing collision free motion. The speed field is defined over the neural represention of the state space and is transformed into control signals with the help of interneurons associated with the lateral connections: connections between interneurons and control neurons encode the inverse dynamics of the plant. These connections are learnt during a direct system inverse identification process by Hebbian learning. Theoretical results and computer experiments show the robustness of approach.}, Author = {Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Journal = {Neural Network World}, Keywords = {neural networks, control}, Pages = {875--896}, Pdf = {papers/szepes.nnw1.ps.pdf}, Title = {Neurocontrol {I}: {S}elf-organizing Speed-field Tracking}, Volume = {6}, Year = {1996}} @article{szepesvari1996d, Abstract = {It is common that artificial neural networks (ANNs) are used for approximating the inverse dynamics of a plant. In the accompanying paper a self-organising ANN model for associative identification of the inverse dynamics was introduced. Here we propose the use of approximate inverse dynamic models for both Static and Dynamic State (SDS) feedback control. This compound controller is capable of high-precision control even when the inverse dynamics is just qualitatively modeled or the plant's dynamics is perturbed. Properties of the SDS Feedback Controller in learning the inverse dynamics as well as comparisons with other methods are discussed. An example is presented when a chaotic plant, a bioreactor, is controlled using the SDS Controller. We found that the SDS Controller can compensate model mismatches that otherwise would lead to an untolerably large error if a traditional controller were used.}, Author = {Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:16 -0600}, Journal = {Neural Network World}, Keywords = {neural networks, control, theory, manipulator control}, Pages = {897--920}, Pdf = {papers/szepes.nnw2.ps.pdf}, Title = {Neurocontrol {II}: {H}igh Precision Control Achieved using Approximate Inverse Dynamics Models}, Volume = {6}, Year = {1996}} @article{szepesvari1996g, Abstract = {This paper summarizes the recent advances in the theory of self-organizing development of approximate geometry representations based on the use of neural networks. Part of this work is based on the theoretical approach of (Szepesvari, 1993), which is different from that of (Martinetz, 1993) and also is somewhat more general. The Martinetz approach treats signals provided by artificial neuron-like entities whereas the present work uses the entities of the external world as its starting point. The relationship between the present work and the Martinetz approach will be detailed. We approach the problem of approximate geometry representations by first examining the problem of sensory fusion, i.e., the problem of fusing information from different transductors. A straightforward solution is the simultaneous discretization of the output of all transductors, which means the discretization of a space defined as the product of the individual transductor output spaces. However, the geometry relations are defined for the external world only, so it is still an open question how to define the metrics on the product of output spaces. It will be shown that simple Hebbian learning can result in the formation of a correct geometry representation. Some topological considerations will be presented to help us clarify the underlying concepts and assumptions. The mathematical framework gives rise to a corollary on the ``topographical mappings'' realized by Kohonen networks. In fact, the present work as well as (Martinetz, 1993) may be considered as a generalization of Kohonen's topographic maps. We develop topographic maps with self-organizing interneural connections.}, Author = {Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:15 -0600}, Doi = {10.1016/0925-2312(95)00116-6}, Journal = {Neurocomputing}, Keywords = {manifold learning, theory, neural networks}, Pages = {267--287}, Pdf = {papers/fusion-neucing.pdf}, Title = {Approximate Geometry Representations and Sensory Fusion}, Volume = {12}, Year = {1996}, Bdsk-Url-1 = {http://dx.doi.org/10.1016/0925-2312(95)00116-6}} @article{szepesvari1994, Abstract = {The concepts are presented of a neural model based shell that integrates artificial neural networks (ANN) and artificial intelligence (AI) for problem solving. The shell may possess inherited and learnt ANN and AI subsystems. The shell has and develops (i) cues to the environment for dimensionality reduction, (ii) rules between elements of the reduced dimensional internal representation, (iii) `concepts' for achieving goals, i.e. for solving existing problems, (iv) the shell then causes the concepts to compete in order to come to a decision. The shell is designed for control problems, e.g. robotic tasks, control of plants, investment advisory systems, and may have very different ANN and AI parts. Here, we consider a simple robotic-like object in two dimensional space.}, Author = {Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-02 13:09:15 -0600}, Journal = {Adaptive Behavior}, Keywords = {agent architecture, reinforcement learning}, Number = {2}, Pages = {131--160}, Pdf = {papers/annai94.pdf}, Title = {Behavior of an Adaptive Self-organizing Autonomous Agent Working with Cues and Competing Concepts}, Volume = {2}, Year = {1994}} @inproceedings{szepesvari1993, Address = {Portland, Oregon, USA}, Author = {Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Booktitle = {Proc. of WCNN'93}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-09-04 14:53:49 -0600}, Day = {11--15}, Keywords = {agent architecture, reinforcement learning}, Month = {July}, Pages = {524--527}, Publisher = {Lawrence Erlbaum Associates, Inc. Publishers, New Jersey}, Title = {Integration of {A}rtificial {N}eural {N}etworks and {D}ynamic {C}oncepts to an Adaptive and Self-organizing Agent}, Volume = {1}, Year = {1993}} @techreport{szepesvari2000, Abstract = {We consider the convergence of a class of reinforcement learning algorithms combined with value function interpolation methods using the methods developed in (Littman and Szepesvari, 1996). As a special case of the obtained general results, for the first time, we prove the (almost sure) convergence of Q-learning when combined with value function interpolation in uncountable spaces.}, Address = {Budapest 1121, Konkoly Th. M. u. 29-33, HUNGARY}, Author = {Szepesv{\'a}ri, {Cs}.}, Date-Modified = {2010-09-04 14:48:33 -0600}, Institution = {Mindmaker Ltd.}, Keywords = {reinforcement learning, theory, asymptotic convergence, function approximation, stochastic approximation}, Number = {TR-2001-02}, Pdf = {papers/rlfapp.pdf}, Timestamp = {2010.08.30}, Title = {Convergent Reinforcement Learning with Value Function Interpolation}, Year = {2000}} @article{t.fomin1996, Abstract = {A fully self-organizing neural network approach to low-dimensional control problems is described. We consider the problem of learning to control an object and solving the path planning problem at the same time. Control is based on the path planning model that follows the gradient of the stationary solution of a diffusion process working in the state space. Previous works are extended by introducing a self-organizing multigrid-like discretizing structure to represent the external world. Diffusion is simulated within a recurrent neural network built on this multigrid system. The novelty of the approach is that the diffusion on the multigrid is fast. Moreover, the diffusion process on the multigrid fits well the requirements of the path planning: it accelerates the diffusion in large free space regions while still keeps the resolution in small bottleneck-like labyrinths along the path. Control is achieved in the usual way: associative learning identifies the inverse dynamics of the system in a direct fashion. To this end there are introduced interneurons between neighbouring discretizing units that detect the strength of the steady-state diffusion and forward control commands to the control neurons via modifiable connections. This architecture forms the Multigrid Position-and-Direction-to-Action (MPDA) map. The architecture integrates reactive path planning and continuous motion control. It is also shown that the scheme leads to population coding for the actual command vector.}, Author = {Fomin, T. and Rozgonyi, T. and Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Date-Modified = {2010-09-02 13:09:16 -0600}, Journal = {International Journal of Neural Systems}, Keywords = {control, neural networks}, Pages = {757--776}, Pdf = {papers/fomin.mo.ps.pdf}, Timestamp = {2010.08.31}, Title = {Self-organizing Multi-resolution Grid for Motion Planning and Control}, Volume = {7}, Year = {1996}} @inproceedings{torma2010, Abstract = {A Markov-chain Monte Carlo based algorithm is provided to solve the Simultaneous localization and mapping (SLAM) problem with general dynamics and observation model under open-loop control and provided that the map-representation is finite dimensional. To our knowledge this is the first provably consistent yet (close-to) practical solution to this problem. The superiority of our algorithm over alternative SLAM algorithms is demonstrated in a difficult loop closing situation.}, Author = {Torma, P. and Gy{\"o}rgy, A. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {AISTAT}, Date = {2010-05}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:53:14 -0700}, Keywords = {SLAM, robotics, application, theory, Monte-Carlo methods, MCMC}, Month = {May}, Pages = {605--612}, Pdf = {papers/mcmc-slam.pdf}, Title = {A {M}arkov-Chain {M}onte {C}arlo Approach to Simultaneous Localization and Mapping}, Volume = {9}, Year = {2010}} @inproceedings{torma2003, Abstract = {We consider the task of filtering dynamical systems observed in noise by means of sequential importance sampling when the proposal is restricted to the innovation components of the state. It is argued that the unmodified sequential importance sampling/resampling (SIR) algorithm may yield high variance estimates of the posterior in this case, resulting in poor performance when e.g. in visual tracking one tries to build a SIR algorithm on the top of the output of a color blob detector. A new method that associates the innovations sampled from the proposal and the particles in a separate computational step is proposed. The method is shown to outperform the unmodified SIR algorithm in a series of vision based object tracking experiments, both in terms of accuracy and robustness.}, Author = {Torma, P. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {AISTAT}, Date-Modified = {2010-11-25 00:54:01 -0700}, Editor = {C. M. Bishop, B. J. Frey}, Keywords = {vision, particle filtering, theory, application}, Owner = {Beata}, Pages = {271--278}, Pdf = {papers/sisrc.pdf}, Timestamp = {2010.08.31}, Title = {Sequential Importance Sampling for Visual Tracking Reconsidered}, Year = {2003}} @inproceedings{torma2005, Abstract = {An unsatisfactory property of particle filters is that they may become inefficient when the observation noise is low. In this paper we consider a simple-to-implement particle filter, called `LIS-based particle filter', whose aim is to overcome the above mentioned weakness. LIS-based particle filters sample the particles in a two-stage process that uses information of the most recent observation, too. Experiments with the standard bearings-only tracking problem indicate that the proposed new particle filter method is indeed a viable alternative to other methods.}, Author = {Torma, P. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {4th International Symposium on Image and Signal Processing and Analysis}, Date-Modified = {2010-11-25 00:00:51 -0700}, Keywords = {vision, particle filtering, theory, application}, Owner = {Beata}, Pages = {1--2}, Pdf = {papers/torma-ispa2005.pdf}, Timestamp = {2010.08.31}, Title = {On using Likelihood-adjusted Proposals in Particle Filtering: Local Importance Sampling}, Year = {2005}} @article{torma2006, Abstract = {In the low observation noise limit particle filters become inefficient. In this paper a simple-to-implement particle filter is suggested as a solution to this well-known problem. The proposed Local Importance Sampling based particle filters draw the particles' positions in a two-step process that makes use of both the dynamics of the system and the most recent observation. Experiments with the standard bearings-only tracking problem indicate that the proposed new particle filter method is indeed very successful when observations are reliable. Experiments with a high-dimensional variant of this problem further show that the advantage of the new filter grows with the increasing dimensionality of the system.}, Author = {Torma, P. and Szepesv{\'a}ri, {Cs}.}, Date-Modified = {2010-09-02 13:09:16 -0600}, Journal = {Journal of Multimedia}, Keywords = {vision, particle filtering, theory, application}, Pages = {32--43}, Pdf = {papers/jmm01013243.pdf}, Timestamp = {2010.08.31}, Title = {Local Importance Sampling: A Novel Technique to Enhance Particle Filtering}, Volume = {1}, Year = {2006}} @inproceedings{torma2004, Abstract = {Particle filters provide a means to track the state of an object even when the dynamics and the observations are non-linear/non-Gaussian. However, they can be very inefficient when the observation noise is low as compared to the system noise, as it is often the case in visual tracking applications. In this paper we propose a new two-stage sampling procedure to boost the performance of particle filters under this condition. The new procedure is shown to reduce the variance of the weights by means of a theoretical analysis. This result is confirmed in a series of synthetic and real-world visual tracking experiments.}, Author = {Torma, P. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ECCV}, Date-Modified = {2010-11-25 01:01:25 -0700}, Keywords = {vision, particle filtering, theory, application}, Owner = {Beata}, Pages = {16--27}, Pdf = {papers/lls-short.pdf}, Timestamp = {2010.08.31}, Title = {Enhancing Particle Filters using Local Likelihood Sampling}, Year = {2004}} @inproceedings{torma2003a, Abstract = {LS-N-IPS is an extension of the standard N-IPS particle filter (also known as CONDENSATION in the image processing literature). The modified algorithm adds local search to the baseline algorithm: in each time step the predictions are refined in a local search procedure that utilizes the most recent observation. A critical choice in the design of LS-N-IPS is the way the local search is implemented. Here, we introduce a method based on training artificial neural networks for implementing the local search. In experiments with real-life data (visual tracking) the method is shown to improve robustness and performance significantly, surpassing the performance of previous state-of-the-art algorithms.}, Author = {Torma, P. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {2003 IEEE International Symposium on Intelligent Signal Processing}, Date-Modified = {2010-09-02 13:09:15 -0600}, Keywords = {vision, particle filtering, theory, application}, Owner = {Beata}, Pdf = {papers/lsnipsneuro.pdf}, Timestamp = {2010.08.31}, Title = {Combining Local Search, Neural Networks and Particle Filters to Achieve Fast and Reliable Contour Tracking}, Year = {2003}} @inproceedings{torma2002, Abstract = {This paper presents a novel facial-pose tracking algorithm using LS-N-IPS (Local Search N-Interacting Particle System), an algorithm that has been introduced recently by the authors. LS-N-IPS is a probabilistic tracking algorithm that keeps track of a number of alternative hypotheses at any time, the particles. LS-N-IPS has three components: a dynamical model, an observation model, and a local-search operator that has to be chosen by the algorithm designer. The main novelty of the algorithm presented here is that it relies on shading information to guide the local search procedure, the idea of the search being to apply a sort-of Hough-transformation to the mapping that renders poses to images. Here we introduce this algorithm and report results on the task of tracking of synthetic facial masks using grey-scale image sequences.}, Address = {Budapest, Hungary}, Author = {Torma, P. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {Proc. First Hungarian Computer Graphics and Geometry Conference}, Date-Modified = {2010-09-02 13:09:16 -0600}, Keywords = {vision, particle filtering, application}, Owner = {Beata}, Pages = {10--16}, Pdf = {papers/MaskShadeLS.pdf}, Timestamp = {2010.08.31}, Title = {Towards Facial Pose Tracking}, Year = {2002}} @inproceedings{torma2001, Abstract = {A modification of N-IPS, a well known particle filter method is proposed and is shown to be more efficient than the baseline algorithm in the small sample size limit and when the observations are ``reliable''. The algorithm called LS-N-IPS adds local search to the baseline algorithm: in each time step the predictions are refined in a local search procedure that utilizes the most recent observation. The uniform stability of LS-N-IPS is studied and results of experiments are reported both for a simulated and a real-world (visual) tracking problem.}, Author = {Torma, P. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {5th IFAC Symposium on Nonlinear Control Systems (NOLCOS'01)}, Date-Modified = {2010-09-02 13:09:16 -0600}, Keywords = {vision, particle filtering, theory, application}, Owner = {Beata}, Pages = {715--719}, Pdf = {papers/LSNIPS.ps.pdf}, Timestamp = {2010.08.31}, Title = {{LS-N-IPS}: an Improvement of Particle Filters by Means of Local Search}, Year = {2001}} @inproceedings{torma2001a, Abstract = {A recently introduced particle filtering method, called LS-N-IPS, is considered for tracking objects on video sequences. LS-N-IPS is a computationally efficient particle filter that performs better than the standard N-IPS particle filter, when observations are highly peaky, as it is the case of visual object tracking problems with good observation models. An implementation of LS-N-IPS that uses B-spline based contour models is proposed and is shown to perform very well as compared to similar state-of-the art tracking algorithms.}, Author = {Torma, P. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {Proc. Second International Symposium on Image and Signal Processing and Analysis (ISAP'01)}, Date-Modified = {2010-09-04 11:58:07 -0600}, Keywords = {vision, particle filtering, theory, application}, Owner = {Beata}, Pages = {277--282}, Pdf = {papers/VisTrLSNIPS.ps.pdf}, Timestamp = {2010.08.31}, Title = {Efficient Object Tracking in Video Sequences by means of {LS-N-IPS}}, Year = {2001}} @inproceedings{y.abbasi2010, Abstract = {We consider the problem of anytime planning in continuous state and action spaces with non-linear deterministic dynamics. We review the existing approaches to this problem and find no algorithms that both quickly find feasible solutions and also eventually approach optimal solutions with additional time. The state-of-the-art solution to this problem is the rapidly- exploring random tree (RRT) algorithm that quickly finds a feasible solution. However, the RRT algorithm does not return better results with additional time. We introduce RRT++ , an anytime extension of the basic RRT algorithm. We show that the new algorithm has desirable theoretical properties and experimentally show that it efficiently finds near optimal solutions.}, Author = {Abbasi-Yadkori, Y. and Modayil, J. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {IROS}, Date = {2010-10}, Date-Added = {2010-08-28 20:44:33 -0600}, Date-Modified = {2011-10-11 16:40:47 -0600}, Keywords = {robotics, Monte-Carlo tree search, motion planning, theory, application}, Month = {October}, Pages = {127--132}, Pdf = {papers/iros-2010.pdf}, Title = {Extending Rapidly-Exploring Random Trees for Asymptotically Optimal Anytime Motion Planning}, Year = {2010}} @inproceedings{yao2009, Abstract = {We consider linear prediction problems in a stochastic environment. The least mean square (LMS) algorithm is a well-known, easy to implement and computationally cheap solution to this problem. However, as it is well known, the LMS algorithm, being a stochastic gradient descent rule, may converge slowly. The recursive least squares (RLS) algorithm overcomes this problem, but its computational cost is quadratic in the problem dimension. In this paper we propose a two timescale stochastic approximation algorithm which, as far as its slower timescale is considered, behaves the same way as the RLS algorithm, while it is as cheap as the LMS algorithm. In addition, the algorithm is easy to implement. The algorithm is shown to give estimates that converge to the best possible estimate with probability one. The performance of the algorithm is tested in two examples and it is found that it may indeed offer some performance gain over the LMS algorithm.}, Author = {Yao, H. and Bhatnagar, S. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {CDC}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2012-06-06 21:38:17 -0600}, Keywords = {stochastic approximation, two-timescale stochastic approximation, linear prediction}, Pages = {1181--1188}, Pdf = {papers/cdc09_final.pdf}, Title = {{LMS-2}: Towards an Algorithm that is as Cheap as {LMS} and Almost as Efficient as {RLS}}, Year = {2009}} @inproceedings{yu2009, Abstract = {Surjectivity of linear projections between distribution families with fixed mean and covariance (regardless of dimension) is re-derived by a new proof. We further extend this property to distribution families that respect additional constraints, such as symmetry, unimodality and log-concavity. By combining our results with classic univariate inequalities, we provide new worst-case analyses for natural risk criteria arising in classification, optimization, portfolio selection and Markov decision processes.}, Author = {Yu, Y.-L. and Li, Y. and Schuurmans, D. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {NIPS}, Date-Added = {2010-08-28 17:38:14 -0600}, Date-Modified = {2010-11-25 00:51:09 -0700}, Keywords = {theory, value at risk, stochastic programming}, Pdf = {papers/CVAR-NIPS09.pdf}, Title = {A General Projection Property for Distribution Families}, Year = {2009}} @inproceedings{z.gabor1998, Abstract = {We consider multi-criteria sequential decision making problems where the vector-valued evaluations are compared by a given, fixed total ordering. Conditions for the optimality of stationary policies and the Bellman optimality equation are given for a special, but important class of problems when the evaluation of policies can be computed for the criteria independently of each other. The analysis requires special care as the topology introduced by pointwise convergence and the order-topology introduced by the preference order are in general incompatible. Reinforcement learning algorithms are proposed and analyzed. Preliminary computer experiments confirm the validity of the derived algorithms. These type of multi-criteria problems are most useful when there are several optimal solutions to a problem and one wants to choose the one among these which is optimal according to another fixed criterion. Possible application in robotics and repeated games are outlined.}, Author = {G{\'a}bor, Z. and Kalm{\'a}r, {Zs}. and Szepesv{\'a}ri, {Cs}.}, Booktitle = {ICML}, Date-Modified = {2010-09-02 13:09:16 -0600}, Keywords = {reinforcement learning, theory, constrained MDPs}, Note = {Revised on 05/04/2004}, Owner = {Beata}, Pdf = {papers/multi98.ps.pdf}, Timestamp = {2010.08.30}, Title = {Multi-criteria Reinforcement Learning}, Year = {1998}} @article{zs.kalmar1998a, Abstract = {The behavior of reinforcement learning (RL) algorithms is best understood in completely observable, discrete-time controlled Markov chains with finite state and action spaces. In contrast, robot-learning domains are inherently continuous both in time and space, and moreover are partially observable. Here we suggest a systematic approach to solve such problems in which the available qualitative and quantitative knowledge is used to reduce the complexity of learning task. The steps of the design process are to: i) decompose the task into subtasks using the qualitative knowledge at hand; ii) design local controllers to solve the subtasks using the available quantitative knowledge and iii) learn a coordination of these controllers by means of reinforcement learning. It is argued that the approach enables fast, semi-automatic, but still high quality robot-control as no fine-tuning of the local controllers is needed. The approach was verified on a non-trivial real-life robot task. Several RL algorithms were compared by ANOVA and it was found that the model-based approach worked significantly better than the model-free approach. The learnt switching strategy performed comparably to a handcrafted version. Moreover, the learnt strategy seemed to exploit certain properties of the environment which were not foreseen in advance, thus supporting the view that adaptive algorithms are advantageous to non-adaptive ones in complex environments.}, Author = {Kalm{\'a}r, {Zs}. and Szepesv{\'a}ri, {Cs}. and L{\"o}rincz, A.}, Date-Modified = {2010-09-02 13:09:16 -0600}, Journal = {Machine Learning}, Keywords = {robotics, application, hierarchical reinforcement learning, reinforcement learning, macro learning, theory}, Note = {Also appeared as: Z. Kalm{\'a}r, C. Szepesv{\'a}ri, and A. Lorincz. Module-based reinforcement learning: Experiments with a real robot. Autonomous Robots, 5:273--295, 1998.}, Owner = {Beata}, Pages = {1-- 2}, Pdf = {papers/ml-98.ps.pdf}, Timestamp = {2010.08.30}, Title = {Module-Based Reinforcement Learning: Experiments with a Real Robot}, Volume = {31}, Year = {1998}}