% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/model.R
\name{tabnet_config}
\alias{tabnet_config}
\title{Configuration for TabNet models}
\usage{
tabnet_config(
  batch_size = 1024^2,
  penalty = 0.001,
  clip_value = NULL,
  loss = "auto",
  epochs = 5,
  drop_last = FALSE,
  decision_width = NULL,
  attention_width = NULL,
  num_steps = 3,
  feature_reusage = 1.3,
  mask_type = "sparsemax",
  mask_topk = NULL,
  virtual_batch_size = 256^2,
  valid_split = 0,
  learn_rate = 0.02,
  optimizer = "adam",
  lr_scheduler = NULL,
  lr_decay = 0.1,
  step_size = 30,
  checkpoint_epochs = 10,
  cat_emb_dim = 1,
  num_independent = 2,
  num_shared = 2,
  num_independent_decoder = 1,
  num_shared_decoder = 1,
  momentum = 0.02,
  pretraining_ratio = 0.5,
  verbose = FALSE,
  device = "auto",
  importance_sample_size = NULL,
  early_stopping_monitor = "auto",
  early_stopping_tolerance = 0,
  early_stopping_patience = 0L,
  num_workers = 0L,
  skip_importance = FALSE
)
}
\arguments{
\item{batch_size}{(int) Number of examples per batch, large batch sizes are
recommended. (default: 1024^2)}

\item{penalty}{This is the extra sparsity loss coefficient as proposed
in the original paper. The bigger this coefficient is, the sparser your model
will be in terms of feature selection. Depending on the difficulty of your
problem, reducing this value could help (default 1e-3).}

\item{clip_value}{If a num is given this will clip the gradient at
clip_value. Pass \code{NULL} to not clip.}

\item{loss}{(character or function) Loss function for training (default to mse
for regression and cross entropy for classification)}

\item{epochs}{(int) Number of training epochs.}

\item{drop_last}{(logical) Whether to drop last batch if not complete during
training}

\item{decision_width}{(int) Width of the decision prediction layer. Bigger values give
more capacity to the model with the risk of overfitting. Values typically
range from 8 to 64.}

\item{attention_width}{(int) Width of the attention embedding for each mask. According to
the paper n_d = n_a is usually a good choice. (default=8)}

\item{num_steps}{(int) Number of steps in the architecture
(usually between 3 and 10)}

\item{feature_reusage}{(num) This is the coefficient for feature reusage in the masks.
A value close to 1 will make mask selection least correlated between layers.
Values range from 1 to 2.}

\item{mask_type}{(character) Final layer of feature selector in the attentive_transformer
block, either \code{"sparsemax"}, \code{"entmax"} or \code{"entmax15"}. Defaults to \code{"sparsemax"}.}

\item{mask_topk}{(int) mask sparsity top-k for \code{sparsemax15} and \code{entmax15}. See \code{\link[=entmax15]{entmax15()}} for detail.}

\item{virtual_batch_size}{(int) Size of the mini batches used for
"Ghost Batch Normalization" (default=256^2)}

\item{valid_split}{In [0, 1). The fraction of the dataset used for validation.
(default = 0 means no split)}

\item{learn_rate}{initial learning rate for the optimizer.}

\item{optimizer}{the optimization method. Currently only \code{"adam"} is supported;
you can also pass any torch optimizer function.}

\item{lr_scheduler}{if \code{NULL}, no learning rate decay is used. If "step"
decays the learning rate by \code{lr_decay} every \code{step_size} epochs. If "reduce_on_plateau"
decays the learning rate by \code{lr_decay} when no improvement after \code{step_size} epochs.
It can also be a \link[torch:lr_scheduler]{torch::lr_scheduler} function that only takes the optimizer
as parameter. The \code{step} method is called once per epoch.}

\item{lr_decay}{multiplies the initial learning rate by \code{lr_decay} every
\code{step_size} epochs. Unused if \code{lr_scheduler} is a \code{torch::lr_scheduler}
or \code{NULL}.}

\item{step_size}{the learning rate scheduler step size. Unused if
\code{lr_scheduler} is a \code{torch::lr_scheduler} or \code{NULL}.}

\item{checkpoint_epochs}{checkpoint model weights and architecture every
\code{checkpoint_epochs}. (default is 10). This may cause large memory usage.
Use \code{0} to disable checkpoints.}

\item{cat_emb_dim}{Size of the embedding of categorical features. If int, all categorical
features will have same embedding size, if list of int, every corresponding feature will have
specific embedding size.}

\item{num_independent}{Number of independent Gated Linear Units layers at each step of the encoder.
Usual values range from 1 to 5.}

\item{num_shared}{Number of shared Gated Linear Units at each step of the encoder. Usual values
range from 1 to 5.}

\item{num_independent_decoder}{For pretraining, number of independent Gated Linear Units layers
at each step of the decoder. Usual values range from 1 to 5.}

\item{num_shared_decoder}{For pretraining, number of shared Gated Linear Units at each step of the
decoder. Usual values range from 1 to 5.}

\item{momentum}{Momentum for batch normalization, typically ranges from 0.01
to 0.4 (default=0.02)}

\item{pretraining_ratio}{Ratio of features to mask for reconstruction during
pretraining.  Ranges from 0 to 1 (default=0.5)}

\item{verbose}{(logical) Whether to print progress and loss values during
training.}

\item{device}{the device to use for training. "cpu" or "cuda". The default ("auto")
uses "cuda" if it's available, otherwise uses "cpu".}

\item{importance_sample_size}{sample of the dataset to compute importance metrics.
If the dataset is larger than 1e5 obs we will use a sample of size 1e5 and
display a warning.}

\item{early_stopping_monitor}{Metric to monitor for early_stopping. One of "valid_loss", "train_loss" or "auto" (defaults to "auto").}

\item{early_stopping_tolerance}{Minimum relative improvement to reset the patience counter.
0.01 for 1\% tolerance (default 0)}

\item{early_stopping_patience}{Number of epochs without improving until stopping training. (default=0)}

\item{num_workers}{(int, optional): how many subprocesses to use for data
loading. 0 means that the data will be loaded in the main process.
(default: \code{0})}

\item{skip_importance}{if feature importance calculation should be skipped (default: \code{FALSE})}
}
\value{
A named list with all hyperparameters of the TabNet implementation.
}
\description{
Configuration for TabNet models
}
\examples{
\dontshow{if ((torch::torch_is_installed() && require("modeldata"))) withAutoprint(\{ # examplesIf}
data("ames", package = "modeldata")

# change the model config for a faster ignite optimizer
config <- tabnet_config(optimizer = torch::optim_ignite_adamw)

## Single-outcome regression using formula specification
fit <- tabnet_fit(Sale_Price ~ ., data = ames, epochs = 1, config = config)
\dontshow{\}) # examplesIf}
}
