# Load tokenizer and model from pretrained model/vocabulary. Specify the number
# of labels to classify (2+: classification, 1: regression)
config = BertConfig.from_pretrained("bert-base-cased", num_labels=num_labels)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', config=config)
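The excerpt starts mid-script: the cell above already references `num_labels`, and later cells use `TASK`, `TFDS_TASK`, `BATCH_SIZE`, `EVAL_BATCH_SIZE`, and `USE_AMP` without defining them. A minimal setup sketch that would precede this code; the concrete values are assumptions for the MRPC task, not taken from the original:

```python
import tensorflow as tf
import tensorflow_datasets
from transformers import (BertConfig, BertTokenizer,
                          TFBertForSequenceClassification,
                          glue_convert_examples_to_features)

TASK = "mrpc"              # GLUE task name used by the feature converter
TFDS_TASK = "mrpc"         # dataset name under the "glue/" prefix in TFDS
num_labels = 2             # MRPC is binary classification; STS-B would use 1 (regression)
BATCH_SIZE = 32            # assumed value
EVAL_BATCH_SIZE = BATCH_SIZE * 2
EPOCHS = 3                 # assumed value
USE_AMP = False            # set True to train with mixed precision
```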
# Load dataset via TensorFlow Datasets
data, info = tensorflow_datasets.load(f'glue/{TFDS_TASK}', with_info=True)
train_examples = info.splits['train'].num_examples
# MNLI expects either validation_matched or validation_mismatched
valid_examples = info.splits['validation'].num_examples
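Because the training pipeline assembled below repeats indefinitely, Keras needs explicit step counts when fitting. One way to derive them from the split sizes, assuming the `BATCH_SIZE` and `EVAL_BATCH_SIZE` values from the setup sketch above:

```python
# Number of optimizer steps per epoch, and of validation batches per pass
train_steps = train_examples // BATCH_SIZE
valid_steps = valid_examples // EVAL_BATCH_SIZE
```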
# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, TASK)
# MNLI expects either validation_matched or validation_mismatched
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, TASK)
train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)
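`glue_convert_examples_to_features` should yield a `tf.data.Dataset` of `(features, label)` pairs, with `features` a dict of `input_ids`, `attention_mask`, and `token_type_ids` padded to the `max_length` of 128 passed above. A quick sanity check on one batch:

```python
# Expect each feature tensor to be (BATCH_SIZE, 128) and labels (BATCH_SIZE,)
for features, labels in train_dataset.take(1):
    print({name: tensor.shape for name, tensor in features.items()})
    print(labels.shape)
```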
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
if USE_AMP:
    # loss scaling is currently required when using mixed precision
    opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
# MSE loss for regression tasks (num_labels == 1), cross-entropy otherwise
if num_labels == 1:
    loss = tf.keras.losses.MeanSquaredError()
else:
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
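The excerpt omits the training step itself, but the second script below reloads a fine-tuned model from `glue_mrpc_save`, so something along these lines must have produced it. A sketch, reusing the constants and step counts from the earlier blocks:

```python
# Compile with an accuracy metric, fine-tune, then save model and tokenizer
# so the evaluation script can reload them from the same directory.
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=opt, loss=loss, metrics=[metric])
model.fit(train_dataset, epochs=EPOCHS, steps_per_epoch=train_steps,
          validation_data=valid_dataset, validation_steps=valid_steps)
model.save_pretrained('glue_mrpc_save')
tokenizer.save_pretrained('glue_mrpc_save')
```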
import tensorflow as tf
import tensorflow_datasets
from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig, glue_convert_examples_to_features
# Load dataset, tokenizer, model from pretrained model/vocabulary
tokenizer = BertTokenizer.from_pretrained('glue_mrpc_save')
model = TFBertForSequenceClassification.from_pretrained('glue_mrpc_save')
TFDS_TASK = "mrpc" # Load dataset via TensorFlow Datasets data, info = tensorflow_datasets.load(f'glue/{TFDS_TASK}', with_info=True)
INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset glue (/home/b418a/tensorflow_datasets/glue/mrpc/0.0.2)
INFO:absl:Constructing tf.data.Dataset for split None, from /home/b418a/tensorflow_datasets/glue/mrpc/0.0.2
# MNLI expects either validation_matched or validation_mismatched
valid_examples = info.splits['validation'].num_examples
# MNLI expects either validation_matched or validation_mismatched
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, TFDS_TASK)
valid_dataset = valid_dataset.batch(64)
# Prepare evaluation: recreate the optimizer, loss and metric needed to compile the model
opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
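With the optimizer, loss, and metric recreated, evaluation is a compile plus a single `evaluate` call, and the reloaded model can also score individual sentence pairs. A sketch; the example sentences are made up:

```python
model.compile(optimizer=opt, loss=loss, metrics=[metric])
results = model.evaluate(valid_dataset)
print(dict(zip(model.metrics_names, results)))

# Score one sentence pair: for MRPC, class 1 means "paraphrase"
inputs = tokenizer.encode_plus("The company reported strong quarterly earnings.",
                               "Quarterly earnings at the company were strong.",
                               return_tensors='tf')
logits = model(inputs)[0]
print(tf.argmax(logits, axis=1).numpy())
```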