Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

27 verify gpu #29

Merged
merged 5 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 24 additions & 24 deletions word-prediction-kb-bert/pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion word-prediction-kb-bert/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@ description = "A sparv plugin for computing word neighbours using a BERT model."
authors = [
{ name = "Kristoffer Andersson", email = "[email protected]" },
]
dependencies = ["sparv-pipeline >=5.2.0", "transformers>=4.34.1"]
dependencies = [
"sparv-pipeline >=5.2.0",
"transformers>=4.34.1",
"torch>=2.3.1",
]
license = "MIT"
readme = "README.md"
requires-python = ">= 3.8.1,<3.12"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
from dataclasses import dataclass
from typing import Optional, Tuple

from sparv.api import ( # type: ignore [import-untyped]
Annotation,
Config,
Expand All @@ -9,10 +6,6 @@
annotator,
get_logger,
)
from transformers import ( # type: ignore [import-untyped]
BertForMaskedLM,
BertTokenizer,
)

from sbx_word_prediction_kb_bert.predictor import TopKPredictor

Expand All @@ -39,28 +32,6 @@
TOK_SEP = " "


@dataclass
class HuggingfaceModel:
model_name: str
model_revision: str
tokenizer_name: Optional[str] = None
tokenizer_revision: Optional[str] = None

def tokenizer_name_and_revision(self) -> Tuple[str, str]:
if tokenizer_name := self.tokenizer_name:
return tokenizer_name, self.tokenizer_revision or "main"
else:
return self.model_name, self.model_revision


MODELS = {
"kb-bert": HuggingfaceModel(
model_name="KBLab/bert-base-swedish-cased",
model_revision="c710fb8dff81abb11d704cd46a8a1e010b2b022c",
)
}


@annotator("Word prediction tagging with a masked Bert model", language=["swe"])
def predict_words__kb_bert(
out_prediction: Output = Output(
Expand All @@ -86,18 +57,8 @@ def predict_words__kb_bert(
raise SparvErrorMessage(
f"'sbx_word_prediction_kb_bert.num_decimals' must contain an 'int' got: '{num_decimals_str}'" # noqa: E501
) from exc
tokenizer_name, tokenizer_revision = MODELS["kb-bert"].tokenizer_name_and_revision()

tokenizer = BertTokenizer.from_pretrained(
tokenizer_name, revision=tokenizer_revision
)
model = BertForMaskedLM.from_pretrained(
MODELS["kb-bert"].model_name, revision=MODELS["kb-bert"].model_revision
)

predictor = TopKPredictor(
model=model,
tokenizer=tokenizer,
num_decimals=num_decimals,
)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
from sparv import api as sparv_api # type: ignore [import-untyped]
from transformers import ( # type: ignore [import-untyped]
BertForMaskedLM,
BertTokenizer,
FillMaskPipeline,
)

logger = sparv_api.get_logger(__name__)

SCORE_FORMATS = {
1: ("{:.1f}", lambda s: s.endswith(".0")),
2: ("{:.2f}", lambda s: s.endswith(".00")),
Expand All @@ -17,11 +26,49 @@


class TopKPredictor:
def __init__(self, *, tokenizer, model, num_decimals: int = 3) -> None:
self.tokenizer = tokenizer
self.model = model
def __init__(
self,
*,
tokenizer: Optional[BertTokenizer] = None,
model: Optional[BertForMaskedLM] = None,
num_decimals: int = 3,
) -> None:
self.tokenizer = tokenizer or self._default_tokenizer()
self.model = model or self._default_model()
self.num_decimals = num_decimals
self.pipeline = FillMaskPipeline(model=model, tokenizer=tokenizer)
self.pipeline = FillMaskPipeline(model=self.model, tokenizer=self.tokenizer)

@classmethod
def _default_model(cls) -> BertForMaskedLM:
if torch.cuda.is_available():
logger.info("Using GPU (cuda)")
dtype = torch.float16
else:
logger.warning("Using CPU, is cuda available?")
dtype = torch.float32
model = BertForMaskedLM.from_pretrained(
MODELS["kb-bert"].model_name,
revision=MODELS["kb-bert"].model_revision,
torch_dtype=dtype,
device_map=(
"auto"
if torch.cuda.is_available() and torch.cuda.device_count() > 1
else None
),
)
if torch.cuda.is_available() and torch.cuda.device_count() == 1:
model = model.cuda()
return model

@classmethod
def _default_tokenizer(cls) -> BertTokenizer:
tokenizer_name, tokenizer_revision = MODELS[
"kb-bert"
].tokenizer_name_and_revision()

return BertTokenizer.from_pretrained(
tokenizer_name, revision=tokenizer_revision
)

def get_top_k_predictions(self, text: str, k: int = 5) -> str:
tokenized_inputs = self.tokenizer(text)
Expand Down Expand Up @@ -69,3 +116,25 @@ def _run_pipeline(self, text, k) -> str:
return f"|{predictions_str}|" if predictions_str else "|"
else:
return "|"


@dataclass
class HuggingfaceModel:
    """Reference to a Huggingface model, optionally with a separate tokenizer.

    When no tokenizer is named, the tokenizer is assumed to live in the
    model's own repository at the model's revision.
    """

    # Huggingface hub identifier of the model, e.g. "KBLab/...".
    model_name: str
    # Pinned revision (commit hash, tag or branch) of the model.
    model_revision: str
    # Optional separate tokenizer repository; falls back to the model repo.
    tokenizer_name: Optional[str] = None
    # Revision for the separate tokenizer; defaults to "main" when unset.
    tokenizer_revision: Optional[str] = None

    def tokenizer_name_and_revision(self) -> Tuple[str, str]:
        """Return ``(name, revision)`` to load the tokenizer from."""
        # Truthiness check (not `is None`): an empty name also falls back.
        if not self.tokenizer_name:
            return self.model_name, self.model_revision
        return self.tokenizer_name, self.tokenizer_revision or "main"


# Registry of the models this plugin knows how to load.
MODELS = {
    "kb-bert": HuggingfaceModel(
        model_name="KBLab/bert-base-swedish-cased",
        model_revision="c710fb8dff81abb11d704cd46a8a1e010b2b022c",
    )
}
Empty file.
14 changes: 1 addition & 13 deletions word-prediction-kb-bert/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,9 @@
import pytest
from sbx_word_prediction_kb_bert import (
MODELS,
TopKPredictor,
)
from transformers import ( # type: ignore [import-untyped]
BertForMaskedLM,
BertTokenizer,
)


@pytest.fixture(scope="session")
def kb_bert_predictor() -> TopKPredictor:
    """Session-scoped predictor built from the default KB-BERT model.

    Loading the model is slow, so the instance is shared across all tests.
    """
    return TopKPredictor()
2 changes: 1 addition & 1 deletion word-prediction-kb-bert/tests/test_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def test_rounding(kb_bert_predictor: TopKPredictor) -> None:


def remove_scores(actual):
    """Strip the ``:score`` suffix from each ``|``-separated prediction entry."""
    entries = actual.split("|")
    labels = [entry.partition(":")[0] for entry in entries]
    return "|".join(labels)


def test_long_text(kb_bert_predictor: TopKPredictor) -> None:
Expand Down