Skip to content

Commit

Permalink
sin: ilo Voting li ken sewi e nimi anpa lon poka pi nimi pona mute
Browse files Browse the repository at this point in the history
  • Loading branch information
gregdan3 committed Sep 24, 2024
1 parent 16c4bf7 commit ccc03d6
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 2 deletions.
68 changes: 67 additions & 1 deletion src/sonatoki/Scorers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

# LOCAL
from sonatoki.types import Number, Scorecard
from sonatoki.Filters import Filter
from sonatoki.Filters import Pass, Filter


class Scorer(ABC):
Expand Down Expand Up @@ -112,6 +112,67 @@ def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
return total_score / max_score if max_score else 0


class Voting(Scaling):
    """Score like `Scaling`, then let low-scoring tokens borrow from neighbors.

    Every token first receives a score from `score_token`, exactly as in
    `Scaling`. After all scores are derived, each token whose score is at or
    below `threshold` and which passes the `prereq` filter has its score
    replaced by the mean of the pre-voting scores of its 3 nearest neighbors
    (2 before and 1 after where possible; the window shifts at either end of
    the message so that exactly 3 neighbors are always used).

    Messages with fewer than 4 tokens cannot supply 3 neighbors and are
    scored by the parent class without any voting.

    Calling `Voting(filter, threshold_)` does not create an instance. It
    returns a new subclass of the class it was called on, with `prereq` and
    `threshold` bound to the given values; use that subclass's `score`
    classmethod like any other scorer.
    """

    # Filter a token must pass before it is allowed to receive a voted score.
    prereq: Type[Filter] = Pass
    # Tokens scoring at or below this value are candidates for voting.
    threshold: int = 0

    def __new__(cls, filter: Type[Filter], threshold_: int = 0):
        # Subclass `cls` rather than `Voting` so that mixins on derived
        # scorers (e.g. a class combining `Soften` with `Voting`) are kept
        # when the scorer is configured through this factory.
        class AnonVoting(cls):
            prereq = filter
            threshold = threshold_

        return AnonVoting

    @classmethod
    @override
    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
        """Return the voted score of `tokens` against `filters`.

        Returns 1 for an empty token list, defers to the parent class for
        fewer than 4 tokens, and otherwise returns the sum of the per-token
        scores (after voting) divided by `len(tokens) * len(filters)`, or 0
        when that maximum is 0.
        """
        if not tokens:
            return 1

        n_tokens = len(tokens)
        if n_tokens < 4:
            # not enough tokens for any token to have 3 neighbors
            return super().score(tokens, filters)

        len_filters = len(filters)
        max_score = n_tokens * len_filters

        # score_token only emits ints, but the averaging below emits floats;
        # it doesn't really matter as long as no score exceeds len_filters
        base_scores: List[Number] = [
            cls.score_token(token, filters, len_filters) for token in tokens
        ]

        # votes only ever read the pre-voting scores, never voted ones
        scores: List[Number] = base_scores[:]
        last_start = n_tokens - 4  # rightmost start of a 4-wide window
        for i, (token, score) in enumerate(zip(tokens, base_scores)):
            if score > cls.threshold:
                continue
            if not cls.prereq.filter(token):
                continue

            # 4-wide window containing i, preferring 2 before and 1 after,
            # clamped so it never runs off either end of the message
            start = min(max(i - 2, 0), last_start)
            neighbors = base_scores[start:i] + base_scores[i + 1 : start + 4]
            scores[i] = sum(neighbors) / len(neighbors)

        return sum(scores) / max_score if max_score else 0


class SoftPassFail(Soften, PassFail):
    """`PassFail` scoring combined with `Soften`: the shorter the message,
    the less harshly it is scored."""
Expand All @@ -122,6 +183,11 @@ class SoftScaling(Soften, Scaling):
scoring."""


class SoftVoting(Soften, Voting):
    """`Voting` scoring combined with `Soften`: the shorter the message,
    the less harshly it is scored."""


class SentenceScorer(ABC):
@classmethod
@abstractmethod
Expand Down
12 changes: 11 additions & 1 deletion tests/test_scorers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,15 @@
PunctuationRe,
NimiLinkuCommon,
)
from sonatoki.Scorers import Scorer, Scaling, PassFail, SoftScaling, SoftPassFail
from sonatoki.Scorers import (
Scorer,
Voting,
Scaling,
PassFail,
SoftVoting,
SoftScaling,
SoftPassFail,
)

# FILESYSTEM
from .test_utils import token_strategy
Expand All @@ -41,6 +49,8 @@
SoftPassFail,
Scaling,
SoftScaling,
Voting,
SoftVoting,
]


Expand Down

0 comments on commit ccc03d6

Please sign in to comment.