From e588141922ce8e03d6c6d2baec50dfec6ca8d4cf Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Fri, 7 Jun 2024 14:23:03 -0700 Subject: [PATCH 1/2] added use of ABC to mark TextGenerator as abstract --- dbldatagen/text_generators.py | 7 ++++++- tests/test_text_generation.py | 12 ++++++++---- tests/test_text_generator_basic.py | 22 ++++++++++++++++------ 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/dbldatagen/text_generators.py b/dbldatagen/text_generators.py index 965350be..bcad0a07 100644 --- a/dbldatagen/text_generators.py +++ b/dbldatagen/text_generators.py @@ -10,6 +10,7 @@ import random import logging +from abc import ABC, abstractmethod import numpy as np import pandas as pd @@ -61,7 +62,7 @@ 'LABORUM'] -class TextGenerator(object): +class TextGenerator(ABC): """ Base class for text generation classes """ @@ -161,6 +162,10 @@ def getAsTupleOrElse(v, defaultValue, valueName): return defaultValue + @abstractmethod + def pandasGenerateText(self, v): + raise NotImplementedError("Subclasses should implement unique versions of `pandasGenerateText`") + class TemplateGenerator(TextGenerator): # lgtm [py/missing-equals] """This class handles the generation of text from templates diff --git a/tests/test_text_generation.py b/tests/test_text_generation.py index fb23d9d3..bec8df11 100644 --- a/tests/test_text_generation.py +++ b/tests/test_text_generation.py @@ -1,9 +1,9 @@ import re -import pytest -import pandas as pd -import numpy as np +import numpy as np +import pandas as pd import pyspark.sql.functions as F +import pytest from pyspark.sql.types import BooleanType, DateType from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType @@ -44,9 +44,13 @@ class TestTextGeneration: row_count = 100000 partitions_requested = 4 + class TestTextGenerator(TextGenerator): + def pandasGenerateText(self, v): # pylint: disable=useless-parent-delegation + return super().pandasGenerateText(v) + def test_text_generator_basics(self): # test the random humber generator - tg1 = TextGenerator() + tg1 = self.TestTextGenerator() # test the repr desc = repr(tg1) diff --git a/tests/test_text_generator_basic.py b/tests/test_text_generator_basic.py index 238212e7..d3a5d8f4 100644 --- a/tests/test_text_generator_basic.py +++ b/tests/test_text_generator_basic.py @@ -1,7 +1,8 @@ import re -import pytest + import numpy as np import pandas as pd +import pytest from dbldatagen import TextGenerator, TemplateGenerator @@ -12,10 +13,14 @@ class TestTextGeneratorBasic: row_count = 100000 partitions_requested = 4 + class TestTextGenerator(TextGenerator): + def pandasGenerateText(self, v): # pylint: disable=useless-parent-delegation + return super().pandasGenerateText(v) + @pytest.mark.parametrize("randomSeed", [None, 0, -1, 2112, 42]) def test_text_generator_basic(self, randomSeed): - text_gen1 = TextGenerator() - text_gen2 = TextGenerator() + text_gen1 = self.TestTextGenerator() + text_gen2 = self.TestTextGenerator() if randomSeed is not None: text_gen1 = text_gen1.withRandomSeed(randomSeed) @@ -29,14 +34,19 @@ def test_text_generator_basic(self, randomSeed): assert text_gen1 == text_gen2 + def test_base_textgenerator_raises_error(self): + with pytest.raises(NotImplementedError): + text_gen1 = self.TestTextGenerator() + text_gen1.pandasGenerateText(None) + @pytest.mark.parametrize("randomSeed, forceNewInstance", [(None, True), (None, False), (0, True), (0, False), (-1, True), (-1, False), (2112, True), (2112, False), (42, True), (42, False)]) def test_text_generator_rng(self, randomSeed, forceNewInstance): - text_gen1 = TextGenerator() - text_gen2 = TextGenerator() + text_gen1 = self.TestTextGenerator() + text_gen2 = self.TestTextGenerator() if randomSeed is not None: text_gen1 = text_gen1.withRandomSeed(randomSeed) @@ -71,7 +81,7 @@ def test_text_generator_rng(self, randomSeed, forceNewInstance): (np.array([1, 40000.4, 3]), np.uint16) ]) def test_text_generator_compact_types(self, values, expectedType): - text_gen1 = TextGenerator() + text_gen1 = self.TestTextGenerator() np_type = text_gen1.compactNumpyTypeForValues(values) assert np_type == expectedType From e5c3c3c32deaefa6918875612bd5f6956a4e4cd1 Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Sun, 14 Sep 2025 20:31:30 -0400 Subject: [PATCH 2/2] Lint text generators module --- dbldatagen/text_generators.py | 757 +++++++++++++++++++--------------- pyproject.toml | 4 - 2 files changed, 416 insertions(+), 345 deletions(-) diff --git a/dbldatagen/text_generators.py b/dbldatagen/text_generators.py index 36529c43..f1bb0fb6 100644 --- a/dbldatagen/text_generators.py +++ b/dbldatagen/text_generators.py @@ -8,33 +8,35 @@ import math import random - -import logging from abc import ABC, abstractmethod +from typing import Any + import numpy as np +import numpy.random import pandas as pd -from .serialization import SerializableToDict +from dbldatagen.serialization import SerializableToDict + #: list of hex digits for template generation -_HEX_LOWER = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'] +_HEX_LOWER = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f"] #: list of upper case hex digits for template generation -_HEX_UPPER = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'] +_HEX_UPPER = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A", "B", "C", "D", "E", "F"] #: list of non-zero digits for template generation -_DIGITS_NON_ZERO = ['1', '2', '3', '4', '5', '6', '7', '8', '9'] +_DIGITS_NON_ZERO = ["1", "2", "3", "4", "5", "6", "7", "8", "9"] #: list of digits for template generation -_DIGITS_ZERO = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] +_DIGITS_ZERO = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"] #: list of uppercase letters for template generation -_LETTERS_UPPER = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', - 'Q', 'R', 'T', 'S', 'U', 'V', 'W', 'X', 'Y', 'Z'] +_LETTERS_UPPER = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", + "Q", "R", "T", "S", "U", "V", "W", "X", "Y", "Z"] #: list of lowercase letters for template generation -_LETTERS_LOWER = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', - 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] +_LETTERS_LOWER = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", + "r", "s", "t", "u", "v", "w", "x", "y", "z"] #: list of all letters uppercase and lowercase _LETTERS_ALL = _LETTERS_LOWER + _LETTERS_UPPER @@ -46,61 +48,71 @@ _ALNUM_UPPER = _LETTERS_UPPER + _DIGITS_ZERO """ words for ipsum lorem based text generation""" -_WORDS_LOWER = ['lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur', 'adipiscing', 'elit', 'sed', 'do', - 'eiusmod', 'tempor', 'incididunt', 'ut', 'labore', 'et', 'dolore', 'magna', 'aliqua', 'ut', - 'enim', 'ad', 'minim', 'veniam', 'quis', 'nostrud', 'exercitation', 'ullamco', 'laboris', - 'nisi', 'ut', 'aliquip', 'ex', 'ea', 'commodo', 'consequat', 'duis', 'aute', 'irure', 'dolor', - 'in', 'reprehenderit', 'in', 'voluptate', 'velit', 'esse', 'cillum', 'dolore', 'eu', 'fugiat', - 'nulla', 'pariatur', 'excepteur', 'sint', 'occaecat', 'cupidatat', 'non', 'proident', 'sunt', - 'in', 'culpa', 'qui', 'officia', 'deserunt', 'mollit', 'anim', 'id', 'est', 'laborum'] - -_WORDS_UPPER = ['LOREM', 'IPSUM', 'DOLOR', 'SIT', 'AMET', 'CONSECTETUR', 'ADIPISCING', 'ELIT', 'SED', 'DO', - 'EIUSMOD', 'TEMPOR', 'INCIDIDUNT', 'UT', 'LABORE', 'ET', 'DOLORE', 'MAGNA', 'ALIQUA', 'UT', - 'ENIM', 'AD', 'MINIM', 'VENIAM', 'QUIS', 'NOSTRUD', 'EXERCITATION', 'ULLAMCO', 'LABORIS', - 'NISI', 'UT', 'ALIQUIP', 'EX', 'EA', 'COMMODO', 'CONSEQUAT', 'DUIS', 'AUTE', 'IRURE', - 'DOLOR', 'IN', 'REPREHENDERIT', 'IN', 'VOLUPTATE', 'VELIT', 'ESSE', 'CILLUM', 'DOLORE', - 'EU', 'FUGIAT', 'NULLA', 'PARIATUR', 'EXCEPTEUR', 'SINT', 'OCCAECAT', 'CUPIDATAT', 'NON', - 'PROIDENT', 'SUNT', 'IN', 'CULPA', 'QUI', 'OFFICIA', 'DESERUNT', 'MOLLIT', 'ANIM', 'ID', 'EST', - 'LABORUM'] +_WORDS_LOWER = ["lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit", "sed", "do", + "eiusmod", "tempor", "incididunt", "ut", "labore", "et", "dolore", "magna", "aliqua", "ut", + "enim", "ad", "minim", "veniam", "quis", "nostrud", "exercitation", "ullamco", "laboris", + "nisi", "ut", "aliquip", "ex", "ea", "commodo", "consequat", "duis", "aute", "irure", "dolor", + "in", "reprehenderit", "in", "voluptate", "velit", "esse", "cillum", "dolore", "eu", "fugiat", + "nulla", "pariatur", "excepteur", "sint", "occaecat", "cupidatat", "non", "proident", "sunt", + "in", "culpa", "qui", "officia", "deserunt", "mollit", "anim", "id", "est", "laborum"] + +_WORDS_UPPER = ["LOREM", "IPSUM", "DOLOR", "SIT", "AMET", "CONSECTETUR", "ADIPISCING", "ELIT", "SED", "DO", + "EIUSMOD", "TEMPOR", "INCIDIDUNT", "UT", "LABORE", "ET", "DOLORE", "MAGNA", "ALIQUA", "UT", + "ENIM", "AD", "MINIM", "VENIAM", "QUIS", "NOSTRUD", "EXERCITATION", "ULLAMCO", "LABORIS", + "NISI", "UT", "ALIQUIP", "EX", "EA", "COMMODO", "CONSEQUAT", "DUIS", "AUTE", "IRURE", + "DOLOR", "IN", "REPREHENDERIT", "IN", "VOLUPTATE", "VELIT", "ESSE", "CILLUM", "DOLORE", + "EU", "FUGIAT", "NULLA", "PARIATUR", "EXCEPTEUR", "SINT", "OCCAECAT", "CUPIDATAT", "NON", + "PROIDENT", "SUNT", "IN", "CULPA", "QUI", "OFFICIA", "DESERUNT", "MOLLIT", "ANIM", "ID", "EST", + "LABORUM"] class TextGenerator(ABC): - """ Base class for text generation classes - """ + Base class for all text generation classes. + """ + _randomSeed: int + _rngInstance: numpy.random.Generator | None - def __init__(self): + def __init__(self) -> None: self._randomSeed = 42 self._rngInstance = None - def __repr__(self): + def __repr__(self) -> str: return f"TextGenerator(randomSeed={self._randomSeed})" - def __str__(self): + def __str__(self) -> str: return f"TextGenerator(randomSeed={self._randomSeed})" - def __eq__(self, other): - return isinstance(self, type(other)) and self._randomSeed == other._randomSeed + def __eq__(self, other: object) -> bool: + if not isinstance(other, self.__class__): + return False + return self._randomSeed == other._randomSeed - def withRandomSeed(self, seed): - """ Set the random seed for the text generator + def withRandomSeed(self, seed: int) -> "TextGenerator": + """ + Sets the TextGenerator's random seed. - :param seed: seed value to set - :return: self + :param seed: Random seed value + :return: Text generator with the specified seed value """ - assert seed is None or type(seed) is int, "expecting an integer seed for Text Generator" + assert not seed or isinstance(seed, int), "expecting an integer seed for Text Generator" self._randomSeed = seed return self @property - def randomSeed(self): - """ Get random seed for text generator""" + def randomSeed(self) -> int: + """ + Gets the TextGenerator's random seed. + + :return: Random seed value + """ return self._randomSeed - def getNPRandomGenerator(self, forceNewInstance=False): - """ Get numpy random number generator + def getNPRandomGenerator(self, forceNewInstance: bool = False) -> numpy.random.Generator: + """ + Gets a NumPy random number generator. - :return: returns random number generator initialized from previously supplied random seed + :return: Random number generator initialized from previously supplied random seed. """ assert self._randomSeed is None or type(self._randomSeed) in [int, np.int32, np.int64], \ f"`random_seed` must be int or int-like not {type(self._randomSeed)}" @@ -108,24 +120,24 @@ def getNPRandomGenerator(self, forceNewInstance=False): if self._rngInstance is not None and not forceNewInstance: return self._rngInstance - from numpy.random import default_rng if self._randomSeed is not None and self._randomSeed not in (-1, -1.0): - rng = default_rng(seed=self._randomSeed) + rng = numpy.random.default_rng(seed=self._randomSeed) else: - rng = default_rng() + rng = numpy.random.default_rng() if not forceNewInstance: self._rngInstance = rng return rng @staticmethod - def compactNumpyTypeForValues(listValues): - """ determine smallest numpy type to represent values + def compactNumpyTypeForValues(listValues: list | numpy.ndarray) -> np.dtype: + """ + Determines the smallest numpy type to represent the supplied values. - :param listValues: list or np.ndarray of values to get np.dtype for - :return: np.dtype that is most compact representation for values provided + :param listValues: List or `np.ndarray` of values to get `np.dtype` for + :return: `np.dtype` that is most compact representation for values provided """ - if type(listValues) is list: + if isinstance(listValues, list): max_value_represented = np.max(np.array(listValues).flatten()) else: max_value_represented = np.max(listValues.flatten()) + 1 @@ -133,7 +145,7 @@ def compactNumpyTypeForValues(listValues): if bits_required <= 8: # for small values, use byte representation - retval = np.dtype('B') + retval = np.dtype("B") else: # compute bytes required and raise to nearest power of 2 bytesRequired = int(math.ceil(bits_required / 8.0)) @@ -141,7 +153,11 @@ def compactNumpyTypeForValues(listValues): return retval @staticmethod - def getAsTupleOrElse(v, defaultValue, valueName): + def getAsTupleOrElse( + v: int | tuple[int, int] | None, + defaultValue: tuple[int, int], + valueName: str = "value" + ) -> tuple[int, int]: """ get value v as tuple or return default value :param v: value to test @@ -149,33 +165,44 @@ def getAsTupleOrElse(v, defaultValue, valueName): :param valueName: name of value for debugging and logging purposes :returns: return `v` as tuple if not `None` or value of `default_v` if `v` is `None`. If `v` is a single value, returns the tuple (`v`, `v`)""" - assert v is None or type(v) is int or type(v) is tuple, f"param {valueName} must be an int, a tuple or None" - assert type(defaultValue) is tuple and len(defaultValue) == 2, "default value must be tuple" + assert not v or isinstance(v, int | tuple), f"param {valueName} must be an int, a tuple or None" + assert isinstance(defaultValue, tuple) and len(defaultValue) == 2, "default value must be tuple" - if type(v) is int: - return v, v - elif type(v) is tuple: - assert len(v) == 2, "expecting tuple of length 2" - assert type(v[0]) is int and type(v[1]) is int, "expecting tuple with both elements as integers" - return v - else: + if not v: assert len(defaultValue) == 2, "must have list or iterable with lenght 2" - assert type(defaultValue[0]) is int and type(defaultValue[1]) is int, "all elements must be integers" + assert isinstance(defaultValue[0], int) and isinstance(defaultValue[1], int), \ + "all elements must be integers" + return defaultValue - return defaultValue + if isinstance(v, tuple): + assert len(v) == 2, "expecting tuple of length 2" + assert isinstance(v[0], int) and isinstance(v[1], int), "expecting tuple with both elements as integers" + return v[0], v[1] + + return v, v @abstractmethod - def pandasGenerateText(self, v): + def pandasGenerateText(self, v: pd.Series) -> pd.Series: + """ + Generates text from a template using Pandas. + + :param v: Pandas series of values passed as base values + :return: Pandas series of expanded templates + """ raise NotImplementedError("Subclasses should implement unique versions of `pandasGenerateText`") class TemplateGenerator(TextGenerator, SerializableToDict): # lgtm [py/missing-equals] - """This class handles the generation of text from templates + """ + This class handles the generation of text from templates. - :param template: template string to use in text generation - :param escapeSpecialChars: By default special chars in the template have special meaning if unescaped - If set to true, then the special meaning requires escape char ``\\`` - :param extendedWordList: if provided, use specified word list instead of default word list + :param template: Template string to use in text generation + :param escapeSpecialChars: Whether to escape special characters (e.g. "a" or "d") in the template (default is + ``False``). + * If ``False``, unescaped special characters correspond to character classes (e.g. "a" for lowercase alphabetical + characters). + * If ``True``, special characters must be escaped using ``\\``. + :param extendedWordList: Optional list of words to use during text generation The template generator generates text from a template to allow for generation of synthetic account card numbers, VINs, IBANs and many other structured codes. @@ -225,10 +252,34 @@ class TemplateGenerator(TextGenerator, SerializableToDict): # lgtm [py/missing- If set to True, then the template ``r"dr_\\v"`` will generate the values ``"dr_0"`` ... ``"dr_999"`` when applied to the values zero to 999. This conforms to the preferred style going forward - """ - def __init__(self, template, escapeSpecialChars=False, extendedWordList=None): - assert template is not None, "`template` must be specified" + + _template: str + _escapeSpecialChars: bool + _extendedWordList: list[str] | None + _escapeSpecialMeaning: bool + _templates: list[str] + _wordList: np.ndarray + _upperWordList: np.ndarray + _np_digits_zero: np.ndarray + _np_digits_non_zero: np.ndarray + _np_hex_upper: np.ndarray + _np_hex_lower: np.ndarray + _np_alnum_lower: np.ndarray + _np_alnum_upper: np.ndarray + _np_letters_lower: np.ndarray + _np_letters_upper: np.ndarray + _np_letters_all: np.ndarray + _lenWords: int + _templateMappings: dict[str, tuple[int, np.ndarray]] + _templateEscapedMappings: dict[str, tuple[int, np.ndarray | None]] + + def __init__( + self, + template: str, + escapeSpecialChars: bool = False, + extendedWordList: list[str] | None = None + ) -> None: super().__init__() self._template = template @@ -253,40 +304,40 @@ def __init__(self, template, escapeSpecialChars=False, extendedWordList=None): # mappings must be mapping from string to tuple(length of mappings, mapping array or list) self._templateMappings = { - 'a': (26, self._np_letters_lower), - 'A': (26, self._np_letters_upper), - 'x': (16, self._np_hex_lower), - 'X': (16, self._np_hex_upper), - 'd': (10, self._np_digits_zero), - 'D': (9, self._np_digits_non_zero), - 'k': (36, self._np_alnum_lower), - 'K': (36, self._np_alnum_upper) + "a": (26, self._np_letters_lower), + "A": (26, self._np_letters_upper), + "x": (16, self._np_hex_lower), + "X": (16, self._np_hex_upper), + "d": (10, self._np_digits_zero), + "D": (9, self._np_digits_non_zero), + "k": (36, self._np_alnum_lower), + "K": (36, self._np_alnum_upper) } # ensure that each mapping is mapping from string to list or numpy array for k, v in self._templateMappings.items(): - assert (k is not None) and isinstance(k, str) and len(k) > 0, "key must be non-empty string" - assert v is not None and isinstance(v, tuple) and len(v) == 2, "value must be tuple of length 2" + assert k and isinstance(k, str) and len(k) > 0, "key must be non-empty string" + assert v and isinstance(v, tuple) and len(v) == 2, "value must be tuple of length 2" mapping_length, mappings = v assert isinstance(mapping_length, int), "mapping length must be of type int" - assert isinstance(mappings, (list, np.ndarray)), \ + assert isinstance(mappings, list | np.ndarray), \ "mappings are lists or numpy arrays" assert mapping_length == 0 or len(mappings) == mapping_length, "mappings must match mapping_length" self._templateEscapedMappings = { - 'n': (256, None), - 'N': (65536, None), - 'w': (self._lenWords, self._wordList), - 'W': (self._lenWords, self._upperWordList) + "n": (256, None), + "N": (65536, None), + "w": (self._lenWords, self._wordList), + "W": (self._lenWords, self._upperWordList) } # ensure that each escaped mapping is mapping from string to None, list or numpy array - for k, v in self._templateEscapedMappings.items(): - assert (k is not None) and isinstance(k, str) and len(k) > 0, "key must be non-empty string" - assert v is not None and isinstance(v, tuple) and len(v) == 2, "value must be tuple of length 2" + for k, v in self._templateEscapedMappings.items(): # type: ignore[assignment] + assert k and isinstance(k, str) and len(k) > 0, "key must be non-empty string" + assert v and isinstance(v, tuple) and len(v) == 2, "value must be tuple of length 2" mapping_length, mappings = v assert isinstance(mapping_length, int), "mapping length must be of type int" - assert mappings is None or isinstance(mappings, (list, np.ndarray)), \ + assert mappings is None or isinstance(mappings, list | np.ndarray), \ "mappings are lists or numpy arrays" # for escaped mappings, the mapping can be None in which case the mapping is to the number itself @@ -299,15 +350,82 @@ def __init__(self, template, escapeSpecialChars=False, extendedWordList=None): template_info = [self._prepareTemplateStrings(template, escapeSpecialMeaning=escapeSpecialChars) for template in self._templates] - self._max_placeholders = max([ x[0] for x in template_info]) # pylint: disable=consider-using-generator - self._max_rnds_needed = max([ len(x[1]) for x in template_info]) # pylint: disable=consider-using-generator - self._placeholders_needed = [ x[0] for x in template_info] - self._template_rnd_bounds = [ x[1] for x in template_info] + self._max_placeholders = max(x[0] for x in template_info) + self._max_rnds_needed = max(len(x[1]) for x in template_info) + self._placeholders_needed = [x[0] for x in template_info] + self._template_rnd_bounds = [x[1] for x in template_info] - def __repr__(self): + def __repr__(self) -> str: return f"TemplateGenerator(template='{self._template}')" - def _toInitializationDict(self): + @property + def templates(self) -> list[str]: + """ Get effective templates for text generator""" + return self._templates + + def classicGenerateText(self, v: str) -> str: + """ + Generates text from a template. + + :param v: Value passed as a base value + :return: Expanded template + """ + + values = pd.Series([v]) + results = self.pandasGenerateText(values).iloc[0] + return str(results) + + def pandasGenerateText(self, v: pd.Series) -> pd.Series: + """ + Generates text from a template using Pandas. + + :param v: Pandas series of values passed as base values + :return: Pandas series of expanded templates + """ + # placeholders is numpy array used to hold results + placeholders = np.full((v.shape[0], self._max_placeholders), "", dtype=np.object_) + + # prepare template selections, bounds, rnd values to drive application of algorithm + template_choices, template_rnd_bounds, template_rnds = self._prepare_random_bounds(v) + template_choices_t = template_choices.T + + # create masked arrays, with all elements initially masked + # as we substitute template expansion, we'll mask and unmask rows corresponding to each template + # calling the method to substitute the values on the masked placeholders + masked_placeholders: np.ma.MaskedArray = np.ma.MaskedArray(placeholders, mask=False) + masked_rnds: np.ma.MaskedArray = np.ma.MaskedArray(template_rnds, mask=False) + masked_matrices = [masked_placeholders, masked_rnds] + + # test logic for template expansion + for x in range(len(self._templates)): # pylint: disable=consider-using-enumerate + masked_placeholders[template_choices_t != x, :] = np.ma.masked + masked_rnds[template_choices_t != x, :] = np.ma.masked + + # harden mask, preventing modifications + for m in masked_matrices: + np.ma.harden_mask(m) + + # expand values into placeholders without affect masked values + self._applyTemplateStringsForTemplate( + v, + self._templates[x], + masked_placeholders, + masked_rnds, + escapeSpecialMeaning=self._escapeSpecialMeaning + ) + + # soften and clear mask, allowing modifications + for m in masked_matrices: + np.ma.soften_mask(m) + m.mask = False + + # join strings in placeholders + output = pd.Series(list(placeholders)) + results = output.apply(lambda placeholder_items: "".join([str(elem) for elem in placeholder_items])) + + return results + + def _toInitializationDict(self) -> dict[str, Any]: """ Converts an object to a Python dictionary. Keys represent the object's constructor arguments. :return: Python dictionary representation of the object @@ -324,60 +442,56 @@ def _toInitializationDict(self): for k, v in _options.items() if v is not None } - def _splitTemplates(self, templateStr): - """ Split template string into individual template strings - - :param templateStr: template string - :return: list of individual template strings - + @staticmethod + def _splitTemplates(templateStr: str) -> list[str]: + """ + Splits the template string into a list of template strings. + :param templateStr: Template string + :return: List of template strings """ - tmp_template = templateStr.replace(r'\\', '$__escape__').replace(r'\|', '$__sep__') - results = [x.replace('$__escape__', r'\\').replace('$__sep__', '|') for x in tmp_template.split('|')] + tmp_template = templateStr.replace(r"\\", "$__escape__").replace(r"\|", "$__sep__") + results = [x.replace("$__escape__", r"\\").replace("$__sep__", "|") for x in tmp_template.split("|")] return results - @property - def templates(self): - """ Get effective templates for text generator""" - return self._templates - - def _getRandomInt(self, low, high=-1, rng=None): - """ generate random integer between low and high inclusive + @staticmethod + def _getRandomInt(low: int, high: int = -1, rng: np.random.Generator | None = None) -> int | np.int32: + """ + Generates a random integer between the provided low and high values. - :param low: low value, if no high value is specified, treat low value as high value and low of 0 - :param high: high value for random number generation - :param rng: if provided, an instance of a numpy random number generator - :return: generated value + :param low: Low value, if no high value is specified, treat low value as high value and low of 0 + :param high: High value for random number generation + :param rng: A numpy random number generator to use for generating hte random value + :return: A random integer between the provided low and high values """ if high == -1: high = low low = 0 - if rng is not None: + if rng: # numpy interval is different to ``randint`` return rng.integers(low, high + 1, dtype=np.int32) # use standard random for now as it performs better when generating values one at a time return random.randint(low, high) - def _prepareTemplateStrings(self, genTemplate, escapeSpecialMeaning=False): - """ Prepare list of random numbers needed to generate template in vectorized form - - :param genTemplate: template string to control text generation - :param escapeSpecialMeaning: if True, requires escape on special meaning chars. - :returns: tuple containing number of placeholders and vector of random values upper bounds + def _prepareTemplateStrings(self, genTemplate: str, escapeSpecialMeaning: bool = False) -> tuple[int, list[int]]: + """ + Prepares a list of random numbers needed to generate the template value in vectorized form. - The first element of the tuple is the number of placeholders needed to populate the template + :param genTemplate: Template string used to control text generation + :param escapeSpecialMeaning: If ``True``, requires escape on special meaning chars + :returns: A tuple with the number of placeholders and a vector of random values upper bounds - The second elememt is a vector of integer values which determine bounds for random number vector for - template generation + The first element of the tuple is the number of placeholders needed to populate the generated template. The + second element is a vector of integer values which determine bounds for random number vector for template + generation. Each element of the vector will be used to generate a random number between 0 and the element inclusive, - which is then used to select words from wordlists etc for template expansion + which is then used to select words (e.g. from wordlists) for template expansion. - `_escapeSpecialMeaning` parameter allows for backwards compatibility with old style syntax while allowing + The `_escapeSpecialMeaning` parameter allows for backwards compatibility with old style syntax while allowing for preferred new style template syntax. Specify as True to force escapes for special meanings,. - """ retval = [] @@ -393,9 +507,9 @@ def _prepareTemplateStrings(self, genTemplate, escapeSpecialMeaning=False): char = genTemplate[i] following_char = genTemplate[i + 1] if i + 1 < template_len else None - if char == '\\': + if char == "\\": escape = True - elif use_value and ('0' <= char <= '9'): + elif use_value and ("0" <= char <= "9"): # val_index = int(char) # retval.append(str(baseValue[val_index])) num_placeholders += 1 @@ -408,18 +522,18 @@ def _prepareTemplateStrings(self, genTemplate, escapeSpecialMeaning=False): escape = False elif (char in self._templateEscapedMappings) and escape: # handle case for ['n', 'N', 'w', 'W'] - bound, mappingArr = self._templateEscapedMappings[char] + bound, _mappingArr = self._templateEscapedMappings[char][0], self._templateEscapedMappings[char][1] retval.append(bound) num_placeholders += 1 escape = False - elif char == 'v' and escape: + elif char == "v" and escape: escape = False - if following_char is not None and ('0' <= following_char <= '9'): + if following_char is not None and ("0" <= following_char <= "9"): use_value = True else: num_placeholders += 1 # retval.append(str(baseValue)) - elif char == 'V' and escape: + elif char == "V" and escape: # retval.append(str(baseValue)) num_placeholders += 1 escape = False @@ -434,22 +548,29 @@ def _prepareTemplateStrings(self, genTemplate, escapeSpecialMeaning=False): return num_placeholders, retval - def _applyTemplateStringsForTemplate(self, baseValue, genTemplate, placeholders, rnds, *, - escapeSpecialMeaning=False): - """ Vectorized implementation of template driven text substitution - - Apply substitutions to placeholders using random numbers - - :param baseValue: Pandas series or data frame of base value for applying template - :param genTemplate: template string to control text generation - :param placeholders: masked nparray of type np.object_ pre-allocated to hold strings emitted - :param rnds: masked numpy 2d array of random numbers needed for vectorized generation - :param escapeSpecialMeaning: if True, requires escape on special meaning chars. + def _applyTemplateStringsForTemplate( + self, + baseValue: pd.Series | pd.DataFrame, + genTemplate: str, + placeholders: np.ndarray, + rnds: np.ndarray, + *, + escapeSpecialMeaning: bool = False + ) -> np.ndarray: + """ + Vectorized implementation of template driven text substitution. Applies substitutions to placeholders using + random numbers. + + :param baseValue: Pandas Series or DataFrame of base value for applying the template + :param genTemplate: A template string to control text generation + :param placeholders: A masked nparray of type np.object_ pre-allocated to hold the generated strings + :param rnds: A masked numpy 2d array of random numbers needed for vectorized generation + :param escapeSpecialMeaning: If ``True``, requires escape on special meaning chars. :returns: placeholders The vectorized implementation populates the placeholder Numpy array with the substituted values. - `_escapeSpecialMeaning` parameter allows for backwards compatibility with old style syntax while allowing + The `_escapeSpecialMeaning` parameter allows for backwards compatibility with old style syntax while allowing for preferred new style template syntax. Specify as True to force escapes for special meanings,. .. note:: @@ -459,37 +580,44 @@ def _applyTemplateStringsForTemplate(self, baseValue, genTemplate, placeholders, will apply the template to rows to which that template applies. The template may be the empty string. - """ assert baseValue.shape[0] == placeholders.shape[0] assert baseValue.shape[0] == rnds.shape[0] _cached_values = {} - regularKeys = self._templateMappings.keys() - escapedKeys = self._templateEscapedMappings.keys() + regular_keys = self._templateMappings.keys() + escaped_keys = self._templateEscapedMappings.keys() - def _get_values_as_np_array(): + def _get_values_as_np_array() -> np.ndarray: """Get baseValue which is pd.Series or Dataframe as a numpy array and cache it""" if "np_values" not in _cached_values: _cached_values["np_values"] = baseValue.to_numpy() - return _cached_values["np_values"] + values = _cached_values["np_values"] + if not isinstance(values, np.ndarray): + raise TypeError("Value for 'np_values' should be of type 'np.ndarray'") - def _get_values_subelement(elem): + return values + + def _get_values_subelement(elem: int) -> np.ndarray: """Get element from base values as np array and cache it""" cache_key = f"v_{elem}" if cache_key not in _cached_values: np_values = _get_values_as_np_array() # element_values = [] - element_values = np.ndarray(np_values.shape[0], dtype=np_values.dtype) + element_values: np.ndarray = np.ndarray(np_values.shape[0], dtype=np_values.dtype) for x in range(baseValue.shape[0]): # element_values.append(baseValue[x][elem]) element_values[x] = baseValue[x][elem] _cached_values[cache_key] = element_values - return _cached_values[cache_key] + sub_element = _cached_values[cache_key] + if not isinstance(sub_element, np.ndarray): + raise TypeError("Sub-element value must be of type 'np.ndarray'") + + return sub_element escape = False use_value = False @@ -516,54 +644,53 @@ def _get_values_subelement(elem): char = genTemplate[i] following_char = genTemplate[i + 1] if i + 1 < template_len else None - if char == '\\': + if char == "\\": escape = True - elif use_value and ('0' <= char <= '9'): + elif use_value and ("0" <= char <= "9"): val_index = int(char) placeholders[:, num_placeholders] = _get_values_subelement(val_index) # placeholders[:, num_placeholders] = pd_base_values.apply(lambda x: str(x[val_index])) num_placeholders += 1 use_value = False - elif char in regularKeys and (not escape) ^ escapeSpecialMeaning: + elif char in regular_keys and (not escape) ^ escapeSpecialMeaning: # note vectorized lookup - `rnds[:, rnd_offset]` will get vertical column of # random numbers from `rnds` 2d array - bound, valueMappings = self._templateMappings[char] + bound, value_mappings = self._templateMappings[char] if unmasked_rows is not None: - placeholders[unmasked_rows, num_placeholders] = valueMappings[rnds[unmasked_rows, rnd_offset]] + placeholders[unmasked_rows, num_placeholders] = value_mappings[rnds[unmasked_rows, rnd_offset]] else: - placeholders[:, num_placeholders] = valueMappings[rnds[:, rnd_offset]] + placeholders[:, num_placeholders] = value_mappings[rnds[:, rnd_offset]] num_placeholders += 1 rnd_offset = rnd_offset + 1 escape = False # used for retval.append(_HEX_LOWER[self._getRandomInt(0, 15, rndGenerator)]) - elif char in escapedKeys and escape: - bound, valueMappings = self._templateEscapedMappings[char] + elif char in escaped_keys and escape: + _bound, value_mappings = self._templateEscapedMappings[char] # type: ignore[assignment] - if valueMappings is not None: + if value_mappings is not None: if unmasked_rows is not None: - placeholders[unmasked_rows, num_placeholders] = valueMappings[rnds[unmasked_rows, rnd_offset]] + placeholders[unmasked_rows, num_placeholders] = value_mappings[rnds[unmasked_rows, rnd_offset]] else: - placeholders[:, num_placeholders] = valueMappings[rnds[:, rnd_offset]] + placeholders[:, num_placeholders] = value_mappings[rnds[:, rnd_offset]] + elif unmasked_rows is not None: # type: ignore[unreachable] + placeholders[unmasked_rows, num_placeholders] = rnds[unmasked_rows, rnd_offset] else: - if unmasked_rows is not None: - placeholders[unmasked_rows, num_placeholders] = rnds[unmasked_rows, rnd_offset] - else: - placeholders[:, num_placeholders] = rnds[:, rnd_offset] + placeholders[:, num_placeholders] = rnds[:, rnd_offset] num_placeholders += 1 rnd_offset = rnd_offset + 1 # retval.append(str(self._getRandomInt(0, 255, rndGenerator))) escape = False - elif char == 'v' and escape: + elif char == "v" and escape: escape = False - if following_char is not None and ('0' <= following_char <= '9'): + if following_char is not None and ("0" <= following_char <= "9"): use_value = True else: placeholders[:, num_placeholders] = _get_values_as_np_array() num_placeholders += 1 # retval.append(str(baseValue)) - elif char == 'V' and escape: + elif char == "V" and escape: placeholders[:, num_placeholders] = _get_values_as_np_array() # retval.append(str(baseValue)) num_placeholders += 1 @@ -581,24 +708,15 @@ def _get_values_subelement(elem): return placeholders - def classicGenerateText(self, v): - """entry point to use for classic udfs""" - - pdValues = pd.Series([v]) - results = self.pandasGenerateText(pdValues) - return results[0] - - def _prepare_random_bounds(self, v): + def _prepare_random_bounds(self, v: pd.Series) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """ - Prepare the random bounds for processing of the template expansion - - For each template, we will have a vector of random numbers to generate for expanding the template - - If we have multiple templates, there will be a separate vector of random numbers for each template + Prepares the random bounds for processing the template expansion. + For each template, we will have a vector of random numbers to generate for expanding the template. If we have + multiple templates, there will be a separate vector of random numbers for each template. - :param v: Pandas series of values passed as base values - :return: vector of templates chosen, template random bounds (1 for each substitution) and selected + :param v: Pandas Series of values passed as base values + :return: A vector of templates chosen, template random bounds (1 for each substitution) and selected random numbers for each row (as numpy array) """ # choose templates @@ -616,12 +734,12 @@ def _prepare_random_bounds(self, v): # populate template random numbers template_rnd_bounds = np.full((v.size, self._max_rnds_needed), -1) - masked_template_bounds = np.ma.MaskedArray(template_rnd_bounds, mask=False) + masked_template_bounds: np.ma.MaskedArray = np.ma.MaskedArray(template_rnd_bounds, mask=False) for i in range(num_templates): # assign the len_bounds_i = len(self._template_rnd_bounds[i]) - masked_template_bounds[templates_chosen.T == i, 0:len_bounds_i] = self._template_rnd_bounds[i] + masked_template_bounds[i == templates_chosen.T, 0:len_bounds_i] = self._template_rnd_bounds[i] masked_template_bounds[template_rnd_bounds == -1] = np.ma.masked @@ -633,75 +751,27 @@ def _prepare_random_bounds(self, v): return templates_chosen, template_rnd_bounds, template_rnds - def pandasGenerateText(self, v): - """ entry point to use for pandas udfs - - Implementation uses vectorized implementation of process - - :param v: Pandas series of values passed as base values - :return: Pandas series of expanded templates - - """ - # placeholders is numpy array used to hold results - placeholders = np.full((v.shape[0], self._max_placeholders), '', dtype=np.object_) - - # prepare template selections, bounds, rnd values to drive application of algorithm - template_choices, template_rnd_bounds, template_rnds = self._prepare_random_bounds(v) - template_choices_t = template_choices.T - - # create masked arrays, with all elements initially masked - # as we substitute template expansion, we'll mask and unmask rows corresponding to each template - # calling the method to substitute the values on the masked placeholders - masked_placeholders = np.ma.MaskedArray(placeholders, mask=False) - masked_rnds = np.ma.MaskedArray(template_rnds, mask=False) - # masked_base_values = np.ma.MaskedArray(baseValues, mask=False) - masked_matrices = [masked_placeholders, masked_rnds] - - # test logic for template expansion - for x in range(len(self._templates)): # pylint: disable=consider-using-enumerate - masked_placeholders[template_choices_t != x, :] = np.ma.masked - masked_rnds[template_choices_t != x, :] = np.ma.masked - # masked_base_values[template_choices_t != x] = np.ma.masked - - # harden mask, preventing modifications - for m in masked_matrices: - np.ma.harden_mask(m) - - # expand values into placeholders without affect masked values - #self._applyTemplateStringsForTemplate(v.to_numpy(dtype=np.object_), #masked_base_values, - self._applyTemplateStringsForTemplate(v, - # masked_base_values, - self._templates[x], - masked_placeholders, - masked_rnds, - escapeSpecialMeaning=self._escapeSpecialMeaning - ) - - # soften and clear mask, allowing modifications - for m in masked_matrices: - np.ma.soften_mask(m) - m.mask = False - - # join strings in placeholders - output = pd.Series(list(placeholders)) - results = output.apply(lambda placeholder_items: "".join([str(elem) for elem in placeholder_items])) - - return results - class ILText(TextGenerator, SerializableToDict): # lgtm [py/missing-equals] - """ Class to generate Ipsum Lorem text paragraphs, words and sentences - - :param paragraphs: Number of paragraphs to generate. If tuple will generate random number in range - :param sentences: Number of sentences to generate. If tuple will generate random number in tuple range - :param words: Number of words per sentence to generate. If tuple, will generate random number in tuple range - + """ + This class generates Ipsum Lorem text paragraphs, words, and sentences. + + :param paragraphs: Number of paragraphs to generate. If a tuple is provided, we will generate a random number of + paragraphs in the provided range. + :param sentences: Number of sentences per paragraph to generate. If a tuple is provided, we will generate a random + number of sentences in the provided range. + :param words: Number of words per sentence to generate. If a tuple is provided, we will generate a random number of + words in the provided range. + :param extendedWordList: Optional list of words to use instead of the default Ipsum Lorem list. """ - def __init__(self, paragraphs=None, sentences=None, words=None, extendedWordList=None): - """ - Initialize the ILText with text generation parameters - """ + def __init__( + self, + paragraphs: int | tuple[int, int] | None = None, + sentences: int | tuple[int, int] | None = None, + words: int | tuple[int, int] | None = None, + extendedWordList: list[str] | None = None + ) -> None: assert paragraphs is not None or sentences is not None or words is not None, \ "At least one of the params `paragraphs`, `sentences` or `words` must be specified" @@ -725,96 +795,30 @@ def __init__(self, paragraphs=None, sentences=None, words=None, extendedWordList self._processStats() self._processWordList() - def _toInitializationDict(self): - """ Converts an object to a Python dictionary. Keys represent the object's - constructor arguments. - :return: Python dictionary representation of the object - """ - _options = { - "kind": self.__class__.__name__, - "paragraphs": self._paragraphs, - "sentences": self._sentences, - "words": self._words, - "extendedWordList": self._extendedWordList - } - return { - k: v._toInitializationDict() - if isinstance(v, SerializableToDict) else v - for k, v in _options.items() if v is not None - } - - def _processStats(self): - """ Compute the stats needed for the text generation """ - - vals = [self.paragraphs, self.sentences, self.words] - self._textGenerationValues = np.array(vals, dtype=self.compactNumpyTypeForValues(vals)) - self._minValues = self._textGenerationValues[:, 0] - self._maxValues = self._textGenerationValues[:, 1] - - self._meanValues = np.mean(self._textGenerationValues, axis=1) - - # we want to force wider spread of sentence length, so we're not simply computing the std_deviation - # - but computing a target std_dev that will spread sentence length - self._stdVals = self._meanValues / 2 - self._stdVals2 = np.std(self._textGenerationValues, axis=1) - - def _processWordList(self): - """ Set up the word lists""" - np_words = np.array(self.wordList, np.dtype(np.str_)) - np_capitalized_words = np.char.capitalize(np_words[:]) - - all_words = np_words[:] - - self._wordOffsetSize = all_words.size - self._sentenceEndOffset = all_words.size - self._paragraphEnd = self._sentenceEndOffset + 1 - self._wordSpaceOffset = self._paragraphEnd + 1 - self._emptyStringOffset = self._wordSpaceOffset + 1 - - punctuation = [". ", "\n\n", " ", ""] - all_words = np.concatenate((all_words, punctuation)) - - self._startOfCapitalsOffset = all_words.size - all_words = np.concatenate((all_words, np_capitalized_words, punctuation)) - - # for efficiency, we'll create list of words preceded by spaces - it will reduce memory consumption during join - # and array manipulation as we dont have to hold offset for space - self._startOfSpacedWordsOffset = all_words.size - - np_spaced_words = np.array([" " + x for x in self.wordList], np.dtype(np.str_)) - all_words = np.concatenate((all_words, np_spaced_words, punctuation)) - - # set up python list of all words so that we dont have to convert between numpy and python representations - self._allWordsSize = all_words.size - self._wordsAsPythonStrings = [str(x) for x in all_words] - - # get smallest type that can represent word offset - self._wordOffsetType = self.compactNumpyTypeForValues([all_words.size * 2 + 10]) - - def __repr__(self): + def __repr__(self) -> str: paras, sentences, words = self.paragraphs, self.sentences, self.words wl = self.wordList.__repr__ if self.wordList is not None else "None" return f"ILText(paragraphs={paras}, sentences={sentences}, words={words}, wordList={wl})" - def generateText(self, baseValues, rowCount=1): + def generateText(self, baseValues: list | pd.Series | np.ndarray, rowCount: int = 1) -> list[str] | pd.Series: """ generate text for seed based on configuration parameters. As it uses numpy, repeatability is restricted depending on version of the runtime - :param baseValues: list or array-like list of baseValues - :param rowCount: number of rows - :returns: list or Pandas series of generated strings of same size as input seed + :param baseValues: List or array-like list of baseValues + :param rowCount: Number of rows + :returns: List or Pandas series of generated strings of same size as input seed """ assert baseValues is not None, "`baseValues` param must be specified" rng = self.getNPRandomGenerator(forceNewInstance=True) word_offset_type = self._wordOffsetType - stats_shape = [rowCount, self.paragraphs[1], self.sentences[1], 3] # determine counts of paragraphs, sentences and words para_stats_raw = np.round(rng.normal(self._meanValues, self._stdVals2, size=stats_shape)) para_stats = np.clip(para_stats_raw, self._minValues, self._maxValues) + # Convert to the compact dtype after clipping para_stats = para_stats.astype(self._textGenerationValues.dtype) @@ -831,7 +835,6 @@ def generateText(self, baseValues, rowCount=1): output_shape = (rowCount, self.paragraphs[1], self.sentences[1], self.words[1]) # compute the masks for paragraphs, sentences, and words - # get the set of indices for shape - r = rows, p = paragraphs, s = sentences, w = words # the indices will produce a set of rows of values for each dimension # the mask is then produced by iterating comparing index with good value @@ -851,7 +854,7 @@ def generateText(self, baseValues, rowCount=1): final_mask = words_mask | para_mask | sentences_mask word_offsets = np.full(output_shape, dtype=word_offset_type, fill_value=self._emptyStringOffset) - masked_offsets = np.ma.MaskedArray(word_offsets, mask=final_mask) + masked_offsets: np.ma.MaskedArray = np.ma.MaskedArray(word_offsets, mask=final_mask) # note numpy random differs from standard random in that it never produces upper bound masked_offsets[~masked_offsets.mask] = rng.integers(self._wordOffsetSize, @@ -903,7 +906,7 @@ def generateText(self, baseValues, rowCount=1): terminated_paragraph_offsets = terminated_paragraph_offsets.reshape((rowCount, shape[1] * shape[2])) empty_string_offset = self._wordOffsetType.type(self._emptyStringOffset) - final_data = terminated_paragraph_offsets.filled(fill_value=empty_string_offset) + final_data = terminated_paragraph_offsets.filled(fill_value=empty_string_offset) # pylint: disable=no-member # its faster to manipulate text in data frames as numpy strings are fixed length all_python_words = self._wordsAsPythonStrings @@ -912,28 +915,100 @@ def generateText(self, baseValues, rowCount=1): # build our lambda expression, copying point to word list locally for efficiency empty_string_offsets = [self._emptyStringOffset, self._emptyStringOffset + self._startOfSpacedWordsOffset] - mk_str_fn = lambda x: ("".join([all_python_words[x1] for x1 in x if x1 not in empty_string_offsets])).strip() + # mk_str_fn = lambda x: ("".join([all_python_words[x1] for x1 in x if x1 not in empty_string_offsets])).strip() # mk_str_fn = lambda x: ("".join([all_python_words[x1] for x1 in x ])) # ... and execute it - results = base_results.apply(mk_str_fn, axis=1) + results = base_results.apply(lambda w: self._get_word(w, all_python_words, empty_string_offsets), axis=1) return results - def classicGenerateText(self, v): + def classicGenerateText(self, v: str) -> str: """ - classic udf entry point for text generation + Generates text using PySpark UDFs (non-Pandas). - :param v: base value to control generation of random numbers + :param v: Value passed as a base value + :return: Expanded template """ return self.generateText([v], 1)[0] - def pandasGenerateText(self, v): + def pandasGenerateText(self, v: pd.Series) -> pd.Series: """ - pandas udf entry point for text generation + Generates text with Pandas UDFs. - :param v: pandas series of base values for random text generation - :returns: Pandas series of generated strings + :param v: Pandas series of values passed as base values + :return: Pandas series of expanded templates """ rows = v.to_numpy() results = self.generateText(rows, rows.size) return pd.Series(results) + + def _toInitializationDict(self) -> dict[str, Any]: + """ + Converts an object to a Python dictionary. Keys represent the object's constructor arguments. + + :return: Python dictionary representation of the object + """ + _options = { + "kind": self.__class__.__name__, + "paragraphs": self._paragraphs, + "sentences": self._sentences, + "words": self._words, + "extendedWordList": self._extendedWordList + } + return _options + + def _processStats(self) -> None: + """ + Computes statistics needed for the text generation. + """ + + vals = [self.paragraphs, self.sentences, self.words] + self._textGenerationValues = np.array(vals, dtype=self.compactNumpyTypeForValues(vals)) + self._minValues = self._textGenerationValues[:, 0] + self._maxValues = self._textGenerationValues[:, 1] + + self._meanValues = np.mean(self._textGenerationValues, axis=1) + + # we want to force wider spread of sentence length, so we're not simply computing the std_deviation + # - but computing a target std_dev that will spread sentence length + self._stdVals = self._meanValues / 2 + self._stdVals2 = np.std(self._textGenerationValues, axis=1) + + def _processWordList(self) -> None: + """ + Sets up the word lists needed for text generation. + """ + np_words = np.array(self.wordList, np.dtype(np.str_)) + np_capitalized_words = np.char.capitalize(np_words[:]) + + all_words = np_words[:] + + self._wordOffsetSize = all_words.size + self._sentenceEndOffset = all_words.size + self._paragraphEnd = self._sentenceEndOffset + 1 + self._wordSpaceOffset = self._paragraphEnd + 1 + self._emptyStringOffset = self._wordSpaceOffset + 1 + + punctuation = [". ", "\n\n", " ", ""] + all_words = np.concatenate((all_words, np.array(punctuation))) + + self._startOfCapitalsOffset = all_words.size + all_words = np.concatenate((all_words, np_capitalized_words, np.array(punctuation))) + + # for efficiency, we'll create list of words preceded by spaces - it will reduce memory consumption during join + # and array manipulation as we dont have to hold offset for space + self._startOfSpacedWordsOffset = all_words.size + + np_spaced_words = np.array([" " + x for x in self.wordList], np.dtype(np.str_)) + all_words = np.concatenate((all_words, np_spaced_words, np.array(punctuation))) + + # set up python list of all words so that we dont have to convert between numpy and python representations + self._allWordsSize = all_words.size + self._wordsAsPythonStrings = [str(x) for x in all_words] + + # get smallest type that can represent word offset + self._wordOffsetType = self.compactNumpyTypeForValues([all_words.size * 2 + 10]) + + @staticmethod + def _get_word(elements: list[int], words: list[str], excluded: list[int]) -> str: + return "".join([words[element] for element in elements if element not in excluded]).strip() diff --git a/pyproject.toml b/pyproject.toml index 1f681db5..fd00733f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -153,7 +153,6 @@ exclude = [ "dbldatagen/nrange.py", "dbldatagen/schema_parser.py", "dbldatagen/text_generator_plugins.py", - "dbldatagen/text_generators.py" ] [tool.ruff.lint] @@ -240,7 +239,6 @@ ignore = [ "dbldatagen/schema_parser.py", "dbldatagen/serialization.py", "dbldatagen/text_generator_plugins.py", - "dbldatagen/text_generators.py", "dbldatagen/utils.py" ] @@ -280,7 +278,6 @@ ignore-paths = [ "dbldatagen/schema_parser.py", "dbldatagen/serialization.py", "dbldatagen/text_generator_plugins.py", - "dbldatagen/text_generators.py", "dbldatagen/utils.py" ] @@ -414,7 +411,6 @@ exclude = [ "dbldatagen/schema_parser.py", "dbldatagen/serialization.py", "dbldatagen/text_generator_plugins.py", - "dbldatagen/text_generators.py", "dbldatagen/utils.py" ] warn_return_any = true