You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
160 lines
5.5 KiB
160 lines
5.5 KiB
1 year ago
|
#!/usr/bin/python3
|
||
|
|
||
|
import json
|
||
|
import sys
|
||
|
|
||
|
# Input: ARFF-like benchmark file with "@attribute" header lines followed by
# a "@data" section of comma-separated rows.
BENCHMARK_FILENAME = "benchmark-data.txt"

# Output: generated C source file holding the normalized data as a 2-D array.
OUTPUT_FILENAME = "benchmark-data.c"
|
||
|
|
||
|
def parse_attribute_line(line: str) -> dict:
    """Parse an ARFF-style "@attribute" header line into a descriptor dict.

    Supported attribute types:
      * "numeric"   -> {"name": ..., "value-types": "numeric"}
      * "{a,b,c}"   -> {"name": ..., "value-types": "enum",
                        "values": [(0, "a"), ...],
                        "namedict": {"a": 0, ...},
                        "normalized-values": [0.0, ..., 1.0]}

    Enum values are pre-normalized to [0, 1] by their position in the list.
    Raises AssertionError if the line does not start with "@attribute".
    """
    words = line.split()
    assert words[0] == "@attribute"

    attribute = {"name": words[1]}

    if words[2] == "numeric":
        attribute["value-types"] = words[2]

    elif words[2].startswith('{'):
        # Re-join the remaining words so enum lists written with spaces
        # after the commas ("{sunny, overcast, rainy}") parse correctly;
        # the original single-word form is unaffected.
        spec = "".join(words[2:])
        values = spec.lstrip('{').rstrip().rstrip('}').split(',')

        attribute["value-types"] = "enum"
        attribute["values"] = list(enumerate(values))

        namedict: dict[str, int] = {}
        for number, name in attribute["values"]:
            namedict[name] = number
        attribute["namedict"] = namedict

        # Highest index used for normalization; 0 when the enum has a
        # single value, in which case we map that value to 0.0 instead of
        # dividing by zero.
        x_max = len(attribute["values"]) - 1
        normalized: list[float] = []
        for value, _ in attribute["values"]:
            normalized.append(value / x_max if x_max else 0.0)
        attribute["normalized-values"] = normalized

    return attribute
|
||
|
|
||
|
def parse_data(attributes: list, line: str) -> list[float]:
    """Parse one comma-separated data line into a list of floats.

    attributes: descriptors from parse_attribute_line(), one per field.
    line: a data row, already stripped of its trailing newline.

    Numeric fields are converted with float() as-is (normalization happens
    later in normalize()); enum fields are replaced with their precomputed
    normalized value. Unknown field types are reported on stderr and
    skipped, so the returned row may be shorter than the attribute list.
    Raises AssertionError if an enum field holds an unknown symbol.
    """
    parsed_data = []

    for fieldnum, field in enumerate(line.split(',')):

        attr = attributes[fieldnum]

        if attr["value-types"] == "numeric":
            # Numeric field. Just copy it as is, we'll do the normalization
            # later. Although keeping track of the min and max values for
            # each field now would be more efficient.
            parsed_data.append(float(field))

        elif attr["value-types"] == "enum":
            # Get the normalized numeric value for the current symbolic field
            numeric_value: int = attr["namedict"].get(field)
            assert numeric_value is not None
            parsed_data.append(attr["normalized-values"][numeric_value])

        else:
            # Diagnostics go to stderr: stdout carries the JSON data dumps
            # produced by main().
            print("Unknown value type at field {} ({}). Line: {}"
                  .format(fieldnum, field, line), file=sys.stderr)
            print("attr: ", json.dumps(attr), file=sys.stderr)

    return parsed_data
|
||
|
|
||
|
def update_min_max(min_max: list[list[float]], parsed_line: list[float]) -> None:
    """Update the per-field running minima and maxima in place.

    min_max: [mins, maxes], one entry per field; a None entry means no
             value has been seen for that field yet.
    parsed_line: one parsed data row, same length as each inner list.
    """
    for fieldnum, field in enumerate(parsed_line):
        oldmin = min_max[0][fieldnum]
        oldmax = min_max[1][fieldnum]
        if oldmin is None or field < oldmin:
            min_max[0][fieldnum] = field
        if oldmax is None or field > oldmax:
            min_max[1][fieldnum] = field
|
||
|
|
||
|
def normalize(data: list[list[float]],
              min_max: list[list[float]],
              attributes: list) -> list[list[float]]:
    """Min-max normalize every numeric field of *data* to [0, 1].

    data: parsed rows from parse_data().
    min_max: [mins, maxes] per field, as maintained by update_min_max().
    attributes: descriptors from parse_attribute_line().

    Returns a new list of rows. Enum fields pass through unchanged because
    they were already normalized at parse time. A constant numeric column
    (min == max) is kept as-is when its value already lies in [0, 1];
    otherwise the row gets the string sentinel 'ERROR', which makes the
    generated C file fail to compile and thus surfaces the problem.
    """
    normalized_data: list[list[float]] = []

    for line in data:

        normalized_line = []

        for fieldnum, field in enumerate(line):

            # Fields with values of type enum are already normalized, so we
            # should skip them
            if attributes[fieldnum]["value-types"] == "enum":
                normalized_line.append(field)
                continue

            x_min = min_max[0][fieldnum]
            x_max = min_max[1][fieldnum]

            if x_min == x_max:
                # (field - x_min) / (x_max - x_min) would divide by zero.
                if 0 <= field <= 1:
                    normalized_line.append(field)
                else:
                    # Diagnostics go to stderr: stdout carries the JSON
                    # data dumps produced by main().
                    print("Problem with field {} ({}). Line: {}"
                          .format(fieldnum, field, line), file=sys.stderr)
                    print("attr: ", json.dumps(attributes[fieldnum]),
                          file=sys.stderr)
                    normalized_line.append('ERROR')
                continue

            normalized_value = (field - x_min) / (x_max - x_min)
            normalized_line.append(normalized_value)

        normalized_data.append(normalized_line)

    return normalized_data
|
||
|
|
||
|
def main():
    """Convert the ARFF-like benchmark file into a C data array.

    Reads BENCHMARK_FILENAME, parses the "@attribute" header lines and the
    rows after "@data", min-max normalizes all numeric fields, dumps the
    intermediate structures as JSON on stdout, and writes the normalized
    matrix as `const double benchmark_data[rows][cols]` to OUTPUT_FILENAME.
    """
    attributes = []
    data = []
    min_max = [None, None]

    with open(BENCHMARK_FILENAME, 'r', encoding="utf-8") as benchmark_file:

        data_started = False

        for line in benchmark_file:

            # Skip blank lines anywhere in the file.
            if not line.rstrip():
                continue

            if not data_started:
                if line.startswith('@'):
                    if line.startswith("@attribute "):
                        attributes.append(parse_attribute_line(line))

                    elif line.rstrip() == "@data":
                        # One min and one max slot per attribute; None means
                        # "no value seen yet" (see update_min_max()).
                        min_max[0] = [None] * len(attributes)
                        min_max[1] = [None] * len(attributes)
                        data_started = True

                    else:
                        # Should not happen; include the line so the bad
                        # input can actually be found.
                        print("Unexpected header line: {}"
                              .format(line.rstrip()), file=sys.stderr)

            else:
                # Data row
                parsed_line = parse_data(attributes, line.rstrip())
                data.append(parsed_line)
                update_min_max(min_max, parsed_line)

    data = normalize(data, min_max, attributes)

    # Diagnostic dumps of the parsed structures, separated by rulers.
    print(json.dumps(attributes, indent=4))
    print('\n' + ('-' * 76))
    print(json.dumps(data, indent=4))
    print('\n' + ('-' * 76))
    print(json.dumps(list(zip(min_max[0], min_max[1],
                              [attr["name"] for attr in attributes])), indent=4))

    with open(OUTPUT_FILENAME, 'w', encoding="utf-8") as out:
        print("const double benchmark_data[{}][{}] = {}"
              .format(len(data), len(attributes), '{'),
              file=out)
        for line in data:
            print("\t{" + ", ".join([str(field) for field in line]) + "},",
                  file=out)
        print("};", file=out)
|
||
|
|
||
|
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()