pw_tokenizer: Tokenization options for proto

- Add a proto option for marking a field as tokenized.
- Create tools for automatically detokenizing protos with tokenized
  fields.
- Add missing inputs (ELF files for tests) to pw_tokenizer.
- Copy inputs to the out directory for generated packages.

Change-Id: If724cdb5e24ff3a86e89690806aa77fd4e7fdbe9
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/47741
Commit-Queue: Wyatt Hepler <hepler@google.com>
Pigweed-Auto-Submit: Wyatt Hepler <hepler@google.com>
Reviewed-by: Keir Mierle <keir@google.com>
Reviewed-by: Ewout van Bekkum <ewout@google.com>
diff --git a/pw_build/python.gni b/pw_build/python.gni
index 4d4b014..b7bf71a 100644
--- a/pw_build/python.gni
+++ b/pw_build/python.gni
@@ -323,6 +323,9 @@
       if (defined(invoker.tests)) {
         sources += invoker.tests
       }
+      if (defined(invoker.inputs)) {
+        sources += invoker.inputs
+      }
 
       source_root = _source_root
       public_deps = _python_deps + _other_deps
diff --git a/pw_tokenizer/BUILD.gn b/pw_tokenizer/BUILD.gn
index 8a0305f..520beb2 100644
--- a/pw_tokenizer/BUILD.gn
+++ b/pw_tokenizer/BUILD.gn
@@ -20,6 +20,7 @@
 import("$dir_pw_build/target_types.gni")
 import("$dir_pw_docgen/docs.gni")
 import("$dir_pw_fuzzer/fuzzer.gni")
+import("$dir_pw_protobuf_compiler/proto.gni")
 import("$dir_pw_unit_test/test.gni")
 import("backend.gni")
@@ -351,6 +352,12 @@
   ]
 }
 
+pw_proto_library("proto") {
+  sources = [ "options.proto" ]
+  prefix = "pw_tokenizer/proto"
+  python_package = "py"
+}
+
 declare_args() {
   # pw_JAVA_NATIVE_INTERFACE_INCLUDE_DIRS specifies the paths to use for
   # building Java Native Interface libraries. If no paths are provided, targets
@@ -380,6 +387,9 @@
 }
 
 pw_doc_group("docs") {
-  sources = [ "docs.rst" ]
+  sources = [
+    "docs.rst",
+    "proto.rst",
+  ]
   inputs = [ "py/pw_tokenizer/encode.py" ]
 }
diff --git a/pw_tokenizer/docs.rst b/pw_tokenizer/docs.rst
index 168bc4a..7d246de 100644
--- a/pw_tokenizer/docs.rst
+++ b/pw_tokenizer/docs.rst
@@ -834,6 +834,16 @@
     return Detokenizer(kDefaultDatabase);
   }
 
+Protocol buffers
+----------------
+``pw_tokenizer`` provides utilities for handling tokenized fields in protobufs.
+See :ref:`module-pw_tokenizer-proto` for details.
+
+.. toctree::
+  :hidden:
+
+  proto.rst
+
 Base64 format
 =============
 The tokenizer encodes messages to a compact binary representation. Applications
diff --git a/pw_tokenizer/options.proto b/pw_tokenizer/options.proto
new file mode 100644
index 0000000..bc0b87a
--- /dev/null
+++ b/pw_tokenizer/options.proto
@@ -0,0 +1,35 @@
+// Copyright 2021 The Pigweed Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy of
+// the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+syntax = "proto3";
+
+package pw.tokenizer;
+
+import "google/protobuf/descriptor.proto";
+
+enum Tokenization {
+  // The field may contain plain text or any type of tokenized data (binary or
+  // prefixed Base64).
+  TOKENIZATION_OPTIONAL = 0;
+}
+
+// Define the tokenized option, which indicates the data format for text in a
+// bytes field.
+extend google.protobuf.FieldOptions {
+  // The field number was randomly selected from the reserved, internal use
+  // field numbers (50000-99999).
+  // TODO(pwbug/393): Register with the Protobuf Global Extension Registry:
+  // https://github.com/protocolbuffers/protobuf/blob/HEAD/docs/options.md
+  Tokenization format = 78576;
+}
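The generated Python module for these options is imported as
``pw_tokenizer.proto.options_pb2`` (per the GN ``prefix`` above). As a minimal
sketch, the option can be checked on a field descriptor through protobuf's
standard extensions API, mirroring the helper added in
``pw_tokenizer/proto/__init__.py`` later in this change; the function name
``is_optionally_tokenized`` is hypothetical:

.. code-block:: python

  from google.protobuf.descriptor import FieldDescriptor

  from pw_tokenizer.proto import options_pb2


  def is_optionally_tokenized(field: FieldDescriptor) -> bool:
      # Custom options appear in the FieldOptions Extensions map, keyed by
      # the generated extension object (options_pb2.format here).
      extensions = field.GetOptions().Extensions
      return (options_pb2.format in extensions and extensions[
          options_pb2.format] == options_pb2.TOKENIZATION_OPTIONAL)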
diff --git a/pw_tokenizer/proto.rst b/pw_tokenizer/proto.rst
new file mode 100644
index 0000000..2479dd4
--- /dev/null
+++ b/pw_tokenizer/proto.rst
@@ -0,0 +1,138 @@
+.. _module-pw_tokenizer-proto:
+
+------------------------------------
+Tokenized fields in protocol buffers
+------------------------------------
+Text may be represented in a few different ways:
+
+- Plain ASCII or UTF-8 text (``This is plain text``)
+- Base64-encoded tokenized message (``$ibafcA==``)
+- Binary-encoded tokenized message (``89 b6 9f 70``)
+- Little-endian 32-bit integer token (``0x709fb689``)
+
+``pw_tokenizer`` provides tools for working with protobuf fields that may
+contain tokenized text.
+
+Tokenized field protobuf option
+===============================
+``pw_tokenizer`` provides the ``pw.tokenizer.format`` protobuf field option.
+Applying this option to a protobuf field indicates that it may contain a
+tokenized string. A string that is optionally tokenized is represented with a
+single ``bytes`` field annotated with
+``(pw.tokenizer.format) = TOKENIZATION_OPTIONAL``.
+
+For example, the following protobuf has one field that may contain a tokenized
+string.
+
+.. code-block:: protobuf
+
+  message MessageWithOptionallyTokenizedField {
+    bytes just_bytes = 1;
+    bytes maybe_tokenized = 2 [(pw.tokenizer.format) = TOKENIZATION_OPTIONAL];
+    string just_text = 3;
+  }
+
+Decoding optionally tokenized strings
+=====================================
+The encoding used for an optionally tokenized field is not recorded in the
+protobuf. Despite this, the text can be decoded reliably. This is accomplished
+by attempting to decode the field as binary or Base64 tokenized data before
+treating it as plain text.
+
+The following diagram describes the decoding process for optionally tokenized
+fields in detail.
+
+.. mermaid::
+
+  flowchart TD
+    start([Received bytes]) --> binary
+
+    binary[Decode as<br>binary tokenized] --> binary_ok
+    binary_ok{Detokenizes<br>successfully?} -->|no| utf8
+    binary_ok -->|yes| done_binary([Display decoded binary])
+
+    utf8[Decode as UTF-8] --> utf8_ok
+    utf8_ok{Valid UTF-8?} -->|no| base64_encode
+    utf8_ok -->|yes| base64
+
+    base64_encode[Encode as<br>tokenized Base64] --> display
+    display([Display encoded Base64])
+
+    base64[Decode as<br>Base64 tokenized] --> base64_ok
+
+    base64_ok{Fully<br>or partially<br>detokenized?} -->|no| is_plain_text
+    base64_ok -->|yes| base64_results
+
+    is_plain_text{Text is<br>printable?} -->|no| base64_encode
+    is_plain_text -->|yes| plain_text
+
+    base64_results([Display decoded Base64])
+    plain_text([Display text])
+
+Potential decoding problems
+---------------------------
+The decoding process for optionally tokenized fields yields correct results in
+almost every situation. In rare circumstances, it can fail, but the failures
+can be avoided with a low-overhead mitigation if desired.
+
+There are two ways in which the decoding process may fail.
+
+Accidentally interpreting plain text as tokenized binary
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+If a plain-text string happens to decode as a binary tokenized message, the
+incorrect message could be displayed. This is very unlikely to occur. While
+many tokens will incidentally end up being valid UTF-8 strings, the
+overwhelming majority of these strings will be nonsense, and it is highly
+unlikely that a device will happen to log one of them as plain text.
+
+If an implementation wishes to guard against this extremely improbable
+situation, it can be prevented with a small encoding change.
+This is done by appending 0xFF (or another byte that is never valid in UTF-8)
+to binary tokenized data that happens to be valid UTF-8 (or to all binary
+tokenized messages, if desired). When decoding, an extra trailing 0xFF byte is
+discarded.
+
+Displaying undecoded binary as plain text instead of Base64
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+If a message fails to decode as binary tokenized and it is not valid UTF-8, it
+is displayed as tokenized Base64. This makes it easily recognizable as a
+tokenized message and makes it simple to decode later from the text output (for
+example, with an updated token database).
+
+A binary message for which the token is not known may coincidentally be valid
+UTF-8 or ASCII. 6.25% of 4-byte sequences are composed only of ASCII
+characters. When decoding with an out-of-date token database, it is possible
+that some binary tokenized messages will be displayed as plain text rather
+than tokenized Base64.
+
+This situation is likely to occur in practice, but should be infrequent. Even
+if it does happen, it is not a serious issue. A very small number of strings
+will be displayed incorrectly, but these strings cannot be decoded anyway. One
+nonsense string (e.g. ``a-D1``) would be displayed instead of another
+(``$YS1EMQ==``). Updating the token database would resolve the issue, though
+the non-Base64 logs would be difficult to decode later from a log file.
+
+This situation can be avoided with the same approach described in
+`Accidentally interpreting plain text as tokenized binary`_: appending a byte
+that is invalid in UTF-8 prevents the undecoded binary message from being
+interpreted as plain text.
+
+Python library
+==============
+The ``pw_tokenizer.proto`` module defines functions that may be used to
+detokenize protobuf objects in Python. The function
+:func:`pw_tokenizer.proto.detokenize_fields` detokenizes all fields annotated
+as tokenized, replacing them with their detokenized version. For example:
+
+.. code-block:: python
+
+  my_detokenizer = pw_tokenizer.Detokenizer(some_database)
+
+  my_message = SomeMessage(tokenized_field=b'$YS1EMQ==')
+  pw_tokenizer.proto.detokenize_fields(my_detokenizer, my_message)
+
+  assert my_message.tokenized_field == b'The detokenized string! Cool!'
+
+pw_tokenizer.proto
+------------------
+.. automodule:: pw_tokenizer.proto
+  :members:
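The 0xFF guard-byte mitigation described in ``proto.rst`` is optional and is
not implemented in this change. A minimal sketch of what the encoder- and
decoder-side helpers might look like (the names ``guard_tokenized`` and
``unguard`` are hypothetical, not part of ``pw_tokenizer``):

.. code-block:: python

  def guard_tokenized(tokenized: bytes) -> bytes:
      """Appends 0xFF to binary tokenized data that is valid UTF-8.

      0xFF never appears in valid UTF-8, so guarded tokenized data cannot be
      mistaken for plain text by the decoder.
      """
      try:
          tokenized.decode()
      except UnicodeDecodeError:
          return tokenized  # Already invalid UTF-8; no guard byte needed.
      return tokenized + b'\xff'


  def unguard(data: bytes) -> bytes:
      """Drops a trailing 0xFF guard byte, if present, before detokenizing."""
      return data[:-1] if data.endswith(b'\xff') else data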
diff --git a/pw_tokenizer/py/BUILD.gn b/pw_tokenizer/py/BUILD.gn
index 00128d1..b2d19e9 100644
--- a/pw_tokenizer/py/BUILD.gn
+++ b/pw_tokenizer/py/BUILD.gn
@@ -15,9 +15,14 @@
 import("//build_overrides/pigweed.gni")
 
 import("$dir_pw_build/python.gni")
+import("$dir_pw_protobuf_compiler/proto.gni")
 
 pw_python_package("py") {
-  setup = [ "setup.py" ]
+  generate_setup = {
+    name = "pw_tokenizer"
+    version = "0.0.1"
+    extra_requires = [ "serial" ]
+  }
   sources = [
     "generate_argument_types_macro.py",
     "generate_hash_macro.py",
@@ -29,12 +34,14 @@
     "pw_tokenizer/detokenize.py",
     "pw_tokenizer/elf_reader.py",
     "pw_tokenizer/encode.py",
+    "pw_tokenizer/proto/__init__.py",
     "pw_tokenizer/serial_detokenizer.py",
     "pw_tokenizer/tokens.py",
   ]
   tests = [
     "database_test.py",
     "decode_test.py",
+    "detokenize_proto_test.py",
     "detokenize_test.py",
     "elf_reader_test.py",
     "encode_test.py",
@@ -42,10 +49,18 @@
     "tokens_test.py",
     "varint_test_data.py",
   ]
+  python_test_deps = [ ":test_proto.python" ]
   inputs = [
     "elf_reader_test_binary.elf",
     "example_binary_with_tokenized_strings.elf",
     "example_legacy_binary_with_tokenized_strings.elf",
   ]
+  proto_library = "..:proto"
   pylintrc = "$dir_pigweed/.pylintrc"
 }
+
+pw_proto_library("test_proto") {
+  sources = [ "detokenize_proto_test.proto" ]
+  deps = [ "..:proto" ]
+  prefix = "pw_tokenizer_tests"
+}
diff --git a/pw_tokenizer/py/detokenize_proto_test.proto b/pw_tokenizer/py/detokenize_proto_test.proto
new file mode 100644
index 0000000..ff9439b
--- /dev/null
+++ b/pw_tokenizer/py/detokenize_proto_test.proto
@@ -0,0 +1,24 @@
+// Copyright 2021 The Pigweed Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy of
+// the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+syntax = "proto3";
+
+package this_pigweed_test;
+
+import "pw_tokenizer/proto/options.proto";
+
+message TheMessage {
+  bytes just_bytes = 1;
+  bytes message = 2 [(pw.tokenizer.format) = TOKENIZATION_OPTIONAL];
+}
diff --git a/pw_tokenizer/py/detokenize_proto_test.py b/pw_tokenizer/py/detokenize_proto_test.py
new file mode 100644
index 0000000..d1bc269
--- /dev/null
+++ b/pw_tokenizer/py/detokenize_proto_test.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+# Copyright 2021 The Pigweed Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License. You may obtain a copy of
+# the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+"""Tests decoding a proto with tokenized fields."""
+
+import unittest
+
+from pw_tokenizer_tests.detokenize_proto_test_pb2 import TheMessage
+
+from pw_tokenizer import detokenize, encode, tokens
+from pw_tokenizer.proto import detokenize_fields
+
+_DATABASE = tokens.Database(
+    [tokens.TokenizedStringEntry(0xAABBCCDD, "Luke, we're gonna have %s")])
+_DETOKENIZER = detokenize.Detokenizer(_DATABASE)
+
+
+class TestDetokenizeProtoFields(unittest.TestCase):
+    """Tests detokenizing optionally tokenized proto fields."""
+    def test_plain_text(self) -> None:
+        proto = TheMessage(message=b'boring conversation anyway!')
+        detokenize_fields(_DETOKENIZER, proto)
+        self.assertEqual(proto.message, b'boring conversation anyway!')
+
+    def test_binary(self) -> None:
+        proto = TheMessage(message=b'\xDD\xCC\xBB\xAA\x07company')
+        detokenize_fields(_DETOKENIZER, proto)
+        self.assertEqual(proto.message, b"Luke, we're gonna have company")
+
+    def test_base64(self) -> None:
+        base64 = encode.prefixed_base64(b'\xDD\xCC\xBB\xAA\x07company')
+        proto = TheMessage(message=base64.encode())
+        detokenize_fields(_DETOKENIZER, proto)
+        self.assertEqual(proto.message, b"Luke, we're gonna have company")
+
+    def test_plain_text_with_prefixed_base64(self) -> None:
+        base64 = encode.prefixed_base64(b'\xDD\xCC\xBB\xAA\x09pancakes!')
+        proto = TheMessage(message=f'Good morning, {base64}'.encode())
+        detokenize_fields(_DETOKENIZER, proto)
+        self.assertEqual(proto.message,
+                         b"Good morning, Luke, we're gonna have pancakes!")
+
+    def test_unknown_token_not_utf8(self) -> None:
+        proto = TheMessage(message=b'\xFE\xED\xF0\x0D')
+        detokenize_fields(_DETOKENIZER, proto)
+        self.assertEqual(proto.message.decode(),
+                         encode.prefixed_base64(b'\xFE\xED\xF0\x0D'))
+
+    def test_only_control_characters(self) -> None:
+        proto = TheMessage(message=b'\1\2\3\4')
+        detokenize_fields(_DETOKENIZER, proto)
+        self.assertEqual(proto.message.decode(),
+                         encode.prefixed_base64(b'\1\2\3\4'))
+
+
+if __name__ == '__main__':
+    unittest.main()
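The binary payloads in these tests encode the token 0xAABBCCDD in
little-endian byte order, followed by the ``%s`` argument. A sketch of
constructing such a payload by hand, assuming the string argument is encoded
with a single length byte (as the test data ``b'\x07company'`` implies); the
helper ``tokenized_with_string_arg`` is hypothetical:

.. code-block:: python

  import struct

  TOKEN = 0xAABBCCDD  # Token for "Luke, we're gonna have %s"


  def tokenized_with_string_arg(arg: str) -> bytes:
      # 4-byte little-endian token, then the %s argument as a
      # length-prefixed UTF-8 string.
      encoded_arg = arg.encode()
      return struct.pack('<I', TOKEN) + bytes([len(encoded_arg)]) + encoded_arg


  assert tokenized_with_string_arg('company') == b'\xDD\xCC\xBB\xAA\x07company'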
diff --git a/pw_tokenizer/py/pw_tokenizer/proto/__init__.py b/pw_tokenizer/py/pw_tokenizer/proto/__init__.py
new file mode 100644
index 0000000..ac2e95d
--- /dev/null
+++ b/pw_tokenizer/py/pw_tokenizer/proto/__init__.py
@@ -0,0 +1,73 @@
+# Copyright 2021 The Pigweed Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License. You may obtain a copy of
+# the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+"""Utilities for working with tokenized fields in protobufs."""
+
+from typing import Iterator
+
+from google.protobuf.descriptor import FieldDescriptor
+from google.protobuf.message import Message
+
+from pw_tokenizer.proto import options_pb2
+from pw_tokenizer import detokenize, encode
+
+
+def _tokenized_fields(proto: Message) -> Iterator[FieldDescriptor]:
+    for field in proto.DESCRIPTOR.fields:
+        extensions = field.GetOptions().Extensions
+        if options_pb2.format in extensions and extensions[
+                options_pb2.format] == options_pb2.TOKENIZATION_OPTIONAL:
+            yield field
+
+
+def decode_optionally_tokenized(detokenizer: detokenize.Detokenizer,
+                                data: bytes) -> str:
+    """Decodes data that may be plain text or binary / Base64 tokenized text."""
+    # Try detokenizing as binary.
+    result = detokenizer.detokenize(data)
+    if result.ok():
+        return str(result)
+
+    # Attempt to decode as UTF-8.
+    try:
+        text = data.decode()
+    except UnicodeDecodeError:
+        # Not UTF-8. Assume the token is unknown or the data is corrupt.
+        return encode.prefixed_base64(data)
+
+    # See if the string is prefixed Base64 or contains prefixed Base64.
+    detokenized = detokenize.detokenize_base64(detokenizer, data)
+    if detokenized != data:  # If anything detokenized successfully, use that.
+        return detokenized.decode()
+
+    # Attempt to determine whether this is an unknown token or plain text.
+    # Any string with only printable or whitespace characters is plain text.
+    if ''.join(text.split()).isprintable():
+        return text
+
+    # Assume this field is tokenized data that could not be decoded.
+    return encode.prefixed_base64(data)
+
+
+def detokenize_fields(detokenizer: detokenize.Detokenizer,
+                      proto: Message) -> None:
+    """Detokenizes fields annotated as tokenized in the given proto.
+
+    The fields are replaced with their detokenized version in the proto.
+    Tokenized fields are bytes fields, so the detokenized string is stored as
+    bytes. Call .decode() to convert the detokenized string from bytes to str.
+    """
+    for field in _tokenized_fields(proto):
+        decoded = decode_optionally_tokenized(detokenizer,
+                                              getattr(proto, field.name))
+        setattr(proto, field.name, decoded.encode())
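For a standalone usage example, ``decode_optionally_tokenized`` can also be
called directly on raw bytes, independent of any proto. A short sketch reusing
the token database from the test above:

.. code-block:: python

  from pw_tokenizer import detokenize, tokens
  from pw_tokenizer.proto import decode_optionally_tokenized

  detokenizer = detokenize.Detokenizer(tokens.Database(
      [tokens.TokenizedStringEntry(0xAABBCCDD, "Luke, we're gonna have %s")]))

  # Binary tokenized data: 4-byte little-endian token, then the %s argument.
  assert (decode_optionally_tokenized(detokenizer,
                                      b'\xDD\xCC\xBB\xAA\x07company')
          == "Luke, we're gonna have company")

  # Printable text that does not detokenize passes through unchanged.
  assert decode_optionally_tokenized(detokenizer, b'plain text') == 'plain text'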