aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBruce Hill <bruce@bruce-hill.com>2025-09-09 20:09:53 -0400
committerBruce Hill <bruce@bruce-hill.com>2025-09-09 20:09:53 -0400
commitb4f2d03db2cd20688d6bb537904998e997bc48aa (patch)
tree380ce33e0ce1b7fa3ce7c88b5ca59a84b58f4821
parentd8a48f64111f542f3afeb5d6e47ff092f9278d9f (diff)
parentd64dcab138a34d5f5105e08f0a840f7cb5a1d159 (diff)
Merge branch 'main' into optional-list-indexing
-rw-r--r--CHANGES.md4
-rw-r--r--api/api.md148
-rw-r--r--api/text.md148
-rw-r--r--api/text.yaml57
-rw-r--r--man/man3/tomo-Text.from_utf16.339
-rw-r--r--man/man3/tomo-Text.from_utf32.3 (renamed from man/man3/tomo-Text.from_codepoints.3)8
-rw-r--r--man/man3/tomo-Text.from_utf8.3 (renamed from man/man3/tomo-Text.from_bytes.3)8
-rw-r--r--man/man3/tomo-Text.utf16.336
-rw-r--r--man/man3/tomo-Text.utf32.3 (renamed from man/man3/tomo-Text.utf32_codepoints.3)8
-rw-r--r--man/man3/tomo-Text.utf8.3 (renamed from man/man3/tomo-Text.bytes.3)8
-rw-r--r--src/compile/headers.c2
-rw-r--r--src/compile/indexing.c4
-rw-r--r--src/compile/statements.c2
-rw-r--r--src/environment.c10
-rw-r--r--src/formatter/formatter.c2
-rw-r--r--src/modules.c26
-rw-r--r--src/modules.h4
-rw-r--r--src/stdlib/paths.c8
-rw-r--r--src/stdlib/text.c107
-rw-r--r--src/stdlib/text.h10
-rw-r--r--src/tomo.c62
-rw-r--r--src/typecheck.c12
-rw-r--r--test/text.tm38
23 files changed, 516 insertions, 235 deletions
diff --git a/CHANGES.md b/CHANGES.md
index 75195c78..ca8a3523 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -9,6 +9,10 @@
- List indexing now gives an optional value
- Added a `--format` flag to the `tomo` binary that autoformats your code
(currently unstable, do not rely on it just yet).
+- Standardized text methods for Unicode encodings:
+ - `Text.from_utf8()`/`Text.utf8()`
+ - `Text.from_utf16()`/`Text.utf16()`
+ - `Text.from_utf32()`/`Text.utf32()`
- Fixed bugs:
- `Int.parse()` had a memory bug.
diff --git a/api/api.md b/api/api.md
index 09d24e73..2eeecd89 100644
--- a/api/api.md
+++ b/api/api.md
@@ -4182,27 +4182,6 @@ for chunk in text.by_split_any(",;")
say(chunk)
```
-## Text.bytes
-
-```tomo
-Text.bytes : func(text: Text -> [Byte])
-```
-
-Converts a `Text` value to a list of bytes representing a UTF8 encoding of the text.
-
-Argument | Type | Description | Default
----------|------|-------------|---------
-text | `Text` | The text to be converted to UTF8 bytes. | -
-
-**Return:** A list of bytes (`[Byte]`) representing the text in UTF8 encoding.
-
-
-**Example:**
-```tomo
->> "Amélie".bytes()
-= [65, 109, 195, 169, 108, 105, 101]
-
-```
## Text.caseless_equals
```tomo
@@ -4306,29 +4285,6 @@ first | `Int` | The index to begin the slice. | -
= "lo"
```
-## Text.from_bytes
-
-```tomo
-Text.from_bytes : func(bytes: [Byte] -> [Text])
-```
-
-Returns text that has been constructed from the given UTF8 bytes.
-
-The text will be normalized, so the resulting text's UTF8 bytes may not exactly match the input.
-
-Argument | Type | Description | Default
----------|------|-------------|---------
-bytes | `[Byte]` | The UTF-8 bytes of the desired text. | -
-
-**Return:** A new text based on the input UTF8 bytes after normalization has been applied.
-
-
-**Example:**
-```tomo
->> Text.from_bytes([195, 133, 107, 101])
-= "Åke"
-
-```
## Text.from_c_string
```tomo
@@ -4377,10 +4333,35 @@ codepoint_names | `[Text]` | The names of each codepoint in the desired text (ca
= "Åke"
```
-## Text.from_codepoints
+## Text.from_utf16
+
+```tomo
+Text.from_utf16 : func(bytes: [Int16] -> Text?)
+```
+
+Returns text that has been constructed from the given UTF16 sequence.
+
+The text will be normalized, so the resulting text's UTF16 sequence may not exactly match the input.
+
+Argument | Type | Description | Default
+---------|------|-------------|---------
+bytes | `[Int16]` | The UTF-16 integers of the desired text. | -
+
+**Return:** A new text based on the input UTF16 sequence after normalization has been applied.
+
+
+**Example:**
+```tomo
+>> Text.from_utf16([197, 107, 101])
+= "Åke"
+>> Text.from_utf16([12371, 12435, 12395, 12385, 12399, 19990, 30028])
+= "こんにちは世界"
+
+```
+## Text.from_utf32
```tomo
-Text.from_codepoints : func(codepoints: [Int32] -> [Text])
+Text.from_utf32 : func(codepoints: [Int32] -> Text?)
```
Returns text that has been constructed from the given UTF32 codepoints.
@@ -4396,7 +4377,30 @@ codepoints | `[Int32]` | The UTF32 codepoints in the desired text. | -
**Example:**
```tomo
->> Text.from_codepoints([197, 107, 101])
+>> Text.from_utf32([197, 107, 101])
+= "Åke"
+
+```
+## Text.from_utf8
+
+```tomo
+Text.from_utf8 : func(bytes: [Byte] -> Text?)
+```
+
+Returns text that has been constructed from the given UTF8 bytes.
+
+The text will be normalized, so the resulting text's UTF8 bytes may not exactly match the input.
+
+Argument | Type | Description | Default
+---------|------|-------------|---------
+bytes | `[Byte]` | The UTF-8 bytes of the desired text. | -
+
+**Return:** A new text based on the input UTF8 bytes after normalization has been applied.
+
+
+**Example:**
+```tomo
+>> Text.from_utf8([195, 133, 107, 101])
= "Åke"
```
@@ -4916,10 +4920,33 @@ language | `Text` | The ISO 639 language code for which casing rules to use. |
= "İ"
```
-## Text.utf32_codepoints
+## Text.utf16
```tomo
-Text.utf32_codepoints : func(text: Text -> [Int32])
+Text.utf16 : func(text: Text -> [Int16])
+```
+
+Returns a list of UTF16 code units for the UTF16 encoding of the text.
+
+Argument | Type | Description | Default
+---------|------|-------------|---------
+text | `Text` | The text to encode as UTF16 code units. | -
+
+**Return:** A list of 16-bit integer UTF16 code units (`[Int16]`).
+
+
+**Example:**
+```tomo
+>> "Åke".utf16()
+= [197, 107, 101]
+>> "こんにちは世界".utf16()
+= [12371, 12435, 12395, 12385, 12399, 19990, 30028]
+
+```
+## Text.utf32
+
+```tomo
+Text.utf32 : func(text: Text -> [Int32])
```
Returns a list of Unicode code points for UTF32 encoding of the text.
@@ -4933,10 +4960,31 @@ text | `Text` | The text from which to extract Unicode code points. | -
**Example:**
```tomo
->> "Amélie".utf32_codepoints()
+>> "Amélie".utf32()
= [65, 109, 233, 108, 105, 101]
```
+## Text.utf8
+
+```tomo
+Text.utf8 : func(text: Text -> [Byte])
+```
+
+Converts a `Text` value to a list of bytes representing a UTF8 encoding of the text.
+
+Argument | Type | Description | Default
+---------|------|-------------|---------
+text | `Text` | The text to be converted to UTF8 bytes. | -
+
+**Return:** A list of bytes (`[Byte]`) representing the text in UTF8 encoding.
+
+
+**Example:**
+```tomo
+>> "Amélie".utf8()
+= [65, 109, 195, 169, 108, 105, 101]
+
+```
## Text.width
```tomo
diff --git a/api/text.md b/api/text.md
index bdff6841..0d50ee24 100644
--- a/api/text.md
+++ b/api/text.md
@@ -130,27 +130,6 @@ for chunk in text.by_split_any(",;")
say(chunk)
```
-## Text.bytes
-
-```tomo
-Text.bytes : func(text: Text -> [Byte])
-```
-
-Converts a `Text` value to a list of bytes representing a UTF8 encoding of the text.
-
-Argument | Type | Description | Default
----------|------|-------------|---------
-text | `Text` | The text to be converted to UTF8 bytes. | -
-
-**Return:** A list of bytes (`[Byte]`) representing the text in UTF8 encoding.
-
-
-**Example:**
-```tomo
->> "Amélie".bytes()
-= [65, 109, 195, 169, 108, 105, 101]
-
-```
## Text.caseless_equals
```tomo
@@ -254,29 +233,6 @@ first | `Int` | The index to begin the slice. | -
= "lo"
```
-## Text.from_bytes
-
-```tomo
-Text.from_bytes : func(bytes: [Byte] -> [Text])
-```
-
-Returns text that has been constructed from the given UTF8 bytes.
-
-The text will be normalized, so the resulting text's UTF8 bytes may not exactly match the input.
-
-Argument | Type | Description | Default
----------|------|-------------|---------
-bytes | `[Byte]` | The UTF-8 bytes of the desired text. | -
-
-**Return:** A new text based on the input UTF8 bytes after normalization has been applied.
-
-
-**Example:**
-```tomo
->> Text.from_bytes([195, 133, 107, 101])
-= "Åke"
-
-```
## Text.from_c_string
```tomo
@@ -325,10 +281,35 @@ codepoint_names | `[Text]` | The names of each codepoint in the desired text (ca
= "Åke"
```
-## Text.from_codepoints
+## Text.from_utf16
+
+```tomo
+Text.from_utf16 : func(bytes: [Int16] -> Text?)
+```
+
+Returns text that has been constructed from the given UTF16 sequence.
+
+The text will be normalized, so the resulting text's UTF16 sequence may not exactly match the input.
+
+Argument | Type | Description | Default
+---------|------|-------------|---------
+bytes | `[Int16]` | The UTF-16 integers of the desired text. | -
+
+**Return:** A new text based on the input UTF16 sequence after normalization has been applied.
+
+
+**Example:**
+```tomo
+>> Text.from_utf16([197, 107, 101])
+= "Åke"
+>> Text.from_utf16([12371, 12435, 12395, 12385, 12399, 19990, 30028])
+= "こんにちは世界"
+
+```
+## Text.from_utf32
```tomo
-Text.from_codepoints : func(codepoints: [Int32] -> [Text])
+Text.from_utf32 : func(codepoints: [Int32] -> Text?)
```
Returns text that has been constructed from the given UTF32 codepoints.
@@ -344,7 +325,30 @@ codepoints | `[Int32]` | The UTF32 codepoints in the desired text. | -
**Example:**
```tomo
->> Text.from_codepoints([197, 107, 101])
+>> Text.from_utf32([197, 107, 101])
+= "Åke"
+
+```
+## Text.from_utf8
+
+```tomo
+Text.from_utf8 : func(bytes: [Byte] -> Text?)
+```
+
+Returns text that has been constructed from the given UTF8 bytes.
+
+The text will be normalized, so the resulting text's UTF8 bytes may not exactly match the input.
+
+Argument | Type | Description | Default
+---------|------|-------------|---------
+bytes | `[Byte]` | The UTF-8 bytes of the desired text. | -
+
+**Return:** A new text based on the input UTF8 bytes after normalization has been applied.
+
+
+**Example:**
+```tomo
+>> Text.from_utf8([195, 133, 107, 101])
= "Åke"
```
@@ -864,10 +868,33 @@ language | `Text` | The ISO 639 language code for which casing rules to use. |
= "İ"
```
-## Text.utf32_codepoints
+## Text.utf16
```tomo
-Text.utf32_codepoints : func(text: Text -> [Int32])
+Text.utf16 : func(text: Text -> [Int16])
+```
+
+Returns a list of UTF16 code units for the UTF16 encoding of the text.
+
+Argument | Type | Description | Default
+---------|------|-------------|---------
+text | `Text` | The text to encode as UTF16 code units. | -
+
+**Return:** A list of 16-bit integer UTF16 code units (`[Int16]`).
+
+
+**Example:**
+```tomo
+>> "Åke".utf16()
+= [197, 107, 101]
+>> "こんにちは世界".utf16()
+= [12371, 12435, 12395, 12385, 12399, 19990, 30028]
+
+```
+## Text.utf32
+
+```tomo
+Text.utf32 : func(text: Text -> [Int32])
```
Returns a list of Unicode code points for UTF32 encoding of the text.
@@ -881,10 +908,31 @@ text | `Text` | The text from which to extract Unicode code points. | -
**Example:**
```tomo
->> "Amélie".utf32_codepoints()
+>> "Amélie".utf32()
= [65, 109, 233, 108, 105, 101]
```
+## Text.utf8
+
+```tomo
+Text.utf8 : func(text: Text -> [Byte])
+```
+
+Converts a `Text` value to a list of bytes representing a UTF8 encoding of the text.
+
+Argument | Type | Description | Default
+---------|------|-------------|---------
+text | `Text` | The text to be converted to UTF8 bytes. | -
+
+**Return:** A list of bytes (`[Byte]`) representing the text in UTF8 encoding.
+
+
+**Example:**
+```tomo
+>> "Amélie".utf8()
+= [65, 109, 195, 169, 108, 105, 101]
+
+```
## Text.width
```tomo
diff --git a/api/text.yaml b/api/text.yaml
index c8d70f0b..d209f4b3 100644
--- a/api/text.yaml
+++ b/api/text.yaml
@@ -129,7 +129,7 @@ Text.by_split_any:
# Prints: "one" then "two" then "three":
say(chunk)
-Text.bytes:
+Text.utf8:
short: get UTF8 bytes
description: >
Converts a `Text` value to a list of bytes representing a UTF8 encoding of
@@ -144,7 +144,7 @@ Text.bytes:
description: >
The text to be converted to UTF8 bytes.
example: |
- >> "Amélie".bytes()
+ >> "Amélie".utf8()
= [65, 109, 195, 169, 108, 105, 101]
Text.caseless_equals:
@@ -255,7 +255,7 @@ Text.from:
>> "hello".from(-2)
= "lo"
-Text.from_bytes:
+Text.from_utf8:
short: convert UTF8 byte list to text
description: >
Returns text that has been constructed from the given UTF8 bytes.
@@ -272,9 +272,31 @@ Text.from_bytes:
description: >
The UTF-8 bytes of the desired text.
example: |
- >> Text.from_bytes([195, 133, 107, 101])
+ >> Text.from_utf8([195, 133, 107, 101])
= "Åke"
+Text.from_utf16:
+ short: convert UTF16 list to text
+ description: >
+ Returns text that has been constructed from the given UTF16 sequence.
+ note: >
+ The text will be normalized, so the resulting text's UTF16 sequence may not
+ exactly match the input.
+ return:
+ type: 'Text?'
+ description: >
+ A new text based on the input UTF16 sequence after normalization has been applied.
+ args:
+ bytes:
+ type: '[Int16]'
+ description: >
+ The UTF-16 integers of the desired text.
+ example: |
+ >> Text.from_utf16([197, 107, 101])
+ = "Åke"
+ >> Text.from_utf16([12371, 12435, 12395, 12385, 12399, 19990, 30028])
+ = "こんにちは世界"
+
Text.from_c_string:
short: convert C-style string to text
description: >
@@ -318,7 +340,7 @@ Text.from_codepoint_names:
]
= "Åke"
-Text.from_codepoints:
+Text.from_utf32:
short: convert UTF32 codepoints to text
description: >
Returns text that has been constructed from the given UTF32 codepoints.
@@ -335,7 +357,7 @@ Text.from_codepoints:
description: >
The UTF32 codepoints in the desired text.
example: |
- >> Text.from_codepoints([197, 107, 101])
+ >> Text.from_utf32([197, 107, 101])
= "Åke"
Text.has:
@@ -906,7 +928,26 @@ Text.upper:
>> "i".upper(language="tr_TR")
= "İ"
-Text.utf32_codepoints:
+Text.utf16:
+ short: get UTF16 code units
+ description: >
+ Returns a list of UTF16 code units for the UTF16 encoding of the text.
+ return:
+ type: '[Int16]'
+ description: >
+ A list of 16-bit integer UTF16 code units (`[Int16]`).
+ args:
+ text:
+ type: 'Text'
+ description: >
+ The text to encode as UTF16 code units.
+ example: |
+ >> "Åke".utf16()
+ = [197, 107, 101]
+ >> "こんにちは世界".utf16()
+ = [12371, 12435, 12395, 12385, 12399, 19990, 30028]
+
+Text.utf32:
short: get UTF32 codepoints
description: >
Returns a list of Unicode code points for UTF32 encoding of the text.
@@ -920,7 +961,7 @@ Text.utf32_codepoints:
description: >
The text from which to extract Unicode code points.
example: |
- >> "Amélie".utf32_codepoints()
+ >> "Amélie".utf32()
= [65, 109, 233, 108, 105, 101]
Text.width:
diff --git a/man/man3/tomo-Text.from_utf16.3 b/man/man3/tomo-Text.from_utf16.3
new file mode 100644
index 00000000..d4eaea02
--- /dev/null
+++ b/man/man3/tomo-Text.from_utf16.3
@@ -0,0 +1,39 @@
+'\" t
+.\" Copyright (c) 2025 Bruce Hill
+.\" All rights reserved.
+.\"
+.TH Text.from_utf16 3 2025-09-09 "Tomo man-pages"
+.SH NAME
+Text.from_utf16 \- convert UTF16 list to text
+.SH LIBRARY
+Tomo Standard Library
+.SH SYNOPSIS
+.nf
+.BI Text.from_utf16\ :\ func(bytes:\ [Int16]\ ->\ Text?)
+.fi
+.SH DESCRIPTION
+Returns text that has been constructed from the given UTF16 sequence.
+
+
+.SH ARGUMENTS
+
+.TS
+allbox;
+lb lb lbx lb
+l l l l.
+Name Type Description Default
+bytes [Int16] The UTF-16 integers of the desired text. -
+.TE
+.SH RETURN
+A new text based on the input UTF16 sequence after normalization has been applied.
+
+.SH NOTES
+The text will be normalized, so the resulting text's UTF16 sequence may not exactly match the input.
+
+.SH EXAMPLES
+.EX
+>> Text.from_utf16([197, 107, 101])
+= "Åke"
+>> Text.from_utf16([12371, 12435, 12395, 12385, 12399, 19990, 30028])
+= "こんにちは世界"
+.EE
diff --git a/man/man3/tomo-Text.from_codepoints.3 b/man/man3/tomo-Text.from_utf32.3
index d64abd98..31fc344f 100644
--- a/man/man3/tomo-Text.from_codepoints.3
+++ b/man/man3/tomo-Text.from_utf32.3
@@ -2,14 +2,14 @@
.\" Copyright (c) 2025 Bruce Hill
.\" All rights reserved.
.\"
-.TH Text.from_codepoints 3 2025-04-30 "Tomo man-pages"
+.TH Text.from_utf32 3 2025-09-09 "Tomo man-pages"
.SH NAME
-Text.from_codepoints \- convert UTF32 codepoints to text
+Text.from_utf32 \- convert UTF32 codepoints to text
.SH LIBRARY
Tomo Standard Library
.SH SYNOPSIS
.nf
-.BI Text.from_codepoints\ :\ func(codepoints:\ [Int32]\ ->\ [Text])
+.BI Text.from_utf32\ :\ func(codepoints:\ [Int32]\ ->\ Text?)
.fi
.SH DESCRIPTION
Returns text that has been constructed from the given UTF32 codepoints.
@@ -32,6 +32,6 @@ The text will be normalized, so the resulting text's codepoints may not exactly
.SH EXAMPLES
.EX
->> Text.from_codepoints([197, 107, 101])
+>> Text.from_utf32([197, 107, 101])
= "Åke"
.EE
diff --git a/man/man3/tomo-Text.from_bytes.3 b/man/man3/tomo-Text.from_utf8.3
index eec3843a..ead65dc6 100644
--- a/man/man3/tomo-Text.from_bytes.3
+++ b/man/man3/tomo-Text.from_utf8.3
@@ -2,14 +2,14 @@
.\" Copyright (c) 2025 Bruce Hill
.\" All rights reserved.
.\"
-.TH Text.from_bytes 3 2025-04-30 "Tomo man-pages"
+.TH Text.from_utf8 3 2025-09-09 "Tomo man-pages"
.SH NAME
-Text.from_bytes \- convert UTF8 byte list to text
+Text.from_utf8 \- convert UTF8 byte list to text
.SH LIBRARY
Tomo Standard Library
.SH SYNOPSIS
.nf
-.BI Text.from_bytes\ :\ func(bytes:\ [Byte]\ ->\ [Text])
+.BI Text.from_utf8\ :\ func(bytes:\ [Byte]\ ->\ Text?)
.fi
.SH DESCRIPTION
Returns text that has been constructed from the given UTF8 bytes.
@@ -32,6 +32,6 @@ The text will be normalized, so the resulting text's UTF8 bytes may not exactly
.SH EXAMPLES
.EX
->> Text.from_bytes([195, 133, 107, 101])
+>> Text.from_utf8([195, 133, 107, 101])
= "Åke"
.EE
diff --git a/man/man3/tomo-Text.utf16.3 b/man/man3/tomo-Text.utf16.3
new file mode 100644
index 00000000..2b3da2b1
--- /dev/null
+++ b/man/man3/tomo-Text.utf16.3
@@ -0,0 +1,36 @@
+'\" t
+.\" Copyright (c) 2025 Bruce Hill
+.\" All rights reserved.
+.\"
+.TH Text.utf16 3 2025-09-09 "Tomo man-pages"
+.SH NAME
+Text.utf16 \- get UTF16 code units
+.SH LIBRARY
+Tomo Standard Library
+.SH SYNOPSIS
+.nf
+.BI Text.utf16\ :\ func(text:\ Text\ ->\ [Int16])
+.fi
+.SH DESCRIPTION
+Returns a list of UTF16 code units for the UTF16 encoding of the text.
+
+
+.SH ARGUMENTS
+
+.TS
+allbox;
+lb lb lbx lb
+l l l l.
+Name Type Description Default
+text Text The text to encode as UTF16 code units. -
+.TE
+.SH RETURN
+A list of 16-bit integer UTF16 code units (`[Int16]`).
+
+.SH EXAMPLES
+.EX
+>> "Åke".utf16()
+= [197, 107, 101]
+>> "こんにちは世界".utf16()
+= [12371, 12435, 12395, 12385, 12399, 19990, 30028]
+.EE
diff --git a/man/man3/tomo-Text.utf32_codepoints.3 b/man/man3/tomo-Text.utf32.3
index 0ada8954..ff37ba9c 100644
--- a/man/man3/tomo-Text.utf32_codepoints.3
+++ b/man/man3/tomo-Text.utf32.3
@@ -2,14 +2,14 @@
.\" Copyright (c) 2025 Bruce Hill
.\" All rights reserved.
.\"
-.TH Text.utf32_codepoints 3 2025-04-30 "Tomo man-pages"
+.TH Text.utf32 3 2025-09-09 "Tomo man-pages"
.SH NAME
-Text.utf32_codepoints \- get UTF32 codepoints
+Text.utf32 \- get UTF32 codepoints
.SH LIBRARY
Tomo Standard Library
.SH SYNOPSIS
.nf
-.BI Text.utf32_codepoints\ :\ func(text:\ Text\ ->\ [Int32])
+.BI Text.utf32\ :\ func(text:\ Text\ ->\ [Int32])
.fi
.SH DESCRIPTION
Returns a list of Unicode code points for UTF32 encoding of the text.
@@ -29,6 +29,6 @@ A list of 32-bit integer Unicode code points (`[Int32]`).
.SH EXAMPLES
.EX
->> "Amélie".utf32_codepoints()
+>> "Amélie".utf32()
= [65, 109, 233, 108, 105, 101]
.EE
diff --git a/man/man3/tomo-Text.bytes.3 b/man/man3/tomo-Text.utf8.3
index f9203ef3..80a91fb9 100644
--- a/man/man3/tomo-Text.bytes.3
+++ b/man/man3/tomo-Text.utf8.3
@@ -2,14 +2,14 @@
.\" Copyright (c) 2025 Bruce Hill
.\" All rights reserved.
.\"
-.TH Text.bytes 3 2025-04-30 "Tomo man-pages"
+.TH Text.utf8 3 2025-09-09 "Tomo man-pages"
.SH NAME
-Text.bytes \- get UTF8 bytes
+Text.utf8 \- get UTF8 bytes
.SH LIBRARY
Tomo Standard Library
.SH SYNOPSIS
.nf
-.BI Text.bytes\ :\ func(text:\ Text\ ->\ [Byte])
+.BI Text.utf8\ :\ func(text:\ Text\ ->\ [Byte])
.fi
.SH DESCRIPTION
Converts a `Text` value to a list of bytes representing a UTF8 encoding of the text.
@@ -29,6 +29,6 @@ A list of bytes (`[Byte]`) representing the text in UTF8 encoding.
.SH EXAMPLES
.EX
->> "Amélie".bytes()
+>> "Amélie".utf8()
= [65, 109, 195, 169, 108, 105, 101]
.EE
diff --git a/src/compile/headers.c b/src/compile/headers.c
index 6dc69f03..33a979cf 100644
--- a/src/compile/headers.c
+++ b/src/compile/headers.c
@@ -171,7 +171,7 @@ Text_t compile_statement_type_header(env_t *env, Path_t header_path, ast_t *ast)
Path_t build_dir = Path$resolved(Path$parent(header_path), Path$current_dir());
switch (use->what) {
case USE_MODULE: {
- module_info_t mod = get_module_info(ast);
+ module_info_t mod = get_used_module_info(ast);
glob_t tm_files;
const char *folder = mod.version ? String(mod.name, "_", mod.version) : mod.name;
if (glob(String(TOMO_PATH, "/lib/tomo_" TOMO_VERSION "/", folder, "/[!._0-9]*.tm"), GLOB_TILDE, NULL,
diff --git a/src/compile/indexing.c b/src/compile/indexing.c
index 9d21eb4b..bb7bf6b9 100644
--- a/src/compile/indexing.c
+++ b/src/compile/indexing.c
@@ -54,7 +54,7 @@ Text_t compile_indexing(env_t *env, ast_t *ast, bool checked) {
if (table_type->default_value) {
return Texts("Table$get_or_default(", compile_to_pointer_depth(env, indexing->indexed, 0, false), ", ",
compile_type(table_type->key_type), ", ", compile_type(table_type->value_type), ", ",
- compile(env, indexing->index), ", ",
+ compile_to_type(env, indexing->index, table_type->key_type), ", ",
compile_to_type(env, table_type->default_value, table_type->value_type), ", ",
compile_type_info(container_t), ")");
} else if (checked) {
@@ -66,7 +66,7 @@ Text_t compile_indexing(env_t *env, ast_t *ast, bool checked) {
} else {
return Texts("Table$get_optional(", compile_to_pointer_depth(env, indexing->indexed, 0, false), ", ",
compile_type(table_type->key_type), ", ", compile_type(table_type->value_type), ", ",
- compile(env, indexing->index),
+ compile_to_type(env, indexing->index, table_type->key_type),
", "
"_, ",
promote_to_optional(table_type->value_type, Text("(*_)")), ", ",
diff --git a/src/compile/statements.c b/src/compile/statements.c
index bde9ae36..3fc44ac4 100644
--- a/src/compile/statements.c
+++ b/src/compile/statements.c
@@ -188,7 +188,7 @@ static Text_t _compile_statement(env_t *env, ast_t *ast) {
Text_t suffix = get_id_suffix(Path$as_c_string(path));
return with_source_info(env, ast, Texts("$initialize", suffix, "();\n"));
} else if (use->what == USE_MODULE) {
- module_info_t mod = get_module_info(ast);
+ module_info_t mod = get_used_module_info(ast);
glob_t tm_files;
const char *folder = mod.version ? String(mod.name, "_", mod.version) : mod.name;
if (glob(String(TOMO_PATH, "/lib/tomo_" TOMO_VERSION "/", folder, "/[!._0-9]*.tm"), GLOB_TILDE, NULL,
diff --git a/src/environment.c b/src/environment.c
index 7ac54a7a..b551abf1 100644
--- a/src/environment.c
+++ b/src/environment.c
@@ -343,16 +343,16 @@ env_t *global_env(bool source_mapping) {
{"by_line", "Text$by_line", "func(text:Text -> func(->Text?))"}, //
{"by_split", "Text$by_split", "func(text:Text, delimiter='' -> func(->Text?))"}, //
{"by_split_any", "Text$by_split_any", "func(text:Text, delimiters=' \\t\\r\\n' -> func(->Text?))"}, //
- {"bytes", "Text$utf8_bytes", "func(text:Text -> [Byte])"}, //
{"caseless_equals", "Text$equal_ignoring_case", "func(a,b:Text, language='C' -> Bool)"}, //
{"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"}, //
{"ends_with", "Text$ends_with", "func(text,suffix:Text, remainder:&Text? = none -> Bool)"}, //
{"from", "Text$from", "func(text:Text, first:Int -> Text)"}, //
- {"from_bytes", "Text$from_bytes", "func(bytes:[Byte] -> Text?)"}, //
{"from_c_string", "Text$from_str", "func(str:CString -> Text?)"}, //
{"from_codepoint_names", "Text$from_codepoint_names", "func(codepoint_names:[Text] -> Text?)"}, //
- {"from_codepoints", "Text$from_codepoints", "func(codepoints:[Int32] -> Text)"}, //
{"from_text", "Path$from_text", "func(text:Text -> Path)"}, //
+ {"from_utf8", "Text$from_utf8", "func(bytes:[Byte] -> Text?)"}, //
+ {"from_utf16", "Text$from_utf16", "func(codepoints:[Int16] -> Text?)"}, //
+ {"from_utf32", "Text$from_utf32", "func(codepoints:[Int32] -> Text?)"}, //
{"has", "Text$has", "func(text:Text, target:Text -> Bool)"}, //
{"join", "Text$join", "func(glue:Text, pieces:[Text] -> Text)"}, //
{"layout", "Text$layout", "func(text:Text -> Text)"}, //
@@ -375,7 +375,9 @@ env_t *global_env(bool source_mapping) {
{"translate", "Text$translate", "func(text:Text, translations:{Text=Text} -> Text)"}, //
{"trim", "Text$trim", "func(text:Text, to_trim=\" \t\r\n\", left=yes, right=yes -> Text)"}, //
{"upper", "Text$upper", "func(text:Text, language='C' -> Text)"}, //
- {"utf32_codepoints", "Text$utf32_codepoints", "func(text:Text -> [Int32])"}, //
+ {"utf8", "Text$utf8", "func(text:Text -> [Byte])"}, //
+ {"utf16", "Text$utf16", "func(text:Text -> [Int16])"}, //
+ {"utf32", "Text$utf32", "func(text:Text -> [Int32])"}, //
{"width", "Text$width", "func(text:Text, language='C' -> Int)"}, //
{"without_prefix", "Text$without_prefix", "func(text,prefix:Text -> Text)"}, //
{"without_suffix", "Text$without_suffix", "func(text,suffix:Text -> Text)"}),
diff --git a/src/formatter/formatter.c b/src/formatter/formatter.c
index b68b3874..2c52aa63 100644
--- a/src/formatter/formatter.c
+++ b/src/formatter/formatter.c
@@ -54,7 +54,7 @@ PUREFUNC text_opts_t choose_text_options(ast_list_t *chunks) {
}
static bool starts_with_id(Text_t text) {
- List_t codepoints = Text$utf32_codepoints(Text$slice(text, I_small(1), I_small(1)));
+ List_t codepoints = Text$utf32(Text$slice(text, I_small(1), I_small(1)));
return uc_is_property_xid_continue(*(ucs4_t *)codepoints.data);
}
diff --git a/src/modules.c b/src/modules.c
index fafbbf86..9c562387 100644
--- a/src/modules.c
+++ b/src/modules.c
@@ -24,6 +24,28 @@
errx(1, "Failed to run command: %s", String(__VA_ARGS__)); \
})
+const char *get_library_version(Path_t lib_dir) { // Version string = text of the first "## " heading in lib_dir/CHANGES.md
+ Path_t changes_file = Path$child(lib_dir, Text("CHANGES.md"));
+ OptionalText_t changes = Path$read(changes_file);
+ if (changes.length <= 0) { // Nothing usable was read (none or empty text): fall back to a default version
+ return "v0.0";
+ }
+ const char *changes_str = Text$as_c_string(Texts(Text("\n"), changes)); // Prepend "\n" so a heading on the very first line also matches "\n## "
+ const char *version_line = strstr(changes_str, "\n## ");
+ if (version_line == NULL) // NOTE(review): presumably print_err() does not return; otherwise the return below dereferences NULL -- confirm
+ print_err("CHANGES.md in ", lib_dir, " does not have any valid versions starting with '## '");
+ return String(string_slice(version_line + 4, strcspn(version_line + 4, "\r\n"))); // Skip the 4-char "\n## " marker and stop at the first CR or LF
+}
+
+Text_t get_library_name(Path_t lib_dir) { // Library name = directory base name without "tomo-"/"-tomo", ending in "_<version>"
+ Text_t name = Path$base_name(lib_dir);
+ name = Text$without_prefix(name, Text("tomo-"));
+ name = Text$without_suffix(name, Text("-tomo"));
+ Text_t suffix = Texts(Text("_"), Text$from_str(get_library_version(lib_dir))); // "_" followed by the version taken from CHANGES.md
+ if (!Text$ends_with(name, suffix, NULL)) name = Texts(name, suffix); // Append the version suffix only if the name does not already end with it
+ return name;
+}
+
bool install_from_modules_ini(Path_t ini_file, bool ask_confirmation) {
OptionalClosure_t by_line = Path$by_line(ini_file);
if (by_line.fn == NULL) return false;
@@ -72,7 +94,7 @@ find_section:;
}
}
-module_info_t get_module_info(ast_t *use) {
+module_info_t get_used_module_info(ast_t *use) {
static Table_t cache = {};
TypeInfo_t *cache_type = Table$info(Pointer$info("@", &Memory$info), Pointer$info("@", &Memory$info));
module_info_t **cached = Table$get(cache, &use, cache_type);
@@ -90,6 +112,8 @@ bool try_install_module(module_info_t mod, bool ask_confirmation) {
"_", Text$from_str(mod.version)));
if (Path$exists(dest)) return true;
+ print("No such path: ", dest);
+
if (mod.git) {
if (ask_confirmation) {
OptionalText_t answer =
diff --git a/src/modules.h b/src/modules.h
index c36d96dd..98911ec9 100644
--- a/src/modules.h
+++ b/src/modules.h
@@ -10,6 +10,8 @@ typedef struct {
const char *name, *version, *url, *git, *revision, *path;
} module_info_t;
-module_info_t get_module_info(ast_t *use);
+Text_t get_library_name(Path_t lib_dir);
+const char *get_library_version(Path_t lib_dir);
+module_info_t get_used_module_info(ast_t *use);
bool install_from_modules_ini(Path_t ini_file, bool ask_confirmation);
bool try_install_module(module_info_t mod, bool ask_confirmation);
diff --git a/src/stdlib/paths.c b/src/stdlib/paths.c
index 3de329a9..385f3bdf 100644
--- a/src/stdlib/paths.c
+++ b/src/stdlib/paths.c
@@ -290,7 +290,7 @@ static void _write(Path_t path, List_t bytes, int mode, int permissions) {
public
void Path$write(Path_t path, Text_t text, int permissions) {
- List_t bytes = Text$utf8_bytes(text);
+ List_t bytes = Text$utf8(text);
_write(path, bytes, O_WRONLY | O_CREAT | O_TRUNC, permissions);
}
@@ -301,7 +301,7 @@ void Path$write_bytes(Path_t path, List_t bytes, int permissions) {
public
void Path$append(Path_t path, Text_t text, int permissions) {
- List_t bytes = Text$utf8_bytes(text);
+ List_t bytes = Text$utf8(text);
_write(path, bytes, O_WRONLY | O_APPEND | O_CREAT, permissions);
}
@@ -367,7 +367,7 @@ public
OptionalText_t Path$read(Path_t path) {
List_t bytes = Path$read_bytes(path, NONE_INT);
if (bytes.length < 0) return NONE_TEXT;
- return Text$from_bytes(bytes);
+ return Text$from_utf8(bytes);
}
public
@@ -537,7 +537,7 @@ Path_t Path$write_unique_bytes(Path_t path, List_t bytes) {
}
public
-Path_t Path$write_unique(Path_t path, Text_t text) { return Path$write_unique_bytes(path, Text$utf8_bytes(text)); }
+Path_t Path$write_unique(Path_t path, Text_t text) { return Path$write_unique_bytes(path, Text$utf8(text)); }
public
Path_t Path$parent(Path_t path) {
diff --git a/src/stdlib/text.c b/src/stdlib/text.c
index bc5d4af3..dace7c9f 100644
--- a/src/stdlib/text.c
+++ b/src/stdlib/text.c
@@ -47,8 +47,8 @@
// (U+1F680) followed by THUMBS UP (U+1F44D), it will render on your screen as
// two things: a female astronaut and a thumbs up, and this is how most people
// will think about the text. If you wish to operate on the raw codepoints that
-// comprise the message, you are free to do so with the `.utf32_codepoints()`
-// method and `Text.from_codepoints()`, but this is not the default behavior.
+// comprise the message, you are free to do so with the `.utf32()`
+// method and `Text.from_utf32()`, but this is not the default behavior.
// The behavior for the given example is that `text.length == 2`, `text[1]` is
// the grapheme cluster representing a female astronaut emoji, and `text[2]` is
// the grapheme cluster representing the thumbs up emoji.
@@ -244,7 +244,7 @@ int32_t get_synthetic_grapheme(const ucs4_t *codepoints, int64_t utf32_len) {
synthetic_graphemes[-grapheme_id - 1].utf32_cluster = codepoint_copy;
arena += sizeof(ucs4_t[1 + utf32_len]);
- // Copy UTF8 bytes into the arena and store where they live:
+ // Copy UTF8 units into the arena and store where they live:
uint8_t *utf8_final = arena;
memcpy(utf8_final, u8, sizeof(uint8_t[u8_len]));
utf8_final[u8_len] = '\0'; // Add a terminating NUL byte
@@ -520,8 +520,9 @@ static Text_t concat2(Text_t a, Text_t b) {
return concat2_assuming_safe(a, b);
}
- Text_t glue =
- Text$from_codepoints((List_t){.data = norm_buf, .length = (int64_t)norm_length, .stride = sizeof(int32_t)});
+ OptionalText_t glue =
+ Text$from_utf32((List_t){.data = norm_buf, .length = (int64_t)norm_length, .stride = sizeof(int32_t)});
+ assert(glue.length >= 0);
if (normalized != norm_buf) free(normalized);
@@ -818,7 +819,7 @@ OptionalText_t Text$from_strn(const char *str, size_t len) {
public
OptionalText_t Text$from_str(const char *str) { return str ? Text$from_strn(str, strlen(str)) : Text(""); }
-static void u8_buf_append(Text_t text, char **buf, int64_t *capacity, int64_t *i) {
+static void u8_buf_append(Text_t text, Byte_t **buf, int64_t *capacity, int64_t *i) {
switch (text.tag) {
case TEXT_ASCII: {
if (*i + text.length > (int64_t)*capacity) {
@@ -905,7 +906,7 @@ static void u8_buf_append(Text_t text, char **buf, int64_t *capacity, int64_t *i
public
char *Text$as_c_string(Text_t text) {
int64_t capacity = text.length + 1;
- char *buf = GC_MALLOC_ATOMIC((size_t)capacity);
+ Byte_t *buf = GC_MALLOC_ATOMIC((size_t)capacity);
int64_t i = 0;
u8_buf_append(text, &buf, &capacity, &i);
@@ -914,7 +915,7 @@ char *Text$as_c_string(Text_t text) {
buf = GC_REALLOC(buf, (size_t)capacity);
}
buf[i] = '\0';
- return buf;
+ return (char *)buf;
}
PUREFUNC public uint64_t Text$hash(const void *obj, const TypeInfo_t *info) {
@@ -1362,33 +1363,39 @@ PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b, Text_t languag
public
Text_t Text$upper(Text_t text, Text_t language) {
if (text.length == 0) return text;
- List_t codepoints = Text$utf32_codepoints(text);
+ List_t codepoints = Text$utf32(text);
const char *uc_language = Text$as_c_string(language);
size_t out_len = 0;
ucs4_t *upper = u32_toupper(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, NULL, &out_len);
- Text_t ret = Text$from_codepoints((List_t){.data = upper, .length = (int64_t)out_len, .stride = sizeof(int32_t)});
+ OptionalText_t ret =
+ Text$from_utf32((List_t){.data = upper, .length = (int64_t)out_len, .stride = sizeof(int32_t)});
+ assert(ret.length >= 0);
return ret;
}
public
Text_t Text$lower(Text_t text, Text_t language) {
if (text.length == 0) return text;
- List_t codepoints = Text$utf32_codepoints(text);
+ List_t codepoints = Text$utf32(text);
const char *uc_language = Text$as_c_string(language);
size_t out_len = 0;
ucs4_t *lower = u32_tolower(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, NULL, &out_len);
- Text_t ret = Text$from_codepoints((List_t){.data = lower, .length = (int64_t)out_len, .stride = sizeof(int32_t)});
+ OptionalText_t ret =
+ Text$from_utf32((List_t){.data = lower, .length = (int64_t)out_len, .stride = sizeof(int32_t)});
+ assert(ret.length >= 0);
return ret;
}
public
Text_t Text$title(Text_t text, Text_t language) {
if (text.length == 0) return text;
- List_t codepoints = Text$utf32_codepoints(text);
+ List_t codepoints = Text$utf32(text);
const char *uc_language = Text$as_c_string(language);
size_t out_len = 0;
ucs4_t *title = u32_totitle(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, NULL, &out_len);
- Text_t ret = Text$from_codepoints((List_t){.data = title, .length = (int64_t)out_len, .stride = sizeof(int32_t)});
+ OptionalText_t ret =
+ Text$from_utf32((List_t){.data = title, .length = (int64_t)out_len, .stride = sizeof(int32_t)});
+ assert(ret.length >= 0);
return ret;
}
@@ -1544,7 +1551,33 @@ List_t Text$clusters(Text_t text) {
}
public
-List_t Text$utf32_codepoints(Text_t text) {
+List_t Text$utf8(Text_t text) { // Returns the UTF-8 encoding of `text` as a list with 1-byte stride.
+ int64_t capacity = text.length + 1; // initial buffer size; handed to u8_buf_append by pointer
+ Byte_t *buf = GC_MALLOC_ATOMIC((size_t)capacity);
+ int64_t i = 0; // write position; after the append it is used as the byte count
+ u8_buf_append(text, &buf, &capacity, &i); // buf/capacity are passed by pointer, presumably so the helper can grow the buffer
+ return (List_t){.data = buf, .length = i, .stride = 1, .atomic = 1}; // no NUL terminator is written here, unlike Text$as_c_string
+}
+
+// Encode `text` as a list of UTF-16 code units (2-byte entries).
+// Each UTF-32 codepoint is converted on its own, yielding one unit or a surrogate pair.
+// Empty text yields an empty list; fails if a codepoint cannot be converted.
+public
+List_t Text$utf16(Text_t text) {
+ if (text.length == 0) return (List_t){};
+ List_t utf32 = Text$utf32(text);
+ List_t utf16 = {.free = MIN(LIST_MAX_FREE_ENTRIES, (uint64_t)utf32.length), .atomic = 1};
+ // Entries are 16-bit code units, so size the preallocated slots as uint16_t (not int32_t):
+ utf16.data = GC_MALLOC_ATOMIC(sizeof(uint16_t[utf16.free]));
+ for (int64_t i = 0; i < utf32.length; i++) {
+ // One codepoint needs at most two UTF-16 units, so this stack buffer always suffices
+ // and u32_to_u16() never has to allocate:
+ uint16_t u16_buf[4];
+ size_t u16_len = sizeof(u16_buf) / sizeof(u16_buf[0]);
+ uint16_t *chunk_u16 = u32_to_u16(utf32.data + utf32.stride * i, 1, u16_buf, &u16_len);
+ if (chunk_u16 == NULL) fail("Invalid codepoints encountered!");
+ List$insert_all(&utf16, (List_t){.data = u16_buf, .stride = sizeof(uint16_t), .length = (int64_t)u16_len}, I(0),
+ sizeof(uint16_t));
+ }
+ return utf16;
+}
+
+public
+List_t Text$utf32(Text_t text) {
List_t codepoints = {.atomic = 1};
TextIter_t state = NEW_TEXT_ITER_STATE(text);
for (int64_t i = 0; i < text.length; i++) {
@@ -1561,12 +1594,6 @@ List_t Text$utf32_codepoints(Text_t text) {
return codepoints;
}
-public
-List_t Text$utf8_bytes(Text_t text) {
- const char *str = Text$as_c_string(text);
- return (List_t){.length = (int64_t)strlen(str), .stride = 1, .atomic = 1, .data = (void *)str};
-}
-
static INLINE const char *codepoint_name(ucs4_t c) {
char *name = GC_MALLOC_ATOMIC(UNINAME_MAX);
char *found_name = unicode_character_name(c, name);
@@ -1598,7 +1625,28 @@ List_t Text$codepoint_names(Text_t text) {
}
public
-Text_t Text$from_codepoints(List_t codepoints) {
+OptionalText_t Text$from_utf8(List_t units) { // Builds text from a list of UTF-8 bytes by delegating to Text$from_strn.
+ if (units.stride != sizeof(int8_t)) List$compact(&units, sizeof(int8_t)); // presumably repacks to 1-byte stride so .data can be read as a plain byte buffer
+ return Text$from_strn(units.data, (size_t)units.length); // length is passed explicitly; test/text.tm expects none for the invalid input [0xFF]
+}
+
+// Decode a list of UTF-16 code units into text.
+// Returns EMPTY_TEXT for an empty list and NONE_TEXT when the units cannot be
+// converted to UTF-8 (u16_to_u8() reports failure by returning NULL).
+public
+OptionalText_t Text$from_utf16(List_t units) {
+ if (units.length == 0) return EMPTY_TEXT;
+ if (units.stride != sizeof(int16_t)) List$compact(&units, sizeof(int16_t));
+
+ // Try a stack buffer first; u16_to_u8() falls back to malloc() if it is too small:
+ uint8_t buf[256];
+ size_t length = sizeof(buf);
+ uint8_t *u8 = u16_to_u8(units.data, (size_t)units.length, buf, &length);
+ // On failure `u8` is NULL and `length` is not meaningful, so bail out before using them:
+ if (u8 == NULL) return NONE_TEXT;
+ OptionalText_t ret =
+ Text$from_utf8((List_t){.data = u8, .length = (int64_t)length, .stride = sizeof(uint8_t), .atomic = 1});
+ if (u8 != buf) free(u8);
+ return ret;
+}
+
+public
+OptionalText_t Text$from_utf32(List_t codepoints) {
+ if (codepoints.length == 0) return EMPTY_TEXT;
if (codepoints.stride != sizeof(uint32_t)) List$compact(&codepoints, sizeof(uint32_t));
List_t graphemes = {};
@@ -1610,6 +1658,7 @@ Text_t Text$from_codepoints(List_t codepoints) {
// Buffer for normalized cluster:
uint32_t buf[256];
size_t u32_normlen = sizeof(buf) / sizeof(buf[0]);
+ if (u32_check(pos, (size_t)(next - pos)) != NULL) return NONE_TEXT;
uint32_t *u32s_normalized = u32_normalize(UNINORM_NFC, pos, (size_t)(next - pos), buf, &u32_normlen);
int32_t g = get_synthetic_grapheme(u32s_normalized, (int64_t)u32_normlen);
@@ -1625,8 +1674,9 @@ Text_t Text$from_codepoints(List_t codepoints) {
.data = (void *)next,
.stride = sizeof(int32_t),
};
- return concat2_assuming_safe(Text$from_components(graphemes, unique_clusters),
- Text$from_codepoints(remaining_codepoints));
+ OptionalText_t remainder = Text$from_utf32(remaining_codepoints);
+ if (remainder.length < 0) return NONE_TEXT;
+ return concat2_assuming_safe(Text$from_components(graphemes, unique_clusters), remainder);
}
}
return Text$from_components(graphemes, unique_clusters);
@@ -1642,14 +1692,7 @@ OptionalText_t Text$from_codepoint_names(List_t codepoint_names) {
if (codepoint == UNINAME_INVALID) return NONE_TEXT;
List$insert(&codepoints, &codepoint, I_small(0), sizeof(ucs4_t));
}
- return Text$from_codepoints(codepoints);
-}
-
-public
-OptionalText_t Text$from_bytes(List_t bytes) {
- if (bytes.stride != sizeof(int8_t)) List$compact(&bytes, sizeof(int8_t));
-
- return Text$from_strn(bytes.data, (size_t)bytes.length);
+ return Text$from_utf32(codepoints);
}
public
diff --git a/src/stdlib/text.h b/src/stdlib/text.h
index 0b341eb2..281c1880 100644
--- a/src/stdlib/text.h
+++ b/src/stdlib/text.h
@@ -92,12 +92,14 @@ Closure_t Text$by_split_any(Text_t text, Text_t delimiters);
Text_t Text$trim(Text_t text, Text_t to_trim, bool left, bool right);
char *Text$as_c_string(Text_t text);
List_t Text$clusters(Text_t text);
-List_t Text$utf32_codepoints(Text_t text);
-List_t Text$utf8_bytes(Text_t text);
+List_t Text$utf8(Text_t text);
+List_t Text$utf16(Text_t text);
+List_t Text$utf32(Text_t text);
List_t Text$codepoint_names(Text_t text);
-Text_t Text$from_codepoints(List_t codepoints);
+OptionalText_t Text$from_utf8(List_t units);
+OptionalText_t Text$from_utf16(List_t units);
+OptionalText_t Text$from_utf32(List_t codepoints);
OptionalText_t Text$from_codepoint_names(List_t codepoint_names);
-OptionalText_t Text$from_bytes(List_t bytes);
List_t Text$lines(Text_t text);
Closure_t Text$by_line(Text_t text);
Text_t Text$join(Text_t glue, List_t pieces);
diff --git a/src/tomo.c b/src/tomo.c
index e6636cfc..7bcd4b60 100644
--- a/src/tomo.c
+++ b/src/tomo.c
@@ -88,10 +88,9 @@ static OptionalText_t show_codegen = NONE_TEXT,
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__APPLE__)
" -D_BSD_SOURCE"
#endif
- " -DGC_THREADS"
- " -I/usr/local/include"),
- ldlibs = Text("-lgc -lm -lgmp -lunistring -ltomo_" TOMO_VERSION),
- ldflags = Text(" -L/usr/local/lib"), optimization = Text("2"), cc = Text(DEFAULT_C_COMPILER);
+ " -DGC_THREADS"),
+ ldlibs = Text("-lgc -lm -lgmp -lunistring -ltomo_" TOMO_VERSION), ldflags = Text(""),
+ optimization = Text("2"), cc = Text(DEFAULT_C_COMPILER);
static Text_t config_summary,
// This will be either "" or "sudo -u <user>" or "doas -u <user>"
@@ -265,7 +264,7 @@ int main(int argc, char *argv[]) {
cflags = Texts(cflags, Text(" -Wno-parentheses-equality"));
}
- ldflags = Texts("-Wl,-rpath,'", TOMO_PATH, "/lib',-rpath,/usr/local/lib ", ldflags);
+ ldflags = Texts("-Wl,-rpath,'", TOMO_PATH, "/lib' ", ldflags);
#ifdef __APPLE__
cflags = Texts(cflags, Text(" -I/opt/homebrew/include"));
@@ -275,7 +274,8 @@ int main(int argc, char *argv[]) {
if (show_codegen.length > 0 && Text$equal_values(show_codegen, Text("pretty")))
show_codegen = Text("{ sed '/^#line/d;/^$/d' | clang-format | bat -l c -P; }");
- config_summary = Text$from_str(String(cc, " ", cflags, " -O", optimization));
+ config_summary = Texts("TOMO_VERSION=", TOMO_VERSION, "\n", "COMPILER=", cc, " ", cflags, " -O", optimization, "\n",
+ "SOURCE_MAPPING=", source_mapping ? Text("yes") : Text("no"), "\n");
Text_t owner = Path$owner(Path$from_str(TOMO_PATH), true);
Text_t user = Text$from_str(getenv("USER"));
@@ -411,26 +411,6 @@ Path_t build_file(Path_t path, const char *extension) {
return Path$child(build_dir, Texts(Path$base_name(path), Text$from_str(extension)));
}
-static const char *get_version(Path_t lib_dir) {
- Path_t changes_file = Path$child(lib_dir, Text("CHANGES.md"));
- OptionalText_t changes = Path$read(changes_file);
- if (changes.length <= 0) {
- return "v0.0";
- }
- const char *changes_str = Text$as_c_string(Texts(Text("\n"), changes));
- const char *version_line = strstr(changes_str, "\n## ");
- if (version_line == NULL)
- print_err("CHANGES.md in ", lib_dir, " does not have any valid versions starting with '## '");
- return String(string_slice(version_line + 4, strcspn(version_line + 4, "\r\n")));
-}
-
-static Path_t with_version_suffix(Path_t lib_dir) {
- Text_t suffix = Texts(Text("_"), Text$from_str(get_version(lib_dir)));
- return Text$ends_with(Path$base_name(lib_dir), suffix, NULL)
- ? lib_dir
- : Path$sibling(lib_dir, Texts(Path$base_name(lib_dir), suffix));
-}
-
void build_library(Path_t lib_dir) {
lib_dir = Path$resolved(lib_dir, Path$current_dir());
if (!Path$is_directory(lib_dir, true)) print_err("Not a valid directory: ", lib_dir);
@@ -441,8 +421,8 @@ void build_library(Path_t lib_dir) {
compile_files(env, tm_files, &object_files, &extra_ldlibs);
- Text_t versioned_dir = Path$base_name(with_version_suffix(lib_dir));
- Path_t shared_lib = Path$child(lib_dir, Texts(Text("lib"), versioned_dir, Text(SHARED_SUFFIX)));
+ Text_t lib_name = get_library_name(lib_dir);
+ Path_t shared_lib = Path$child(lib_dir, Texts(Text("lib"), lib_name, Text(SHARED_SUFFIX)));
if (!is_stale_for_any(shared_lib, object_files, false)) {
if (verbose) whisper("Unchanged: ", shared_lib);
return;
@@ -453,7 +433,7 @@ void build_library(Path_t lib_dir) {
" -Wl,-install_name,@rpath/'lib", Path$base_name(lib_dir), version_suffix, SHARED_SUFFIX,
"'"
#else
- " -Wl,-soname,'lib", versioned_dir, SHARED_SUFFIX,
+ " -Wl,-soname,'lib", lib_name, SHARED_SUFFIX,
"'"
#endif
" -shared ",
@@ -467,12 +447,11 @@ void build_library(Path_t lib_dir) {
}
void install_library(Path_t lib_dir) {
- Text_t lib_dir_name = Path$base_name(lib_dir);
- Text_t versioned_dir = Path$base_name(with_version_suffix(lib_dir));
- Path_t dest = Path$child(Path$from_str(String(TOMO_PATH, "/lib/tomo_" TOMO_VERSION)), versioned_dir);
+ Text_t lib_name = get_library_name(lib_dir);
+ Path_t dest = Path$child(Path$from_str(String(TOMO_PATH, "/lib/tomo_" TOMO_VERSION)), lib_name);
print("Installing ", lib_dir, " into ", dest);
if (!Path$equal_values(lib_dir, dest)) {
- if (verbose) whisper("Clearing out any pre-existing version of ", lib_dir_name);
+ if (verbose) whisper("Clearing out any pre-existing version of ", lib_name);
xsystem(as_owner, "rm -rf '", dest, "'");
if (verbose) whisper("Moving files to ", dest);
xsystem(as_owner, "mkdir -p '", dest, "'");
@@ -481,15 +460,15 @@ void install_library(Path_t lib_dir) {
}
// If we have `debugedit` on this system, use it to remap the debugging source information
// to point to the installed version of the source file. Otherwise, fail silently.
- if (verbose) whisper("Updating debug symbols for ", dest, "/lib", lib_dir_name, SHARED_SUFFIX);
+ if (verbose) whisper("Updating debug symbols for ", dest, "/lib", lib_name, SHARED_SUFFIX);
int result = system(String(as_owner, "debugedit -b ", lib_dir, " -d '", dest,
"'"
" '",
- dest, "/lib", versioned_dir, SHARED_SUFFIX,
+ dest, "/lib", lib_name, SHARED_SUFFIX,
"' "
">/dev/null 2>/dev/null"));
(void)result;
- print("Installed \033[1m", lib_dir_name, "\033[m to ", TOMO_PATH, "/lib/tomo_" TOMO_VERSION "/", versioned_dir);
+ print("Installed \033[1m", lib_dir, "\033[m to ", TOMO_PATH, "/lib/tomo_" TOMO_VERSION "/", lib_name);
}
void compile_files(env_t *env, List_t to_compile, List_t *object_files, List_t *extra_ldlibs) {
@@ -647,7 +626,7 @@ void build_file_dependency_graph(Path_t path, Table_t *to_compile, Table_t *to_l
break;
}
case USE_MODULE: {
- module_info_t mod = get_module_info(stmt_ast);
+ module_info_t mod = get_used_module_info(stmt_ast);
const char *full_name = mod.version ? String(mod.name, "_", mod.version) : mod.name;
Text_t lib = Texts("-Wl,-rpath,'", TOMO_PATH, "/lib/tomo_" TOMO_VERSION "/", Text$from_str(full_name),
"' '", TOMO_PATH, "/lib/tomo_" TOMO_VERSION "/", Text$from_str(full_name), "/lib",
@@ -802,7 +781,7 @@ void transpile_code(env_t *base_env, Path_t path) {
Text$print(c_file, c_code);
- const char *version = get_version(Path$parent(path));
+ const char *version = get_library_version(Path$parent(path));
binding_t *main_binding = get_binding(module_env, "main");
if (main_binding && main_binding->type->tag == FunctionType) {
type_t *ret = Match(main_binding->type, FunctionType)->ret;
@@ -857,8 +836,6 @@ Path_t compile_executable(env_t *base_env, Path_t path, Path_t exe_path, List_t
return exe_path;
}
- FILE *runner = run_cmd(cc, " ", cflags, " -O", optimization, " ", ldflags, " ", ldlibs, " ",
- list_text(extra_ldlibs), " ", paths_str(object_files), " -x c - -o ", exe_path);
Text_t program = Texts("extern int parse_and_run$$", main_binding->code,
"(int argc, char *argv[]);\n"
"__attribute__ ((noinline))\n"
@@ -867,6 +844,11 @@ Path_t compile_executable(env_t *base_env, Path_t path, Path_t exe_path, List_t
main_binding->code,
"(argc, argv);\n"
"}\n");
+ Path_t runner_file = build_file(path, ".runner.c");
+ Path$write(runner_file, program, 0644);
+
+ FILE *runner = run_cmd(cc, " ", cflags, " -O", optimization, " ", ldflags, " ", ldlibs, " ",
+ list_text(extra_ldlibs), " ", paths_str(object_files), " ", runner_file, " -o ", exe_path);
if (show_codegen.length > 0) {
FILE *out = run_cmd(show_codegen);
diff --git a/src/typecheck.c b/src/typecheck.c
index d7e87e65..e9968f09 100644
--- a/src/typecheck.c
+++ b/src/typecheck.c
@@ -165,15 +165,15 @@ PUREFUNC type_t *get_math_type(env_t *env, ast_t *ast, type_t *lhs_t, type_t *rh
return NULL;
}
-static env_t *load_module(env_t *env, ast_t *module_ast) {
- DeclareMatch(use, module_ast, Use);
+static env_t *load_module(env_t *env, ast_t *use_ast) {
+ DeclareMatch(use, use_ast, Use);
switch (use->what) {
case USE_LOCAL: {
- Path_t source_path = Path$from_str(module_ast->file->filename);
+ Path_t source_path = Path$from_str(use_ast->file->filename);
Path_t source_dir = Path$parent(source_path);
Path_t used_path = Path$resolved(Path$from_str(use->path), source_dir);
- if (!Path$exists(used_path)) code_err(module_ast, "No such file exists: ", quoted(use->path));
+ if (!Path$exists(used_path)) code_err(use_ast, "No such file exists: ", quoted(use->path));
env_t *module_env = Table$str_get(*env->imports, String(used_path));
if (module_env) return module_env;
@@ -183,12 +183,12 @@ static env_t *load_module(env_t *env, ast_t *module_ast) {
return load_module_env(env, ast);
}
case USE_MODULE: {
- module_info_t mod = get_module_info(module_ast);
+ module_info_t mod = get_used_module_info(use_ast);
glob_t tm_files;
const char *folder = mod.version ? String(mod.name, "_", mod.version) : mod.name;
if (glob(String(TOMO_PATH, "/lib/tomo_" TOMO_VERSION "/", folder, "/[!._0-9]*.tm"), GLOB_TILDE, NULL, &tm_files)
!= 0) {
- if (!try_install_module(mod, true)) code_err(module_ast, "Couldn't find or install library: ", folder);
+ if (!try_install_module(mod, true)) code_err(use_ast, "Couldn't find or install library: ", folder);
}
env_t *module_env = fresh_scope(env);
diff --git a/test/text.tm b/test/text.tm
index a32db2ce..46e1ae41 100644
--- a/test/text.tm
+++ b/test/text.tm
@@ -54,21 +54,21 @@ func main()
amelie := "Am\{UE9}lie"
>> amelie.split()
= ["A", "m", "é", "l", "i", "e"]
- >> amelie.utf32_codepoints()
+ >> amelie.utf32()
= [65, 109, 233, 108, 105, 101]
- >> amelie.bytes()
+ >> amelie.utf8()
= [0x41, 0x6D, 0xC3, 0xA9, 0x6C, 0x69, 0x65]
- >> Text.from_bytes([0x41, 0x6D, 0xC3, 0xA9, 0x6C, 0x69, 0x65])!
+ >> Text.from_utf8([0x41, 0x6D, 0xC3, 0xA9, 0x6C, 0x69, 0x65])!
= "Amélie"
- >> Text.from_bytes([Byte(0xFF)])
+ >> Text.from_utf8([Byte(0xFF)])
= none
amelie2 := "Am\{U65}\{U301}lie"
>> amelie2.split()
= ["A", "m", "é", "l", "i", "e"]
- >> amelie2.utf32_codepoints()
+ >> amelie2.utf32()
= [65, 109, 233, 108, 105, 101]
- >> amelie2.bytes()
+ >> amelie2.utf8()
= [0x41, 0x6D, 0xC3, 0xA9, 0x6C, 0x69, 0x65]
>> amelie.codepoint_names()
@@ -123,8 +123,8 @@ func main()
>> c.codepoint_names()
= ["LATIN CAPITAL LETTER E WITH ACUTE", "COMBINING VERTICAL LINE BELOW"]
assert c == Text.from_codepoint_names(c.codepoint_names())!
- assert c == Text.from_codepoints(c.utf32_codepoints())
- assert c == Text.from_bytes(c.bytes())!
+ assert c == Text.from_utf32(c.utf32())!
+ assert c == Text.from_utf8(c.utf8())!
>> "one\ntwo\nthree".lines()
= ["one", "two", "three"]
@@ -194,7 +194,7 @@ func main()
= 1
>> house.codepoint_names()
= ["CJK Unified Ideographs-5BB6"]
- >> house.utf32_codepoints()
+ >> house.utf32()
= [23478]
>> "🐧".codepoint_names()
@@ -253,24 +253,24 @@ func main()
do
- concat := "e" ++ Text.from_codepoints([Int32(0x300)])
+ concat := "e" ++ Text.from_utf32([Int32(0x300)])!
>> concat.length
= 1
- concat2 := concat ++ Text.from_codepoints([Int32(0x302)])
+ concat2 := concat ++ Text.from_utf32([Int32(0x302)])!
>> concat2.length
= 1
- concat3 := concat2 ++ Text.from_codepoints([Int32(0x303)])
+ concat3 := concat2 ++ Text.from_utf32([Int32(0x303)])!
>> concat3.length
= 1
- final := Text.from_codepoints([Int32(0x65), Int32(0x300), Int32(0x302), Int32(0x303)])
+ final := Text.from_utf32([Int32(0x65), Int32(0x300), Int32(0x302), Int32(0x303)])!
>> final.length
= 1
assert concat3 == final
- concat4 := Text.from_codepoints([Int32(0x65), Int32(0x300)]) ++ Text.from_codepoints([Int32(0x302), Int32(0x303)])
+ concat4 := Text.from_utf32([Int32(0x65), Int32(0x300)])! ++ Text.from_utf32([Int32(0x302), Int32(0x303)])!
>> concat4.length
= 1
assert concat4 == final
@@ -312,3 +312,13 @@ func main()
= ""
>> " ".trim(" ,", left=no)
= ""
+
+ do
+ test := "𤭢"
+ assert test.utf32() == [150370]
+ assert test.utf16() == [-10158, -8350]
+ assert test.utf8() == [0xf0, 0xa4, 0xad, 0xa2]
+
+ assert Text.from_utf32([150370]) == test
+ assert Text.from_utf16([-10158, -8350]) == test
+ assert Text.from_utf8([0xf0, 0xa4, 0xad, 0xa2]) == test