Upload folder using huggingface_hub
Browse files- .gitattributes +5 -0
- README.md +410 -0
- added_tokens.json +28 -0
- chat_template.jinja +13 -0
- config.json +68 -0
- generation_config.json +12 -0
- inference_example.py +26 -0
- merges.txt +0 -0
- model-00001-of-00002.safetensors +3 -0
- model-00002-of-00002.safetensors +3 -0
- model.safetensors.index.json +406 -0
- propella.py +916 -0
- property_descriptions.md +1182 -0
- res/bf16_vs_fp8.png +3 -0
- res/eu_cofunding.png +3 -0
- res/overall_scores_by_model.png +3 -0
- res/per_property_scores_by_model.png +3 -0
- res/propella_logo.svg +40 -0
- special_tokens_map.json +31 -0
- tokenizer.json +3 -0
- tokenizer_config.json +239 -0
- vocab.json +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
res/bf16_vs_fp8.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
res/eu_cofunding.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
res/overall_scores_by_model.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
res/per_property_scores_by_model.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,410 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
language:
|
| 4 |
+
- eng
|
| 5 |
+
- spa
|
| 6 |
+
- ita
|
| 7 |
+
- fra
|
| 8 |
+
- deu
|
| 9 |
+
- pol
|
| 10 |
+
- ukr
|
| 11 |
+
- nld
|
| 12 |
+
- tha
|
| 13 |
+
- jpn
|
| 14 |
+
- heb
|
| 15 |
+
- ell
|
| 16 |
+
- kor
|
| 17 |
+
- isl
|
| 18 |
+
- dan
|
| 19 |
+
- cat
|
| 20 |
+
- slk
|
| 21 |
+
- rus
|
| 22 |
+
- kat
|
| 23 |
+
- por
|
| 24 |
+
- ben
|
| 25 |
+
- fas
|
| 26 |
+
- ekk
|
| 27 |
+
- fin
|
| 28 |
+
- tur
|
| 29 |
+
- swe
|
| 30 |
+
- ind
|
| 31 |
+
- ces
|
| 32 |
+
- lit
|
| 33 |
+
- slv
|
| 34 |
+
- vie
|
| 35 |
+
- eus
|
| 36 |
+
- bul
|
| 37 |
+
- mlt
|
| 38 |
+
- lvs
|
| 39 |
+
- nob
|
| 40 |
+
- hun
|
| 41 |
+
- urd
|
| 42 |
+
- ron
|
| 43 |
+
- glg
|
| 44 |
+
- gle
|
| 45 |
+
- nno
|
| 46 |
+
- ltg
|
| 47 |
+
- yue
|
| 48 |
+
- cmn
|
| 49 |
+
- hrv
|
| 50 |
+
- arb
|
| 51 |
+
- bos
|
| 52 |
+
- mkd
|
| 53 |
+
- srp
|
| 54 |
+
- hin
|
| 55 |
+
- als
|
| 56 |
+
- sqi
|
| 57 |
+
- est
|
| 58 |
+
- nor
|
| 59 |
+
- lav
|
| 60 |
+
- swa
|
| 61 |
+
---
|
| 62 |
+
|
| 63 |
+
<p align="center">
|
| 64 |
+
<img src="res/propella_logo.svg" alt="propella logo" width="150">
|
| 65 |
+
</p>
|
| 66 |
+
|
| 67 |
+
<h1 align="center">propella-1</h1>
|
| 68 |
+
<p align="center"><em>propel your data curation to the next level. </em></p>
|
| 69 |
+
|
| 70 |
+
propella-1 is a family of small multilingual LLMs that annotate text documents across six categories: core content, classification, quality & value, audience & purpose, safety & compliance, and geographic relevance. The annotations can be used to filter, select, and curate LLM training data at scale.
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
---
|
| 74 |
+
|
| 75 |
+
*Disclaimer: This is a research project and not an official ellamind product.*
|
| 76 |
+
|
| 77 |
+
*Curate with propella. Evaluate with [elluminate](https://elluminate.de).*
|
| 78 |
+
|
| 79 |
+
---
|
| 80 |
+
|
| 81 |
+
## Highlights
|
| 82 |
+
|
| 83 |
+
- **Annotate 18 properties**: Covers well-established dimensions like content quality and educational value, plus underexplored ones like reasoning indicators and time-sensitivity.
|
| 84 |
+
- **Fast & accurate**: Small models (0.6B, 1.7B, 4B) that punch above their weight. Trained in fp8, ready for high-throughput inference.
|
| 85 |
+
- **Any text, any format**: Handles web pages, PDFs, code, math, post-training data and more.
|
| 86 |
+
- **Highly multilingual**: Supports 57 languages.
|
| 87 |
+
|
| 88 |
+
<p align="center">
|
| 89 |
+
<img src="res/overall_scores_by_model.png" alt="overall-performance-plot">
|
| 90 |
+
</p>
|
| 91 |
+
|
| 92 |
+
## The propella-1 family of models
|
| 93 |
+
| Model | Parameters | Performance | Docs/s (A100/H100) |
|
| 94 |
+
|-------|:----------:|:-------------:|:------------------:|
|
| 95 |
+
| [propella-1-4b](https://huggingface.co/ellamind/propella-1-4b) | 4B | 0.779 | 10.3 / 27.0 |
|
| 96 |
+
| [propella-1-1.7b](https://huggingface.co/ellamind/propella-1-1.7b) | 1.7B | 0.737 | 17.8 / 39.1 |
|
| 97 |
+
| [propella-1-0.6b](https://huggingface.co/ellamind/propella-1-0.6b) | 0.6B | 0.729 | 21.5 / 39.9 |
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
## Properties
|
| 101 |
+
propella-1 models evaluate documents across 18 properties organized into six categories:
|
| 102 |
+
| Category | Property | Short Description |
|
| 103 |
+
|----------|----------|-------------|
|
| 104 |
+
| **Core Content** | Content Integrity | Completeness and technical quality of the content |
|
| 105 |
+
| | Content Ratio | Proportion of content vs. navigation/UI elements |
|
| 106 |
+
| | Content Length | Amount of substantive content |
|
| 107 |
+
| **Classification** | One-Sentence Description | Ultra-short neutral description of the document |
|
| 108 |
+
| | Content Type | Functional structure and purpose |
|
| 109 |
+
| | Business Sector | Industry domain relevance |
|
| 110 |
+
| | Technical Content | Type and intensity of specialized knowledge |
|
| 111 |
+
| **Quality & Value** | Content Quality | Overall writing and presentation quality |
|
| 112 |
+
| | Information Density | Ratio of valuable information to redundancy |
|
| 113 |
+
| | Educational Value | Potential for teaching and learning |
|
| 114 |
+
| | Reasoning Indicators | Presence of logical reasoning and analysis |
|
| 115 |
+
| **Audience & Purpose** | Audience Level | Target sophistication level |
|
| 116 |
+
| | Commercial Bias | Commercial influence on objectivity |
|
| 117 |
+
| | Time-Sensitivity | How content value changes over time |
|
| 118 |
+
| **Safety & Compliance** | Content Safety | Presence of inappropriate or harmful content |
|
| 119 |
+
| | PII Presence | Contains personally identifiable information |
|
| 120 |
+
| **Geographic** | Regional Relevance | Primary regional/cultural context |
|
| 121 |
+
| | Country Relevance | Specific country relevance |
|
| 122 |
+
|
| 123 |
+
Read the [property reference](property_descriptions.md) for detailed definitions and enum values.
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
## Datasets annotated with propella-1
|
| 127 |
+
TBA. Stay tuned.
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
## Input
|
| 131 |
+
A text document in any of the [57 supported languages](#language-support).
|
| 132 |
+
|
| 133 |
+
## Output
|
| 134 |
+
|
| 135 |
+
A JSON object containing annotations. The output strictly conforms to a predefined schema with enumerated values for categorical properties.
|
| 136 |
+
|
| 137 |
+
<details>
|
| 138 |
+
<summary><strong>Example output</strong></summary>
|
| 139 |
+
|
| 140 |
+
```json
|
| 141 |
+
{
|
| 142 |
+
"content_integrity": "complete",
|
| 143 |
+
"content_ratio": "mostly_content",
|
| 144 |
+
"content_length": "moderate",
|
| 145 |
+
"one_sentence_description": "Technical documentation explaining how to define and evaluate structured LLM output schemas using elluminate's Python client.",
|
| 146 |
+
"content_type": [
|
| 147 |
+
"technical_documentation",
|
| 148 |
+
"instructional",
|
| 149 |
+
"source_code"
|
| 150 |
+
],
|
| 151 |
+
"business_sector": [
|
| 152 |
+
"technology_software"
|
| 153 |
+
],
|
| 154 |
+
"technical_content": [
|
| 155 |
+
"code_heavy"
|
| 156 |
+
],
|
| 157 |
+
"information_density": "dense",
|
| 158 |
+
"content_quality": "excellent",
|
| 159 |
+
"audience_level": "advanced",
|
| 160 |
+
"commercial_bias": "minimal",
|
| 161 |
+
"time_sensitivity": "slowly_changing",
|
| 162 |
+
"content_safety": "safe",
|
| 163 |
+
"educational_value": "high",
|
| 164 |
+
"reasoning_indicators": "explanatory",
|
| 165 |
+
"pii_presence": "no_pii",
|
| 166 |
+
"regional_relevance": [
|
| 167 |
+
"global"
|
| 168 |
+
],
|
| 169 |
+
"country_relevance": [
|
| 170 |
+
"none"
|
| 171 |
+
]
|
| 172 |
+
}
|
| 173 |
+
```
|
| 174 |
+
</details>
|
| 175 |
+
|
| 176 |
+
## Usage
|
| 177 |
+
See `propella.py` for prompts and schemas. We recommend enforcing a strict JSON schema without any whitespace for error-free generation.
|
| 178 |
+
|
| 179 |
+
### Serving
|
| 180 |
+
We recommend serving propella models with [SGLang](https://github.com/sgl-project/sglang) and the [llguidance](https://github.com/guidance-ai/llguidance) structured output backend:
|
| 181 |
+
```bash
|
| 182 |
+
python -m sglang.launch_server \
|
| 183 |
+
--model-path outputs/propella-1-4b \
|
| 184 |
+
--host 0.0.0.0 \
|
| 185 |
+
--port 8000 \
|
| 186 |
+
--context-length 65536 \
|
| 187 |
+
--max-running-requests 256 \
|
| 188 |
+
--chunked-prefill-size 8192 \
|
| 189 |
+
--enable-mixed-chunk \
|
| 190 |
+
--num-continuous-decode-steps 8 \
|
| 191 |
+
--grammar-backend llguidance \
|
| 192 |
+
--mem-fraction-static 0.7
|
| 193 |
+
```
|
| 194 |
+
|
| 195 |
+
<details>
|
| 196 |
+
<summary>fp8 on H100</summary>
|
| 197 |
+
|
| 198 |
+
```bash
|
| 199 |
+
python -m sglang.launch_server \
|
| 200 |
+
--model-path outputs/propella-1-4b \
|
| 201 |
+
--quantization w8a8_fp8 \
|
| 202 |
+
--kv-cache-dtype fp8_e4m3 \
|
| 203 |
+
--host 0.0.0.0 \
|
| 204 |
+
--port 8000 \
|
| 205 |
+
--context-length 65536 \
|
| 206 |
+
--max-running-requests 256 \
|
| 207 |
+
--chunked-prefill-size 8192 \
|
| 208 |
+
--enable-mixed-chunk \
|
| 209 |
+
--num-continuous-decode-steps 8 \
|
| 210 |
+
--grammar-backend llguidance \
|
| 211 |
+
--mem-fraction-static 0.7
|
| 212 |
+
```
|
| 213 |
+
|
| 214 |
+
</details>
|
| 215 |
+
|
| 216 |
+
For single-node multi-GPU we recommend increasing `data-parallel-size`.
|
| 217 |
+
For large scale offline inference on SLURM clusters we use [inference-hive](https://github.com/ellamind/inference-hive).
|
| 218 |
+
|
| 219 |
+
### Sending requests via the OpenAI SDK
|
| 220 |
+
```python
|
| 221 |
+
from openai import OpenAI
|
| 222 |
+
from propella import (
|
| 223 |
+
create_messages,
|
| 224 |
+
AnnotationResponse,
|
| 225 |
+
get_annotation_response_schema,
|
| 226 |
+
)
|
| 227 |
+
|
| 228 |
+
document = "Hi, its me Max."
|
| 229 |
+
|
| 230 |
+
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
|
| 231 |
+
|
| 232 |
+
response = client.chat.completions.create(
|
| 233 |
+
model="ellamind/propella-1-4b",
|
| 234 |
+
messages=create_messages(document),
|
| 235 |
+
response_format={
|
| 236 |
+
"type": "json_schema",
|
| 237 |
+
"json_schema": {
|
| 238 |
+
"name": "AnnotationResponse",
|
| 239 |
+
"schema": get_annotation_response_schema(flatten=True, compact_whitespace=True),
|
| 240 |
+
"strict": True,
|
| 241 |
+
}
|
| 242 |
+
},
|
| 243 |
+
)
|
| 244 |
+
response_content = response.choices[0].message.content
|
| 245 |
+
result = AnnotationResponse.model_validate_json(response_content)
|
| 246 |
+
print(result.model_dump_json(indent=4))
|
| 247 |
+
```
|
| 248 |
+
<details>
|
| 249 |
+
<summary>Result</summary>
|
| 250 |
+
|
| 251 |
+
```json
|
| 252 |
+
{
|
| 253 |
+
"content_integrity": "complete",
|
| 254 |
+
"content_ratio": "complete_content",
|
| 255 |
+
"content_length": "minimal",
|
| 256 |
+
"one_sentence_description": "A short personal greeting introducing someone named Max.",
|
| 257 |
+
"content_type": [
|
| 258 |
+
"conversational"
|
| 259 |
+
],
|
| 260 |
+
"business_sector": [
|
| 261 |
+
"general_interest"
|
| 262 |
+
],
|
| 263 |
+
"technical_content": [
|
| 264 |
+
"non_technical"
|
| 265 |
+
],
|
| 266 |
+
"information_density": "dense",
|
| 267 |
+
"content_quality": "good",
|
| 268 |
+
"audience_level": "general",
|
| 269 |
+
"commercial_bias": "none",
|
| 270 |
+
"time_sensitivity": "evergreen",
|
| 271 |
+
"content_safety": "safe",
|
| 272 |
+
"educational_value": "none",
|
| 273 |
+
"reasoning_indicators": "none",
|
| 274 |
+
"pii_presence": "contains_pii",
|
| 275 |
+
"regional_relevance": [
|
| 276 |
+
"culturally_neutral"
|
| 277 |
+
],
|
| 278 |
+
"country_relevance": [
|
| 279 |
+
"none"
|
| 280 |
+
]
|
| 281 |
+
}
|
| 282 |
+
```
|
| 283 |
+
</details>
|
| 284 |
+
|
| 285 |
+
## Throughput
|
| 286 |
+
|
| 287 |
+
The throughput results below provide a rough estimate for GPU-hours required to annotate 1M documents. After a short warmup, we run inference for 5k documents, sending 1k concurrent requests to the SGLang server.
|
| 288 |
+
|
| 289 |
+
| Model | GPU | Docs/s | hours-per-1M docs | Prompt TPS | Output TPS | Total TPS |
|
| 290 |
+
|-------|-----|--------|-------------------|------------|------------|-----------|
|
| 291 |
+
| propella-1-4b | A100 80GB | 10.3 | 27.0 | 19.1k | 1.5k | 20.5k |
|
| 292 |
+
| propella-1-4b | H100 96GB | 22.4 | 12.4 | 41.6k | 3.2k | 44.8k |
|
| 293 |
+
| propella-1-4b (fp8) | H100 96GB | 27.0 | 10.3 | 50.1k | 3.9k | 54.0k |
|
| 294 |
+
| propella-1-1.7b | A100 80GB | 17.8 | 15.6 | 33.0k | 2.6k | 35.6k |
|
| 295 |
+
| propella-1-1.7b | H100 96GB | 35.8 | 7.8 | 66.5k | 5.2k | 71.8k |
|
| 296 |
+
| propella-1-1.7b (fp8) | H100 96GB | 39.1 | 7.1 | 72.7k | 5.7k | 78.4k |
|
| 297 |
+
| propella-1-0.6b | H100 96GB | 39.9 | 7.0 | 74.2k | 5.7k | 79.9k |
|
| 298 |
+
| propella-1-0.6b | A100 80GB | 21.5 | 12.9 | 40.0k | 3.1k | 43.1k |
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
## Evaluation
|
| 302 |
+
|
| 303 |
+
We evaluate the propella-1 models on a test set containing 3k documents. For these documents we obtain annotations from Gemini-3-Pro (reasoning_effort: high), which we treat as ground-truth labels under the assumption that they represent the upper limit in terms of annotation quality.
|
| 304 |
+
|
| 305 |
+
All baseline models use the detailed annotator system- and user-prompts as defined in `propella.py`. For throughput reasons, the propella-1 models use a very short, propella-1 specific prompt. We also tested some baseline models with the propella-1 prompt, always leading to worse performance as the prompt lacks details.
|
| 306 |
+
|
| 307 |
+
### Metrics by Property Type
|
| 308 |
+
|
| 309 |
+
Properties are grouped into four types. Three are evaluated with an appropriate metric; the free-text property is not scored:
|
| 310 |
+
|
| 311 |
+
- **Ordinal Properties** (11 properties): **QWK** (Quadratic Weighted Kappa), which measures agreement while accounting for the ordinal nature of labels. It penalizes larger disagreements more heavily.
|
| 312 |
+
- **Binary Properties** (1 property): **F1**, the harmonic mean of precision and recall.
|
| 313 |
+
- **Multi-select Properties** (5 properties): **IoU** (Jaccard Index), intersection-over-union averaged across samples.
|
| 314 |
+
- **Free-text Properties** (1 property): The `one_sentence_description` property is excluded from quantitative evaluation.
|
| 315 |
+
|
| 316 |
+
<p align="center">
|
| 317 |
+
<a href="res/per_property_scores_by_model.png">
|
| 318 |
+
<img src="res/per_property_scores_by_model.png" alt="per-property-performance-plot">
|
| 319 |
+
</a>
|
| 320 |
+
</p>
|
| 321 |
+
|
| 322 |
+
### Overall Score
|
| 323 |
+
|
| 324 |
+
The overall score is a weighted average of the primary metric for each property type:
|
| 325 |
+
|
| 326 |
+
```
|
| 327 |
+
overall = (11/17 × avg_QWK) + (1/17 × avg_F1) + (5/17 × avg_IoU)
|
| 328 |
+
```
|
| 329 |
+
|
| 330 |
+
<p align="center">
|
| 331 |
+
<a href="res/overall_scores_by_model.png">
|
| 332 |
+
<img src="res/overall_scores_by_model.png" alt="overall-performance-plot">
|
| 333 |
+
</a>
|
| 334 |
+
</p>
|
| 335 |
+
|
| 336 |
+
## Language Support
|
| 337 |
+
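The training data for propella-1 contains documents in 57 languages (Mandarin and Serbian each appear in two scripts). The `code`, `math`, and `sft` rows are additional non-language data categories: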
|
| 338 |
+
| lang_script | percent |
|
| 339 |
+
|-------------|---------|
|
| 340 |
+
| eng_Latn | 35.08 |
|
| 341 |
+
| spa_Latn | 3.98 |
|
| 342 |
+
| ita_Latn | 3.97 |
|
| 343 |
+
| fra_Latn | 3.95 |
|
| 344 |
+
| deu_Latn | 3.86 |
|
| 345 |
+
| pol_Latn | 3.81 |
|
| 346 |
+
| code | 2.82 |
|
| 347 |
+
| math | 2.77 |
|
| 348 |
+
| sft | 2.41 |
|
| 349 |
+
| ukr_Cyrl | 0.95 |
|
| 350 |
+
| nld_Latn | 0.95 |
|
| 351 |
+
| tha_Thai | 0.95 |
|
| 352 |
+
| jpn_Jpan | 0.94 |
|
| 353 |
+
| heb_Hebr | 0.94 |
|
| 354 |
+
| ell_Grek | 0.93 |
|
| 355 |
+
| kor_Hang | 0.93 |
|
| 356 |
+
| isl_Latn | 0.93 |
|
| 357 |
+
| dan_Latn | 0.92 |
|
| 358 |
+
| cat_Latn | 0.92 |
|
| 359 |
+
| slk_Latn | 0.92 |
|
| 360 |
+
| rus_Cyrl | 0.91 |
|
| 361 |
+
| kat_Geor | 0.9 |
|
| 362 |
+
| por_Latn | 0.9 |
|
| 363 |
+
| ben_Beng | 0.9 |
|
| 364 |
+
| fas_Arab | 0.89 |
|
| 365 |
+
| ekk_Latn | 0.89 |
|
| 366 |
+
| fin_Latn | 0.89 |
|
| 367 |
+
| tur_Latn | 0.89 |
|
| 368 |
+
| swe_Latn | 0.88 |
|
| 369 |
+
| ind_Latn | 0.88 |
|
| 370 |
+
| ces_Latn | 0.88 |
|
| 371 |
+
| lit_Latn | 0.88 |
|
| 372 |
+
| slv_Latn | 0.87 |
|
| 373 |
+
| vie_Latn | 0.87 |
|
| 374 |
+
| eus_Latn | 0.87 |
|
| 375 |
+
| bul_Cyrl | 0.86 |
|
| 376 |
+
| mlt_Latn | 0.86 |
|
| 377 |
+
| lvs_Latn | 0.86 |
|
| 378 |
+
| nob_Latn | 0.86 |
|
| 379 |
+
| hun_Latn | 0.85 |
|
| 380 |
+
| urd_Arab | 0.85 |
|
| 381 |
+
| ron_Latn | 0.84 |
|
| 382 |
+
| glg_Latn | 0.83 |
|
| 383 |
+
| gle_Latn | 0.83 |
|
| 384 |
+
| nno_Latn | 0.83 |
|
| 385 |
+
| ltg_Latn | 0.77 |
|
| 386 |
+
| yue_Hant | 0.49 |
|
| 387 |
+
| cmn_Hant | 0.48 |
|
| 388 |
+
| hrv_Latn | 0.43 |
|
| 389 |
+
| arb_Arab | 0.39 |
|
| 390 |
+
| bos_Latn | 0.39 |
|
| 391 |
+
| mkd_Cyrl | 0.39 |
|
| 392 |
+
| srp_Latn | 0.37 |
|
| 393 |
+
| cmn_Hani | 0.37 |
|
| 394 |
+
| hin_Deva | 0.36 |
|
| 395 |
+
| srp_Cyrl | 0.36 |
|
| 396 |
+
| als_Latn | 0.35 |
|
| 397 |
+
| sqi_Latn | 0.03 |
|
| 398 |
+
| est_Latn | 0.02 |
|
| 399 |
+
| nor_Latn | 0.02 |
|
| 400 |
+
| lav_Latn | 0.02 |
|
| 401 |
+
| swa_Latn | 0.02 |
|
| 402 |
+
|
| 403 |
+
## Acknowledgements
|
| 404 |
+
* This project is supported by the OpenEuroLLM project, co-funded by the Digital Europe Programme under GA no. 101195233. For more information see [openeurollm.eu](https://openeurollm.eu).
|
| 405 |
+
* This project is supported by the LLMs4EU project, co-funded by the Digital Europe Programme under GA no. 101198470. For more information see [LLMs4EU website](https://www.alt-edic.eu/projects/llms4eu/).
|
| 406 |
+
* This project is supported by the German Federal Ministry for Economic Affairs and Energy (BMWE) under the soofi (Sovereign Open Source Foundation Models for European Intelligence) project.
|
| 407 |
+
* We acknowledge the EuroHPC Joint Undertaking for supporting this project through access to the EuroHPC supercomputer LEONARDO, hosted by CINECA (Italy) and the LEONARDO consortium, through a EuroHPC AI Factory Large Scale Access call.
|
| 408 |
+
* We thank the AI Service Center for Sensitive and Critical Infrastructures (KISSKI), hosted by GWDG, for additional compute access.
|
| 409 |
+
|
| 410 |
+
<img src="res/eu_cofunding.png" alt="eu-cofunding-logo" width="300" style="vertical-align: middle;">
|
added_tokens.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</think>": 151668,
|
| 3 |
+
"</tool_call>": 151658,
|
| 4 |
+
"</tool_response>": 151666,
|
| 5 |
+
"<think>": 151667,
|
| 6 |
+
"<tool_call>": 151657,
|
| 7 |
+
"<tool_response>": 151665,
|
| 8 |
+
"<|box_end|>": 151649,
|
| 9 |
+
"<|box_start|>": 151648,
|
| 10 |
+
"<|endoftext|>": 151643,
|
| 11 |
+
"<|file_sep|>": 151664,
|
| 12 |
+
"<|fim_middle|>": 151660,
|
| 13 |
+
"<|fim_pad|>": 151662,
|
| 14 |
+
"<|fim_prefix|>": 151659,
|
| 15 |
+
"<|fim_suffix|>": 151661,
|
| 16 |
+
"<|im_end|>": 151645,
|
| 17 |
+
"<|im_start|>": 151644,
|
| 18 |
+
"<|image_pad|>": 151655,
|
| 19 |
+
"<|object_ref_end|>": 151647,
|
| 20 |
+
"<|object_ref_start|>": 151646,
|
| 21 |
+
"<|quad_end|>": 151651,
|
| 22 |
+
"<|quad_start|>": 151650,
|
| 23 |
+
"<|repo_name|>": 151663,
|
| 24 |
+
"<|video_pad|>": 151656,
|
| 25 |
+
"<|vision_end|>": 151653,
|
| 26 |
+
"<|vision_pad|>": 151654,
|
| 27 |
+
"<|vision_start|>": 151652
|
| 28 |
+
}
|
chat_template.jinja
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if messages[0].role == 'system' %}
|
| 2 |
+
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
| 3 |
+
{%- endif %}
|
| 4 |
+
{%- for message in messages %}
|
| 5 |
+
{%- if message.role == "user" or (message.role == "system" and not loop.first) %}
|
| 6 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' }}
|
| 7 |
+
{%- elif message.role == "assistant" %}
|
| 8 |
+
{{- '<|im_start|>assistant\n' + message.content + '<|im_end|>\n' }}
|
| 9 |
+
{%- endif %}
|
| 10 |
+
{%- endfor %}
|
| 11 |
+
{%- if add_generation_prompt %}
|
| 12 |
+
{{- '<|im_start|>assistant\n' }}
|
| 13 |
+
{%- endif %}
|
config.json
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen3ForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"dtype": "bfloat16",
|
| 8 |
+
"eos_token_id": 151645,
|
| 9 |
+
"head_dim": 128,
|
| 10 |
+
"hidden_act": "silu",
|
| 11 |
+
"hidden_size": 2560,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 9728,
|
| 14 |
+
"layer_types": [
|
| 15 |
+
"full_attention",
|
| 16 |
+
"full_attention",
|
| 17 |
+
"full_attention",
|
| 18 |
+
"full_attention",
|
| 19 |
+
"full_attention",
|
| 20 |
+
"full_attention",
|
| 21 |
+
"full_attention",
|
| 22 |
+
"full_attention",
|
| 23 |
+
"full_attention",
|
| 24 |
+
"full_attention",
|
| 25 |
+
"full_attention",
|
| 26 |
+
"full_attention",
|
| 27 |
+
"full_attention",
|
| 28 |
+
"full_attention",
|
| 29 |
+
"full_attention",
|
| 30 |
+
"full_attention",
|
| 31 |
+
"full_attention",
|
| 32 |
+
"full_attention",
|
| 33 |
+
"full_attention",
|
| 34 |
+
"full_attention",
|
| 35 |
+
"full_attention",
|
| 36 |
+
"full_attention",
|
| 37 |
+
"full_attention",
|
| 38 |
+
"full_attention",
|
| 39 |
+
"full_attention",
|
| 40 |
+
"full_attention",
|
| 41 |
+
"full_attention",
|
| 42 |
+
"full_attention",
|
| 43 |
+
"full_attention",
|
| 44 |
+
"full_attention",
|
| 45 |
+
"full_attention",
|
| 46 |
+
"full_attention",
|
| 47 |
+
"full_attention",
|
| 48 |
+
"full_attention",
|
| 49 |
+
"full_attention",
|
| 50 |
+
"full_attention"
|
| 51 |
+
],
|
| 52 |
+
"max_position_embeddings": 262144,
|
| 53 |
+
"max_window_layers": 36,
|
| 54 |
+
"model_type": "qwen3",
|
| 55 |
+
"num_attention_heads": 32,
|
| 56 |
+
"num_hidden_layers": 36,
|
| 57 |
+
"num_key_value_heads": 8,
|
| 58 |
+
"pad_token_id": 151643,
|
| 59 |
+
"rms_norm_eps": 1e-06,
|
| 60 |
+
"rope_scaling": null,
|
| 61 |
+
"rope_theta": 5000000,
|
| 62 |
+
"sliding_window": null,
|
| 63 |
+
"tie_word_embeddings": true,
|
| 64 |
+
"transformers_version": "4.57.1",
|
| 65 |
+
"use_cache": false,
|
| 66 |
+
"use_sliding_window": false,
|
| 67 |
+
"vocab_size": 151936
|
| 68 |
+
}
|
generation_config.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"do_sample": true,
|
| 3 |
+
"eos_token_id": [
|
| 4 |
+
151645,
|
| 5 |
+
151643
|
| 6 |
+
],
|
| 7 |
+
"pad_token_id": 151643,
|
| 8 |
+
"temperature": 0.7,
|
| 9 |
+
"top_k": 20,
|
| 10 |
+
"top_p": 0.8,
|
| 11 |
+
"transformers_version": "4.57.1"
|
| 12 |
+
}
|
inference_example.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from openai import OpenAI
|
| 2 |
+
from propella import (
|
| 3 |
+
create_messages,
|
| 4 |
+
AnnotationResponse,
|
| 5 |
+
get_annotation_response_schema,
|
| 6 |
+
)
|
| 7 |
+
|
| 8 |
+
document = "Hi, its me Max."
|
| 9 |
+
|
| 10 |
+
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
|
| 11 |
+
|
| 12 |
+
response = client.chat.completions.create(
|
| 13 |
+
model="ellamind/propella-1-4b",
|
| 14 |
+
messages=create_messages(document),
|
| 15 |
+
response_format={
|
| 16 |
+
"type": "json_schema",
|
| 17 |
+
"json_schema": {
|
| 18 |
+
"name": "AnnotationResponse",
|
| 19 |
+
"schema": get_annotation_response_schema(flatten=True, compact_whitespace=True),
|
| 20 |
+
"strict": True,
|
| 21 |
+
}
|
| 22 |
+
},
|
| 23 |
+
)
|
| 24 |
+
response_content = response.choices[0].message.content
|
| 25 |
+
result = AnnotationResponse.model_validate_json(response_content)
|
| 26 |
+
print(result.model_dump_json(indent=4))
|
merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
model-00001-of-00002.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:593a0cf241a56e7f0e830365bcc9e72f128b3a2cc56a8994773467cc573ff02b
|
| 3 |
+
size 4967215360
|
model-00002-of-00002.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:787b95bc9da1801e0078afa670576b9b9701c420574b2c65af16101a28a94eb9
|
| 3 |
+
size 3855679144
|
model.safetensors.index.json
ADDED
|
@@ -0,0 +1,406 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"total_size": 8822848512
|
| 4 |
+
},
|
| 5 |
+
"weight_map": {
|
| 6 |
+
"lm_head.weight": "model-00002-of-00002.safetensors",
|
| 7 |
+
"model.embed_tokens.weight": "model-00001-of-00002.safetensors",
|
| 8 |
+
"model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 9 |
+
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 10 |
+
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 11 |
+
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 12 |
+
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 13 |
+
"model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 14 |
+
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 15 |
+
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 16 |
+
"model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 17 |
+
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 18 |
+
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 19 |
+
"model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 20 |
+
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 21 |
+
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 22 |
+
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 23 |
+
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 24 |
+
"model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 25 |
+
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 26 |
+
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 27 |
+
"model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 28 |
+
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 29 |
+
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 30 |
+
"model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 31 |
+
"model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 32 |
+
"model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 33 |
+
"model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 34 |
+
"model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 35 |
+
"model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 36 |
+
"model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 37 |
+
"model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 38 |
+
"model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 39 |
+
"model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 40 |
+
"model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 41 |
+
"model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 42 |
+
"model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 43 |
+
"model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 44 |
+
"model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 45 |
+
"model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 46 |
+
"model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 47 |
+
"model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 48 |
+
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 49 |
+
"model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 50 |
+
"model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 51 |
+
"model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 52 |
+
"model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 53 |
+
"model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 54 |
+
"model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 55 |
+
"model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 56 |
+
"model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 57 |
+
"model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 58 |
+
"model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 59 |
+
"model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 60 |
+
"model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 61 |
+
"model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 62 |
+
"model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 63 |
+
"model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 64 |
+
"model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 65 |
+
"model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 66 |
+
"model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 67 |
+
"model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 68 |
+
"model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 69 |
+
"model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 70 |
+
"model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 71 |
+
"model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 72 |
+
"model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 73 |
+
"model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 74 |
+
"model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 75 |
+
"model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 76 |
+
"model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 77 |
+
"model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 78 |
+
"model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 79 |
+
"model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 80 |
+
"model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 81 |
+
"model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 82 |
+
"model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 83 |
+
"model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 84 |
+
"model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 85 |
+
"model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 86 |
+
"model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 87 |
+
"model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 88 |
+
"model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 89 |
+
"model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 90 |
+
"model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 91 |
+
"model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 92 |
+
"model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 93 |
+
"model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 94 |
+
"model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 95 |
+
"model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 96 |
+
"model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 97 |
+
"model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 98 |
+
"model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 99 |
+
"model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 100 |
+
"model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 101 |
+
"model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 102 |
+
"model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 103 |
+
"model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 104 |
+
"model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 105 |
+
"model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 106 |
+
"model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 107 |
+
"model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 108 |
+
"model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 109 |
+
"model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 110 |
+
"model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 111 |
+
"model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 112 |
+
"model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 113 |
+
"model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 114 |
+
"model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 115 |
+
"model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 116 |
+
"model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 117 |
+
"model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 118 |
+
"model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 119 |
+
"model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 120 |
+
"model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 121 |
+
"model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 122 |
+
"model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 123 |
+
"model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 124 |
+
"model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 125 |
+
"model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 126 |
+
"model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 127 |
+
"model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 128 |
+
"model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 129 |
+
"model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 130 |
+
"model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 131 |
+
"model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 132 |
+
"model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 133 |
+
"model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 134 |
+
"model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 135 |
+
"model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 136 |
+
"model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 137 |
+
"model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 138 |
+
"model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 139 |
+
"model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 140 |
+
"model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 141 |
+
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 142 |
+
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 143 |
+
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 144 |
+
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 145 |
+
"model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 146 |
+
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 147 |
+
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 148 |
+
"model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 149 |
+
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 150 |
+
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 151 |
+
"model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 152 |
+
"model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 153 |
+
"model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 154 |
+
"model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 155 |
+
"model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 156 |
+
"model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 157 |
+
"model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 158 |
+
"model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 159 |
+
"model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 160 |
+
"model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 161 |
+
"model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 162 |
+
"model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 163 |
+
"model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 164 |
+
"model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 165 |
+
"model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 166 |
+
"model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 167 |
+
"model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 168 |
+
"model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 169 |
+
"model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 170 |
+
"model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 171 |
+
"model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 172 |
+
"model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 173 |
+
"model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 174 |
+
"model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 175 |
+
"model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 176 |
+
"model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 177 |
+
"model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 178 |
+
"model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 179 |
+
"model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 180 |
+
"model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 181 |
+
"model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 182 |
+
"model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 183 |
+
"model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 184 |
+
"model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 185 |
+
"model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 186 |
+
"model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 187 |
+
"model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 188 |
+
"model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 189 |
+
"model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 190 |
+
"model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 191 |
+
"model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 192 |
+
"model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 193 |
+
"model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 194 |
+
"model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 195 |
+
"model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 196 |
+
"model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 197 |
+
"model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 198 |
+
"model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 199 |
+
"model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 200 |
+
"model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 201 |
+
"model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 202 |
+
"model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 203 |
+
"model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 204 |
+
"model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 205 |
+
"model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 206 |
+
"model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 207 |
+
"model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 208 |
+
"model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 209 |
+
"model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 210 |
+
"model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 211 |
+
"model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 212 |
+
"model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 213 |
+
"model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 214 |
+
"model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 215 |
+
"model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 216 |
+
"model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 217 |
+
"model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 218 |
+
"model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 219 |
+
"model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 220 |
+
"model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 221 |
+
"model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 222 |
+
"model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 223 |
+
"model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 224 |
+
"model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 225 |
+
"model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 226 |
+
"model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 227 |
+
"model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 228 |
+
"model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 229 |
+
"model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 230 |
+
"model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 231 |
+
"model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 232 |
+
"model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 233 |
+
"model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 234 |
+
"model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 235 |
+
"model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 236 |
+
"model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 237 |
+
"model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 238 |
+
"model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 239 |
+
"model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 240 |
+
"model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 241 |
+
"model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 242 |
+
"model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 243 |
+
"model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 244 |
+
"model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 245 |
+
"model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 246 |
+
"model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 247 |
+
"model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 248 |
+
"model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 249 |
+
"model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 250 |
+
"model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 251 |
+
"model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 252 |
+
"model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 253 |
+
"model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 254 |
+
"model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 255 |
+
"model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 256 |
+
"model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 257 |
+
"model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 258 |
+
"model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 259 |
+
"model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 260 |
+
"model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 261 |
+
"model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 262 |
+
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 263 |
+
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 264 |
+
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 265 |
+
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 266 |
+
"model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 267 |
+
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 268 |
+
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 269 |
+
"model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 270 |
+
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 271 |
+
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 272 |
+
"model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 273 |
+
"model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 274 |
+
"model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 275 |
+
"model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 276 |
+
"model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 277 |
+
"model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 278 |
+
"model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 279 |
+
"model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 280 |
+
"model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 281 |
+
"model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 282 |
+
"model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 283 |
+
"model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 284 |
+
"model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 285 |
+
"model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 286 |
+
"model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 287 |
+
"model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 288 |
+
"model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 289 |
+
"model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 290 |
+
"model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 291 |
+
"model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 292 |
+
"model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 293 |
+
"model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 294 |
+
"model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 295 |
+
"model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 296 |
+
"model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 297 |
+
"model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 298 |
+
"model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 299 |
+
"model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 300 |
+
"model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 301 |
+
"model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 302 |
+
"model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 303 |
+
"model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 304 |
+
"model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 305 |
+
"model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 306 |
+
"model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 307 |
+
"model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 308 |
+
"model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 309 |
+
"model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 310 |
+
"model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 311 |
+
"model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 312 |
+
"model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 313 |
+
"model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 314 |
+
"model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 315 |
+
"model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 316 |
+
"model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 317 |
+
"model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 318 |
+
"model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 319 |
+
"model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 320 |
+
"model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 321 |
+
"model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 322 |
+
"model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 323 |
+
"model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 324 |
+
"model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 325 |
+
"model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 326 |
+
"model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 327 |
+
"model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 328 |
+
"model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 329 |
+
"model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 330 |
+
"model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 331 |
+
"model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 332 |
+
"model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 333 |
+
"model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 334 |
+
"model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 335 |
+
"model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 336 |
+
"model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 337 |
+
"model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 338 |
+
"model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 339 |
+
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 340 |
+
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 341 |
+
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 342 |
+
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 343 |
+
"model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 344 |
+
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 345 |
+
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 346 |
+
"model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 347 |
+
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 348 |
+
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 349 |
+
"model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 350 |
+
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 351 |
+
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 352 |
+
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 353 |
+
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 354 |
+
"model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 355 |
+
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 356 |
+
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 357 |
+
"model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 358 |
+
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 359 |
+
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 360 |
+
"model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 361 |
+
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 362 |
+
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 363 |
+
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 364 |
+
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 365 |
+
"model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 366 |
+
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 367 |
+
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 368 |
+
"model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 369 |
+
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 370 |
+
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 371 |
+
"model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 372 |
+
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 373 |
+
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 374 |
+
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 375 |
+
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 376 |
+
"model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 377 |
+
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 378 |
+
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 379 |
+
"model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 380 |
+
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 381 |
+
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 382 |
+
"model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 383 |
+
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 384 |
+
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 385 |
+
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 386 |
+
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 387 |
+
"model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 388 |
+
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 389 |
+
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 390 |
+
"model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 391 |
+
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 392 |
+
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 393 |
+
"model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 394 |
+
"model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 395 |
+
"model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 396 |
+
"model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 397 |
+
"model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 398 |
+
"model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 399 |
+
"model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 400 |
+
"model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 401 |
+
"model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 402 |
+
"model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 403 |
+
"model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 404 |
+
"model.norm.weight": "model-00002-of-00002.safetensors"
|
| 405 |
+
}
|
| 406 |
+
}
|
propella.py
ADDED
|
@@ -0,0 +1,916 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from copy import deepcopy
|
| 3 |
+
from enum import Enum
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import List, Type, Union
|
| 6 |
+
|
| 7 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 8 |
+
|
| 9 |
+
# Compact system prompt: one line per annotation property, in the form
# "<json key>: <short meaning> (<allowed labels separated by '|'>)".
# Keys suffixed with "[]" are the multi-select (array) properties.
# The labels listed here mirror the values of the Enum classes in this module,
# so any label change must be made in both places.
# NOTE(review): presumably this is the short prompt used at inference time with
# the fine-tuned model — confirm against inference_example.py.
SYSTEM_PROMPT = """Annotate the document. Any language; assess quality within its linguistic norms. Respond with a JSON object:
content_integrity: technical completeness (complete|mostly_complete|fragment|severely_degraded)
content_ratio: content vs navigation/boilerplate ratio (complete_content|mostly_content|mixed_content|mostly_navigation|minimal_content)
content_length: substantive words (substantial 2k+|moderate 500-2k|brief 100-500|minimal <100)
one_sentence_description: neutral ~10 word summary in English
content_type[]: functional purpose (analytical|instructional|reference|procedural|qa_structured|conversational|creative|transactional|boilerplate|news_report|opinion_editorial|review_critique|technical_documentation|specification_standard|legal_document|press_release|structured_data|source_code)
business_sector[]: industry domain (academic_research|education_sector|technology_software|hardware_electronics|healthcare_medical|pharmaceutical_biotech|financial_services|legal_services|government_public|manufacturing_industrial|mining_resources|chemicals_materials|energy_utilities|retail_commerce|wholesale_distribution|real_estate_construction|transportation_logistics|automotive_industry|telecommunications|media_entertainment|advertising_marketing|hospitality_tourism|agriculture_food|environmental_services|aerospace_defense|insurance_industry|nonprofit_ngo|consulting_professional|human_resources|security_cyber|gaming_industry|gambling_betting|travel_aviation|food_beverage_hospitality|consumer_goods|general_interest|other)
technical_content[]: specialized knowledge (code_heavy|math_heavy|scientific|data_heavy|engineering|basic_technical|non_technical)
content_quality: writing/presentation quality (excellent|good|adequate|poor|unacceptable)
information_density: signal vs padding (dense|adequate|moderate|thin|empty)
educational_value: teaching potential (high|moderate|basic|minimal|none)
reasoning_indicators: logical analysis depth (analytical|explanatory|basic_reasoning|minimal|none)
audience_level: assumed background (expert|advanced|general|beginner|youth|children)
commercial_bias: promotional influence (none|minimal|moderate|heavy|pure_marketing)
time_sensitivity: temporal decay (evergreen|slowly_changing|regularly_updating|time_sensitive)
content_safety: harmful content (safe|mild_concerns|nsfw|harmful|illegal)
pii_presence: private individual data (no_pii|contains_pii)
regional_relevance[]: geographic/cultural context (european|north_american|east_asian|south_asian|southeast_asian|middle_eastern|sub_saharan_african|latin_american|oceanian|central_asian|russian_sphere|global|culturally_neutral|indeterminate)
country_relevance[]: specific countries as ISO names, or supranational|none
"""
|
| 29 |
+
|
| 30 |
+
# User-turn template paired with SYSTEM_PROMPT. "{content}" is the only
# placeholder; the document text is wrapped in explicit start/end tags and the
# rendered string ends with a newline after the closing tag.
USER_PROMPT = """<start_of_document>
{content}
<end_of_document>
"""
|
| 34 |
+
|
| 35 |
+
# Long-form system prompt describing the whole annotation framework.
# It is a str.format template with exactly two placeholders:
#   {property_descriptions} - detailed per-property guidance text
#   {json_schema}           - JSON Schema the response must conform to
# No other braces may appear in the text, otherwise str.format() would fail.
# NOTE(review): presumably filled from property_descriptions.md and the
# pydantic model's JSON schema — confirm where this template is rendered.
ANNOTATOR_SYSTEM_PROMPT = """You are an expert content analysis assistant specializing in document annotations for LLM pretraining data. Your team is curating a multilingual dataset for language model training. Your task is to analyze documents and annotate them with specific properties that will later on be used to filter the dataset. The user will provide a document inside of "<start_of_document>" and "<end_of_document>" tags. Analyze the content of the document systematically and objectively. Respond with your annotations in JSON format, following the annotation framework below.

# Annotation Framework
## Output Requirements
- You must respond with a JSON object that matches the specified schema.
- Use the exact enum values provided in the property descriptions.
- Ensure all fields are included.
- For multi-select properties, always return arrays (even if only one value applies). Multi-select fields: content_type, business_sector, technical_content, regional_relevance, country_relevance. All other properties are single-select strings.
- Do not include any explanatory text, comments, or additional formatting.

## Key Principles
* Objective assessment: Base decisions on clear criteria, not subjective preferences.
* Completeness: Address all properties for every document.
* Consistency: Apply the same standards across all documents.
* Multilinguality: The user provided document can be in any language, the language itself should not influence the annotations.

## Properties to Annotate
The annotation framework evaluates documents across 18 key properties organized into six main categories:

**Core Content Properties:**
- Content Integrity: Completeness and technical quality (complete, mostly_complete, fragment, severely_degraded)
- Content Ratio: Proportion of meaningful content vs navigation/UI elements (complete_content, mostly_content, mixed_content, mostly_navigation, minimal_content)
- Content Length: Amount of substantive content (substantial, moderate, brief, minimal)

**Content Classification:**
- One-Sentence Description: Ultra-short neutral description; exactly one sentence; target 8–15 words (soft max 20)
- Content Type: Functional structure and purpose (analytical, instructional, reference, procedural, qa_structured, conversational, creative, transactional, boilerplate, news_report, opinion_editorial, review_critique, technical_documentation, specification_standard, legal_document, press_release, structured_data, source_code)
- Business Sector: Industry domain relevance (see Detailed Property Descriptions for exact enum values)
- Technical Content: Type and intensity of specialized knowledge (code_heavy, math_heavy, scientific, data_heavy, engineering, basic_technical, non_technical)

**Quality and Value Assessment:**
- Content Quality: Overall writing and presentation quality (excellent, good, adequate, poor, unacceptable)
- Information Density: Ratio of valuable information to redundancy (dense, adequate, moderate, thin, empty)
- Educational Value: Potential for teaching and learning (high, moderate, basic, minimal, none)
- Reasoning Indicators: Presence of logical reasoning and analysis (analytical, explanatory, basic_reasoning, minimal, none)

**Audience and Purpose:**
- Audience Level: Target sophistication level (expert, advanced, general, beginner, youth, children)
- Commercial Bias: Commercial influence on objectivity (none, minimal, moderate, heavy, pure_marketing)
- Time-Sensitivity: How content value changes over time (evergreen, slowly_changing, regularly_updating, time_sensitive)

**Safety and Compliance:**
- Content Safety: Presence of inappropriate or harmful content (safe, mild_concerns, nsfw, harmful, illegal)
- PII Presence: Contains personally identifiable information (no_pii, contains_pii)

**Geographic Relevance:**
- Regional Relevance: Primary regional context (european, north_american, east_asian, south_asian, southeast_asian, middle_eastern, sub_saharan_african, latin_american, oceanian, central_asian, russian_sphere, global, culturally_neutral, indeterminate)
- Country Relevance: Specific country relevance (array of country names or special values: "supranational", "none")


{property_descriptions}

## JSON Schema for the Response
Return a single JSON object that strictly conforms to the following JSON Schema:
```json
{json_schema}
```

## Multilingual Annotation Guidelines

### Universal Principles
1. **Evaluate content quality within language context** - Don't penalize non-English content for being non-English
2. **Consider linguistic norms** - Writing styles, sentence lengths, and paragraph structures vary by language
3. **Respect script directionality** - RTL languages (Arabic, Hebrew) may have different navigation patterns
4. **Account for morphological complexity** - Agglutinative/polysynthetic languages pack more information per word

### Language-Specific Considerations

Here are some examples of language-specific considerations:

**Chinese/Japanese:**
- Character count more relevant than word count
- Lack of spaces between words is normal
- Mixed script usage (especially Japanese) is standard

**Arabic/Hebrew/Persian:**
- RTL text direction affects layout assessment
- Diacritical marks may be absent in informal content
- Mixed Arabic/English is common in technical content

**Indian Languages (Hindi, Bengali, Tamil, etc.):**
- Code-mixing with English is extremely common and acceptable
- Technical terms often borrowed from English
- Multiple scripts may appear in same document

**European Languages:**
- Formal/informal distinctions (tu/vous, du/Sie) indicate audience
- Compound words affect word count metrics
- Regional variants (Brazilian vs European Portuguese, Spanish vs Catalan, etc.) are both valid

# Annotation Workflow
- The user will provide a document in "<start_of_document>" and "<end_of_document>" tags. Analyze the content of the document systematically and objectively
- You must respond with a valid JSON object that matches the schema above.
- Use the exact enum values provided in the property descriptions
- Ensure all required fields are included
- For multi-select properties, always return arrays, even if only one value applies (content_type, business_sector, technical_content, regional_relevance, country_relevance). All other properties are single-select strings.
- Do not include any explanatory text, comments, or formatting
"""
|
| 133 |
+
|
| 134 |
+
# User-turn template paired with ANNOTATOR_SYSTEM_PROMPT. "{content}" is the
# only placeholder. Unlike USER_PROMPT, it starts with a one-line instruction
# and has no trailing newline after the closing tag.
ANNOTATOR_USER_PROMPT = """Analyze the following document and provide annotations in JSON format according to the annotation framework. Return only the JSON object.
<start_of_document>
{content}
<end_of_document>"""
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
# Default max length for the one_sentence_description field; it is the default
# value of create_annotation_response_model()'s only parameter.
# NOTE(review): presumably a character count (pydantic max_length) — confirm.
ONE_SENTENCE_DESCRIPTION_MAX_LENGTH = 200
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class ContentIntegrity(str, Enum):
    """Content completeness and technical quality"""

    # Single-select labels for the "content_integrity" property, listed from
    # most to least intact. Members are also str instances (str mixin), so they
    # compare equal to their plain label.
    COMPLETE = "complete"
    MOSTLY_COMPLETE = "mostly_complete"
    FRAGMENT = "fragment"
    SEVERELY_DEGRADED = "severely_degraded"
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
class ContentRatio(str, Enum):
    """Ratio of meaningful content vs navigation/UI elements"""

    # Single-select labels for the "content_ratio" property, listed from
    # content-only down to almost no content.
    COMPLETE_CONTENT = "complete_content"
    MOSTLY_CONTENT = "mostly_content"
    MIXED_CONTENT = "mixed_content"
    MOSTLY_NAVIGATION = "mostly_navigation"
    MINIMAL_CONTENT = "minimal_content"
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
class ContentLength(str, Enum):
    """Amount of substantive content"""

    # Single-select labels for the "content_length" property.
    # The word-count bands below follow the content_length line of
    # SYSTEM_PROMPT ("substantial 2k+|moderate 500-2k|brief 100-500|minimal <100").
    # The earlier inline comments here (500+ / 100-500 / 20-100 / <20) disagreed
    # with that runtime prompt.
    # NOTE(review): confirm the bands against property_descriptions.md.
    SUBSTANTIAL = "substantial"  # 2k+ words
    MODERATE = "moderate"  # 500-2k words
    BRIEF = "brief"  # 100-500 words
    MINIMAL = "minimal"  # <100 words
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
class ContentType(str, Enum):
    """Primary purpose and type of content"""

    # Labels for the multi-select "content_type" property (a document may carry
    # several). Order matches the label list in SYSTEM_PROMPT.
    ANALYTICAL = "analytical"
    INSTRUCTIONAL = "instructional"
    REFERENCE = "reference"
    PROCEDURAL = "procedural"
    QA_STRUCTURED = "qa_structured"
    CONVERSATIONAL = "conversational"
    CREATIVE = "creative"
    TRANSACTIONAL = "transactional"
    BOILERPLATE = "boilerplate"
    NEWS_REPORT = "news_report"
    OPINION_EDITORIAL = "opinion_editorial"
    REVIEW_CRITIQUE = "review_critique"
    TECHNICAL_DOCUMENTATION = "technical_documentation"
    SPECIFICATION_STANDARD = "specification_standard"
    LEGAL_DOCUMENT = "legal_document"
    PRESS_RELEASE = "press_release"
    STRUCTURED_DATA = "structured_data"
    SOURCE_CODE = "source_code"
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
class BusinessSector(str, Enum):
    """Industry domain(s) for sector classification (multi-select)"""

    # Labels for the multi-select "business_sector" property. Order matches the
    # label list in SYSTEM_PROMPT; GENERAL_INTEREST and OTHER are the two
    # catch-all labels at the end.
    ACADEMIC_RESEARCH = "academic_research"
    EDUCATION_SECTOR = "education_sector"
    TECHNOLOGY_SOFTWARE = "technology_software"
    HARDWARE_ELECTRONICS = "hardware_electronics"
    HEALTHCARE_MEDICAL = "healthcare_medical"
    PHARMACEUTICAL_BIOTECH = "pharmaceutical_biotech"
    FINANCIAL_SERVICES = "financial_services"
    LEGAL_SERVICES = "legal_services"
    GOVERNMENT_PUBLIC = "government_public"
    MANUFACTURING_INDUSTRIAL = "manufacturing_industrial"
    MINING_RESOURCES = "mining_resources"
    CHEMICALS_MATERIALS = "chemicals_materials"
    ENERGY_UTILITIES = "energy_utilities"
    RETAIL_COMMERCE = "retail_commerce"
    WHOLESALE_DISTRIBUTION = "wholesale_distribution"
    REAL_ESTATE_CONSTRUCTION = "real_estate_construction"
    TRANSPORTATION_LOGISTICS = "transportation_logistics"
    AUTOMOTIVE_INDUSTRY = "automotive_industry"
    TELECOMMUNICATIONS = "telecommunications"
    MEDIA_ENTERTAINMENT = "media_entertainment"
    ADVERTISING_MARKETING = "advertising_marketing"
    HOSPITALITY_TOURISM = "hospitality_tourism"
    AGRICULTURE_FOOD = "agriculture_food"
    ENVIRONMENTAL_SERVICES = "environmental_services"
    AEROSPACE_DEFENSE = "aerospace_defense"
    INSURANCE_INDUSTRY = "insurance_industry"
    NONPROFIT_NGO = "nonprofit_ngo"
    CONSULTING_PROFESSIONAL = "consulting_professional"
    HUMAN_RESOURCES = "human_resources"
    SECURITY_CYBER = "security_cyber"
    GAMING_INDUSTRY = "gaming_industry"
    GAMBLING_BETTING = "gambling_betting"
    TRAVEL_AVIATION = "travel_aviation"
    FOOD_BEVERAGE_HOSPITALITY = "food_beverage_hospitality"
    CONSUMER_GOODS = "consumer_goods"
    GENERAL_INTEREST = "general_interest"
    OTHER = "other"
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
class TechnicalContent(str, Enum):
    """Type and intensity of specialized technical knowledge"""

    # Labels for the multi-select "technical_content" property. Order matches
    # the label list in SYSTEM_PROMPT.
    CODE_HEAVY = "code_heavy"
    MATH_HEAVY = "math_heavy"
    SCIENTIFIC = "scientific"
    DATA_HEAVY = "data_heavy"
    ENGINEERING = "engineering"
    BASIC_TECHNICAL = "basic_technical"
    NON_TECHNICAL = "non_technical"
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
class InformationDensity(str, Enum):
    """Ratio of valuable information to redundancy and padding"""

    # Single-select labels for the "information_density" property, listed from
    # densest to emptiest.
    DENSE = "dense"
    ADEQUATE = "adequate"
    MODERATE = "moderate"
    THIN = "thin"
    EMPTY = "empty"
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
class ContentQuality(str, Enum):
    """Overall quality considering writing, value, and presentation"""

    # Single-select labels for the "content_quality" property, listed from best
    # to worst.
    EXCELLENT = "excellent"
    GOOD = "good"
    ADEQUATE = "adequate"
    POOR = "poor"
    UNACCEPTABLE = "unacceptable"
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
class AudienceLevel(str, Enum):
    """Intended sophistication level and background knowledge assumptions"""

    # Single-select labels for the "audience_level" property. Order matches the
    # label list in SYSTEM_PROMPT.
    EXPERT = "expert"
    ADVANCED = "advanced"
    GENERAL = "general"
    BEGINNER = "beginner"
    YOUTH = "youth"
    CHILDREN = "children"
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
class CommercialBias(str, Enum):
    """Commercial influence on objectivity and informational value"""

    # Single-select labels for the "commercial_bias" property, listed from no
    # promotional influence up to pure marketing.
    NONE = "none"
    MINIMAL = "minimal"
    MODERATE = "moderate"
    HEAVY = "heavy"
    PURE_MARKETING = "pure_marketing"
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
class ContentSafety(str, Enum):
    """Presence of inappropriate, harmful, or legally problematic content"""

    # Single-select labels for the "content_safety" property. Order matches the
    # label list in SYSTEM_PROMPT.
    SAFE = "safe"
    MILD_CONCERNS = "mild_concerns"
    NSFW = "nsfw"
    HARMFUL = "harmful"
    ILLEGAL = "illegal"
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
class EducationalValue(str, Enum):
    """Potential for teaching, learning, and knowledge transfer"""

    # Single-select labels for the "educational_value" property, listed from
    # highest to none.
    HIGH = "high"
    MODERATE = "moderate"
    BASIC = "basic"
    MINIMAL = "minimal"
    NONE = "none"
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
class ReasoningIndicators(str, Enum):
    """Presence and quality of logical reasoning and analysis"""

    # Single-select labels for the "reasoning_indicators" property, listed from
    # deepest reasoning to none.
    ANALYTICAL = "analytical"
    EXPLANATORY = "explanatory"
    BASIC_REASONING = "basic_reasoning"
    MINIMAL = "minimal"
    NONE = "none"
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
class RegionalRelevance(str, Enum):
    """Primary regional, cultural, or geopolitical sphere(s)"""

    # Labels for the multi-select "regional_relevance" property. The first
    # eleven name regions; the last three (GLOBAL, CULTURALLY_NEUTRAL,
    # INDETERMINATE) are non-regional labels.
    EUROPEAN = "european"
    NORTH_AMERICAN = "north_american"
    EAST_ASIAN = "east_asian"
    SOUTH_ASIAN = "south_asian"
    SOUTHEAST_ASIAN = "southeast_asian"
    MIDDLE_EASTERN = "middle_eastern"
    SUB_SAHARAN_AFRICAN = "sub_saharan_african"
    LATIN_AMERICAN = "latin_american"
    OCEANIAN = "oceanian"
    CENTRAL_ASIAN = "central_asian"
    RUSSIAN_SPHERE = "russian_sphere"
    GLOBAL = "global"
    CULTURALLY_NEUTRAL = "culturally_neutral"
    INDETERMINATE = "indeterminate"
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
class TimeSensitivity(str, Enum):
    """How time-sensitive the content is"""

    # Single-select labels for the "time_sensitivity" property, listed from
    # least to most time-bound.
    EVERGREEN = "evergreen"
    SLOWLY_CHANGING = "slowly_changing"
    REGULARLY_UPDATING = "regularly_updating"
    TIME_SENSITIVE = "time_sensitive"
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
class PiiPresence(str, Enum):
    """Presence of personally identifiable information"""

    # Single-select, two-valued label for the "pii_presence" property.
    NO_PII = "no_pii"
    CONTAINS_PII = "contains_pii"
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
class Country(str, Enum):
    """
    Country names for country relevance classification.
    Based on ISO 3166-1 standard - the authoritative international standard
    for country codes maintained by the International Organization for Standardization.

    Includes all 249 entities from ISO 3166-1: 193 UN member states,
    2 UN observer states, plus dependent territories and special areas.

    References:
    - https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes
    - https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_area
    """

    # NOTE(review): the docstring's "all 249 entities" does not match this
    # class, which defines 245 members (198 in the first section, 47 in the
    # second). The docstring text is left as-is because Enum docstrings can be
    # surfaced as descriptions in a pydantic-generated JSON schema — confirm
    # before rewording it.
    #
    # Values are lowercase snake_case English short names, not ISO alpha-2/3
    # codes. Every value equals its member name lowercased.

    # Sovereign states: the 193 UN member states and the 2 UN observer states
    # (PALESTINE, VATICAN_CITY), plus three further entries that are neither:
    # COOK_ISLANDS, NIUE and KOSOVO. Sorted alphabetically by common name
    # (NORTH_KOREA / SOUTH_KOREA are filed under "K").
    AFGHANISTAN = "afghanistan"
    ALBANIA = "albania"
    ALGERIA = "algeria"
    ANDORRA = "andorra"
    ANGOLA = "angola"
    ANTIGUA_AND_BARBUDA = "antigua_and_barbuda"
    ARGENTINA = "argentina"
    ARMENIA = "armenia"
    AUSTRALIA = "australia"
    AUSTRIA = "austria"
    AZERBAIJAN = "azerbaijan"
    BAHAMAS = "bahamas"
    BAHRAIN = "bahrain"
    BANGLADESH = "bangladesh"
    BARBADOS = "barbados"
    BELARUS = "belarus"
    BELGIUM = "belgium"
    BELIZE = "belize"
    BENIN = "benin"
    BHUTAN = "bhutan"
    BOLIVIA = "bolivia"
    BOSNIA_AND_HERZEGOVINA = "bosnia_and_herzegovina"
    BOTSWANA = "botswana"
    BRAZIL = "brazil"
    BRUNEI = "brunei"
    BULGARIA = "bulgaria"
    BURKINA_FASO = "burkina_faso"
    BURUNDI = "burundi"
    CABO_VERDE = "cabo_verde"
    CAMBODIA = "cambodia"
    CAMEROON = "cameroon"
    CANADA = "canada"
    CENTRAL_AFRICAN_REPUBLIC = "central_african_republic"
    CHAD = "chad"
    CHILE = "chile"
    CHINA = "china"
    COLOMBIA = "colombia"
    COMOROS = "comoros"
    CONGO = "congo"
    CONGO_DEMOCRATIC_REPUBLIC = "congo_democratic_republic"
    COOK_ISLANDS = "cook_islands"
    COSTA_RICA = "costa_rica"
    CROATIA = "croatia"
    CUBA = "cuba"
    CYPRUS = "cyprus"
    CZECH_REPUBLIC = "czech_republic"
    DENMARK = "denmark"
    DJIBOUTI = "djibouti"
    DOMINICA = "dominica"
    DOMINICAN_REPUBLIC = "dominican_republic"
    ECUADOR = "ecuador"
    EGYPT = "egypt"
    EL_SALVADOR = "el_salvador"
    EQUATORIAL_GUINEA = "equatorial_guinea"
    ERITREA = "eritrea"
    ESTONIA = "estonia"
    ESWATINI = "eswatini"
    ETHIOPIA = "ethiopia"
    FIJI = "fiji"
    FINLAND = "finland"
    FRANCE = "france"
    GABON = "gabon"
    GAMBIA = "gambia"
    GEORGIA = "georgia"
    GERMANY = "germany"
    GHANA = "ghana"
    GREECE = "greece"
    GRENADA = "grenada"
    GUATEMALA = "guatemala"
    GUINEA = "guinea"
    GUINEA_BISSAU = "guinea_bissau"
    GUYANA = "guyana"
    HAITI = "haiti"
    HONDURAS = "honduras"
    HUNGARY = "hungary"
    ICELAND = "iceland"
    INDIA = "india"
    INDONESIA = "indonesia"
    IRAN = "iran"
    IRAQ = "iraq"
    IRELAND = "ireland"
    ISRAEL = "israel"
    ITALY = "italy"
    IVORY_COAST = "ivory_coast"
    JAMAICA = "jamaica"
    JAPAN = "japan"
    JORDAN = "jordan"
    KAZAKHSTAN = "kazakhstan"
    KENYA = "kenya"
    KIRIBATI = "kiribati"
    NORTH_KOREA = "north_korea"
    SOUTH_KOREA = "south_korea"
    KOSOVO = "kosovo"
    KUWAIT = "kuwait"
    KYRGYZSTAN = "kyrgyzstan"
    LAOS = "laos"
    LATVIA = "latvia"
    LEBANON = "lebanon"
    LESOTHO = "lesotho"
    LIBERIA = "liberia"
    LIBYA = "libya"
    LIECHTENSTEIN = "liechtenstein"
    LITHUANIA = "lithuania"
    LUXEMBOURG = "luxembourg"
    MADAGASCAR = "madagascar"
    MALAWI = "malawi"
    MALAYSIA = "malaysia"
    MALDIVES = "maldives"
    MALI = "mali"
    MALTA = "malta"
    MARSHALL_ISLANDS = "marshall_islands"
    MAURITANIA = "mauritania"
    MAURITIUS = "mauritius"
    MEXICO = "mexico"
    MICRONESIA = "micronesia"
    MOLDOVA = "moldova"
    MONACO = "monaco"
    MONGOLIA = "mongolia"
    MONTENEGRO = "montenegro"
    MOROCCO = "morocco"
    MOZAMBIQUE = "mozambique"
    MYANMAR = "myanmar"
    NAMIBIA = "namibia"
    NAURU = "nauru"
    NEPAL = "nepal"
    NETHERLANDS = "netherlands"
    NEW_ZEALAND = "new_zealand"
    NICARAGUA = "nicaragua"
    NIGER = "niger"
    NIGERIA = "nigeria"
    NIUE = "niue"
    NORTH_MACEDONIA = "north_macedonia"
    NORWAY = "norway"
    OMAN = "oman"
    PAKISTAN = "pakistan"
    PALAU = "palau"
    PALESTINE = "palestine"  # UN Observer State
    PANAMA = "panama"
    PAPUA_NEW_GUINEA = "papua_new_guinea"
    PARAGUAY = "paraguay"
    PERU = "peru"
    PHILIPPINES = "philippines"
    POLAND = "poland"
    PORTUGAL = "portugal"
    QATAR = "qatar"
    ROMANIA = "romania"
    RUSSIA = "russia"
    RWANDA = "rwanda"
    SAINT_KITTS_AND_NEVIS = "saint_kitts_and_nevis"
    SAINT_LUCIA = "saint_lucia"
    SAINT_VINCENT_AND_THE_GRENADINES = "saint_vincent_and_the_grenadines"
    SAMOA = "samoa"
    SAN_MARINO = "san_marino"
    SAO_TOME_AND_PRINCIPE = "sao_tome_and_principe"
    SAUDI_ARABIA = "saudi_arabia"
    SENEGAL = "senegal"
    SERBIA = "serbia"
    SEYCHELLES = "seychelles"
    SIERRA_LEONE = "sierra_leone"
    SINGAPORE = "singapore"
    SLOVAKIA = "slovakia"
    SLOVENIA = "slovenia"
    SOLOMON_ISLANDS = "solomon_islands"
    SOMALIA = "somalia"
    SOUTH_AFRICA = "south_africa"
    SOUTH_SUDAN = "south_sudan"
    SPAIN = "spain"
    SRI_LANKA = "sri_lanka"
    SUDAN = "sudan"
    SURINAME = "suriname"
    SWEDEN = "sweden"
    SWITZERLAND = "switzerland"
    SYRIA = "syria"
    TAJIKISTAN = "tajikistan"
    TANZANIA = "tanzania"
    THAILAND = "thailand"
    TIMOR_LESTE = "timor_leste"
    TOGO = "togo"
    TONGA = "tonga"
    TRINIDAD_AND_TOBAGO = "trinidad_and_tobago"
    TUNISIA = "tunisia"
    TURKEY = "turkey"
    TURKMENISTAN = "turkmenistan"
    TUVALU = "tuvalu"
    UGANDA = "uganda"
    UKRAINE = "ukraine"
    UNITED_ARAB_EMIRATES = "united_arab_emirates"
    UNITED_KINGDOM = "united_kingdom"
    UNITED_STATES = "united_states"
    URUGUAY = "uruguay"
    UZBEKISTAN = "uzbekistan"
    VANUATU = "vanuatu"
    VATICAN_CITY = "vatican_city"  # UN Observer State
    VENEZUELA = "venezuela"
    VIETNAM = "vietnam"
    YEMEN = "yemen"
    ZAMBIA = "zambia"
    ZIMBABWE = "zimbabwe"

    # Dependent territories, special administrative regions and disputed or
    # special areas. The trailing comment names the associated state or status.
    ALAND_ISLANDS = "aland_islands"  # Finland
    AMERICAN_SAMOA = "american_samoa"  # United States
    ANGUILLA = "anguilla"  # United Kingdom
    ANTARCTICA = "antarctica"  # Antarctic Treaty
    ARUBA = "aruba"  # Netherlands
    ASCENSION_ISLAND = "ascension_island"  # United Kingdom
    BERMUDA = "bermuda"  # United Kingdom
    BRITISH_VIRGIN_ISLANDS = "british_virgin_islands"  # United Kingdom
    CAYMAN_ISLANDS = "cayman_islands"  # United Kingdom
    CHRISTMAS_ISLAND = "christmas_island"  # Australia
    COCOS_ISLANDS = "cocos_islands"  # Australia
    CURACAO = "curacao"  # Netherlands
    FALKLAND_ISLANDS = "falkland_islands"  # United Kingdom
    FAROE_ISLANDS = "faroe_islands"  # Denmark
    FRENCH_GUIANA = "french_guiana"  # France
    FRENCH_POLYNESIA = "french_polynesia"  # France
    GIBRALTAR = "gibraltar"  # United Kingdom
    GREENLAND = "greenland"  # Denmark
    GUADELOUPE = "guadeloupe"  # France
    GUAM = "guam"  # United States
    GUERNSEY = "guernsey"  # United Kingdom
    HONG_KONG = "hong_kong"  # China
    ISLE_OF_MAN = "isle_of_man"  # United Kingdom
    JERSEY = "jersey"  # United Kingdom
    MACAU = "macau"  # China
    MARTINIQUE = "martinique"  # France
    MAYOTTE = "mayotte"  # France
    MONTSERRAT = "montserrat"  # United Kingdom
    NEW_CALEDONIA = "new_caledonia"  # France
    NORFOLK_ISLAND = "norfolk_island"  # Australia
    NORTHERN_MARIANA_ISLANDS = "northern_mariana_islands"  # United States
    PITCAIRN_ISLANDS = "pitcairn_islands"  # United Kingdom
    PUERTO_RICO = "puerto_rico"  # United States
    REUNION = "reunion"  # France
    SAINT_BARTHELEMY = "saint_barthelemy"  # France
    SAINT_HELENA = "saint_helena"  # United Kingdom
    SAINT_MARTIN = "saint_martin"  # France
    SAINT_PIERRE_AND_MIQUELON = "saint_pierre_and_miquelon"  # France
    SINT_MAARTEN = "sint_maarten"  # Netherlands
    SVALBARD_AND_JAN_MAYEN = "svalbard_and_jan_mayen"  # Norway
    TAIWAN = "taiwan"  # China (disputed)
    TOKELAU = "tokelau"  # New Zealand
    TRISTAN_DA_CUNHA = "tristan_da_cunha"  # United Kingdom
    TURKS_AND_CAICOS_ISLANDS = "turks_and_caicos_islands"  # United Kingdom
    US_VIRGIN_ISLANDS = "us_virgin_islands"  # United States
    WALLIS_AND_FUTUNA = "wallis_and_futuna"  # France
    WESTERN_SAHARA = "western_sahara"  # Disputed
|
| 601 |
+
|
| 602 |
+
|
| 603 |
+
class CountryRelevanceSpecial(str, Enum):
    """
    Special values for country relevance classification from annotation guidelines.
    These are used when content doesn't relate to specific countries.
    """
    # NOTE(review): Pydantic appears to surface enum docstrings as the "description"
    # of the generated JSON Schema definition, so the docstring above may be part of
    # the prompt text - confirm before rewording it.

    # Content whose relevance sits above the level of individual countries.
    SUPRANATIONAL = "supranational"
    # Content with no country-specific relevance.
    NONE = "none"
|
| 610 |
+
|
| 611 |
+
|
| 612 |
+
def create_annotation_response_model(
    one_sentence_description_max_length: int = ONE_SENTENCE_DESCRIPTION_MAX_LENGTH,
) -> Type[BaseModel]:
    """
    Factory function to create an AnnotationResponse model with configurable max_length
    for the one_sentence_description field.

    A new model class is defined on every call, so callers that need different
    limits (this module builds one model with the default limit and one schema
    with a limit of 150) each get an independent class.

    Args:
        one_sentence_description_max_length: Maximum length for the one_sentence_description field.
            Defaults to ONE_SENTENCE_DESCRIPTION_MAX_LENGTH (200).

    Returns:
        A Pydantic model class with the specified configuration.
    """

    class _AnnotationResponse(BaseModel):
        """
        Property annotation pydantic model for LLM pretraining data.
        It captures all 18 properties as defined in the annotation guidelines for consistently identifying high-value content for language model training.
        """
        # NOTE(review): the docstring above and every Field description below are
        # presumably copied into the generated JSON Schema (and from there into the
        # annotator prompt) - treat them as runtime text and confirm before rewording.
        # Every field is required (`...`); multi-select fields are lists whose
        # min_length/max_length bound the number of selected values.

        # Property 1: Content Integrity
        content_integrity: ContentIntegrity = Field(
            ...,
            description="Completeness and technical quality of the content itself"
        )

        # Property 2: Content Ratio
        content_ratio: ContentRatio = Field(
            ...,
            description="Ratio of meaningful content vs navigation/UI elements"
        )

        # Property 3: Content Length
        content_length: ContentLength = Field(
            ...,
            description="Amount of substantive content, ignoring navigation and boilerplate"
        )

        # Property 4: One-Sentence Description
        # The only free-text field; its character cap comes from the factory argument.
        one_sentence_description: str = Field(
            ...,
            description="Ultra-short neutral description of the document. Exactly one sentence. Target 8–15 words (soft max 20). Neutral tone; avoid boilerplate intros and calls to action.",
            max_length=one_sentence_description_max_length,
        )

        # Property 5: Content Type (multi-select)
        content_type: List[ContentType] = Field(
            ...,
            description="Primary purpose and type of content - always return an array (one or more types)",
            min_length=1,
            max_length=5
        )

        # Property 6: Business Sector (multi-select)
        business_sector: List[BusinessSector] = Field(
            ...,
            description="Industry sector(s) - always return an array (one or more sectors)",
            min_length=1,
            max_length=10
        )

        # Property 7: Technical Content (multi-select)
        # Unlike the other multi-select fields, no upper bound on the item count is set here.
        technical_content: List[TechnicalContent] = Field(
            ...,
            description="Type and intensity of specialized technical knowledge - always return an array (one or more types)",
            min_length=1
        )

        # Property 8: Information Density
        information_density: InformationDensity = Field(
            ...,
            description="Ratio of valuable information to redundancy, padding, and repetition"
        )

        # Property 9: Content Quality
        content_quality: ContentQuality = Field(
            ...,
            description="Overall quality considering writing excellence, substantive value, and presentation"
        )

        # Property 10: Audience Level
        audience_level: AudienceLevel = Field(
            ...,
            description="Intended sophistication level and background knowledge assumptions"
        )

        # Property 11: Commercial Bias
        commercial_bias: CommercialBias = Field(
            ...,
            description="How much commercial interests influence objectivity and informational value"
        )

        # Property 12: Time Sensitivity
        time_sensitivity: TimeSensitivity = Field(
            ...,
            description="How time-sensitive the content is"
        )

        # Property 13: Content Safety
        content_safety: ContentSafety = Field(
            ...,
            description="Presence of inappropriate, harmful, or legally problematic content"
        )

        # Property 14: Educational Value
        educational_value: EducationalValue = Field(
            ...,
            description="Potential for teaching, learning, and knowledge transfer"
        )

        # Property 15: Reasoning Indicators
        reasoning_indicators: ReasoningIndicators = Field(
            ...,
            description="Presence and quality of logical reasoning, analysis, and explanatory content"
        )

        # Property 16: PII Presence
        pii_presence: PiiPresence = Field(
            ...,
            description="Whether the content contains personally identifiable information"
        )

        # Property 17: Regional Relevance (multi-select)
        regional_relevance: List[RegionalRelevance] = Field(
            ...,
            description="Primary regional, cultural, or geopolitical sphere(s) - always return an array (one or multiple regions)",
            min_length=1,
            max_length=3
        )

        # Property 18: Country Relevance (multi-select)
        # Each item is either a Country member or one of the CountryRelevanceSpecial values.
        country_relevance: List[Union[Country, CountryRelevanceSpecial]] = Field(
            ...,
            description="Specific country/countries the content mentions or is relevant for (or special values for supranational/non-country-specific) - always return an array (one or more countries/special values)",
            min_length=1,
            max_length=10
        )

        model_config = ConfigDict(
            validate_assignment=True,
            extra="forbid",  # Don't allow extra fields
            # Sample annotation attached to the generated JSON Schema under the "example" key.
            json_schema_extra={
                "example": {
                    "content_integrity": "complete",
                    "content_ratio": "mostly_content",
                    "content_length": "substantial",
                    "one_sentence_description": "API reference for payment endpoints and error codes.",
                    "content_type": ["analytical", "instructional"],
                    "business_sector": ["academic_research", "technology_software"],
                    "technical_content": ["scientific", "data_heavy"],
                    "information_density": "dense",
                    "content_quality": "excellent",
                    "audience_level": "expert",
                    "commercial_bias": "none",
                    "time_sensitivity": "slowly_changing",
                    "content_safety": "safe",
                    "educational_value": "high",
                    "reasoning_indicators": "analytical",
                    "pii_presence": "no_pii",
                    "regional_relevance": ["european"],
                    "country_relevance": ["germany"]
                }
            },
        )

    return _AnnotationResponse
|
| 779 |
+
|
| 780 |
+
|
| 781 |
+
def flatten_model_json_schema(schema: dict) -> dict:
    """Return a copy of a Pydantic JSON Schema with local ``$defs`` references inlined.

    - Every ``$ref`` of the form ``#/$defs/<name>`` is replaced by a resolved copy
      of that definition; a name missing from ``$defs`` resolves to an empty schema.
    - Keywords written next to a ``$ref`` are kept and override same-named keys of
      the inlined definition (shallow merge).
    - ``$ref`` values that do not point into ``#/$defs/`` are left in place.
    - All ``$defs`` keys, top-level or nested, are removed from the result.

    The input schema is not modified.
    """
    working = deepcopy(schema)
    definitions = working.pop("$defs", {})

    def _inline(node):
        # Lists are rebuilt element by element; scalars pass through unchanged.
        if isinstance(node, list):
            return [_inline(element) for element in node]
        if not isinstance(node, dict):
            return node

        # Plain object: recurse into the values, dropping nested $defs.
        if "$ref" not in node:
            return {key: _inline(value) for key, value in node.items() if key != "$defs"}

        target = node.get("$ref")
        siblings = _inline(
            {key: value for key, value in node.items() if key not in ("$ref", "$defs")}
        )

        # Anything other than a local "#/$defs/..." pointer is kept as a reference.
        if not (isinstance(target, str) and target.startswith("#/$defs/")):
            return {"$ref": target, **(siblings if isinstance(siblings, dict) else {})}

        definition_name = target.split("/")[-1]
        inlined = _inline(deepcopy(definitions.get(definition_name, {})))
        if isinstance(inlined, dict) and isinstance(siblings, dict):
            # Sibling keywords take precedence over the definition's own keys.
            return {**inlined, **siblings}
        return inlined

    return _inline(working)
|
| 812 |
+
|
| 813 |
+
|
| 814 |
+
def get_annotation_response_schema(
    use_country_enum: bool = True,
    flatten: bool = True,
    as_string: bool = False,
    minify: bool = True,
    one_sentence_description_max_length: int = ONE_SENTENCE_DESCRIPTION_MAX_LENGTH,
    compact_whitespace: bool = True
) -> Union[dict, str]:
    """
    Build the JSON Schema for `AnnotationResponse` with an option to avoid large country enums.

    Args:
        use_country_enum: If True (default), the schema uses enum definitions for
            `country_relevance` items as generated by Pydantic. If False,
            `country_relevance` becomes a list of plain strings (no enum) while the
            property's description lists the full set of valid values, and the
            now-unreferenced country enum definitions are removed from `$defs`.
            This avoids very large enum blocks for APIs that do not support them.
        flatten: If True (default), inline all local $defs via `flatten_model_json_schema`.
        as_string: If True, return the schema serialized as a JSON string instead of a dict.
        minify: Only used when `as_string` is True. If True (default), emit compact JSON
            with no extra whitespace to reduce token usage; if False, pretty-print
            with a 2-space indent.
        one_sentence_description_max_length: Maximum length applied to the
            `one_sentence_description` field of the generated model.
        compact_whitespace: If True (default), add an `x-guidance` directive to enforce
            compact JSON output with no tabs, newlines, or extra whitespace between
            tokens. This prevents models from generating whitespace-heavy malformed JSON.

    Returns:
        The schema as a dict, or as a JSON string when `as_string` is True.
    """
    schema = create_annotation_response_model(one_sentence_description_max_length).model_json_schema()

    # Add x-guidance directive for llguidance to enforce compact JSON (no tabs/newlines/whitespace)
    if compact_whitespace:
        schema["x-guidance"] = {"whitespace_flexible": False}

    if not use_country_enum:
        # Construct the list of valid values from the enums but do not emit them as enum types
        valid_values = [e.value for e in Country] + [e.value for e in CountryRelevanceSpecial]

        country_prop = schema.get("properties", {}).get("country_relevance")
        if isinstance(country_prop, dict):
            existing_description = country_prop.get("description", "")

            # Ensure the property is an array of strings without duplicating the long values list.
            # minItems and other constraints already present on the property are retained.
            country_prop["type"] = "array"
            country_prop["items"] = {"type": "string"}

            # The enum definitions are no longer referenced once `items` is a plain string.
            # Remove them so they are not emitted when the schema is left unflattened;
            # otherwise the large enum blocks this option is meant to avoid would remain.
            # NOTE(review): assumes Pydantic keys $defs by the enum class name - confirm.
            defs = schema.get("$defs")
            if isinstance(defs, dict):
                for enum_cls in (Country, CountryRelevanceSpecial):
                    defs.pop(enum_cls.__name__, None)
                if not defs:
                    schema.pop("$defs", None)

            # Put the full list of valid values only in the property description (not in items)
            values_text = f" Valid values: {', '.join(valid_values)}"
            if existing_description and "Valid values:" not in existing_description:
                country_prop["description"] = existing_description.rstrip() + values_text
            elif not existing_description:
                country_prop["description"] = values_text.strip()

    if flatten:
        schema = flatten_model_json_schema(schema)

    if as_string:
        if minify:
            return json.dumps(schema, separators=(",", ":"), ensure_ascii=False)
        return json.dumps(schema, indent=2, ensure_ascii=False)

    return schema
|
| 874 |
+
|
| 875 |
+
|
| 876 |
+
# Default AnnotationResponse model with default max_length.
# Built once at import time by calling the factory with no arguments.
AnnotationResponse = create_annotation_response_model()
|
| 878 |
+
|
| 879 |
+
|
| 880 |
+
# Marker appended on its own line to documents that were cut to the character limit.
# The tag name should match the one referred to in the annotation guidelines.
TRUNCATION_TAG = "<truncated_content>"


def truncate_content(content: str, max_content_chars: int) -> str:
    """Cut `content` to at most `max_content_chars` characters and flag the cut.

    Args:
        content: Document text to limit.
        max_content_chars: Character budget. Zero or a negative value disables
            truncation.

    Returns:
        `content` unchanged when truncation is disabled or the text already fits;
        otherwise the first `max_content_chars` characters followed by a newline
        and `TRUNCATION_TAG`.
    """
    truncation_disabled = max_content_chars <= 0
    if truncation_disabled or len(content) <= max_content_chars:
        return content

    kept_prefix = content[:max_content_chars]
    return kept_prefix + "\n" + TRUNCATION_TAG
|
| 887 |
+
|
| 888 |
+
|
| 889 |
+
# Load the annotation guideline text used to build the annotator system prompt.
# A file in the current working directory is used when present, as before. If it is
# missing, fall back to the copy next to this module so that importing from another
# directory does not fail.
_property_descriptions_path = Path("property_descriptions.md")
if not _property_descriptions_path.is_file() and "__file__" in globals():
    _property_descriptions_path = Path(__file__).resolve().parent / "property_descriptions.md"

# The guidelines contain non-ASCII characters, so decode explicitly as UTF-8 instead
# of relying on the platform's default encoding.
with open(_property_descriptions_path, "r", encoding="utf-8") as f:
    property_descriptions = f.read()
|
| 891 |
+
|
| 892 |
+
|
| 893 |
+
def create_messages(document_text: str, max_content_chars: int = 50_000) -> list[dict]:
    """Build the system/user chat messages for annotating one document.

    Args:
        document_text: Raw document text to annotate.
        max_content_chars: Character limit passed to `truncate_content` before the
            text is inserted into the user prompt.

    Returns:
        A two-element list: the system message carrying `SYSTEM_PROMPT`, followed by
        the user message carrying `USER_PROMPT` formatted with the (possibly
        truncated) document.
    """
    clipped_text = truncate_content(document_text, max_content_chars)

    user_turn = {"role": "user", "content": USER_PROMPT.format(content=clipped_text)}
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    return [system_turn, user_turn]
|
| 902 |
+
|
| 903 |
+
|
| 904 |
+
# Build the annotator system prompt once at import time: the schema is requested as a
# JSON string (minified and flattened by default) with a 150-character cap on
# one_sentence_description, then substituted into ANNOTATOR_SYSTEM_PROMPT together with
# the guideline text loaded from property_descriptions.md.
schema_str = get_annotation_response_schema(as_string=True, one_sentence_description_max_length=150)
annotator_system_prompt = ANNOTATOR_SYSTEM_PROMPT.format(json_schema=schema_str, property_descriptions=property_descriptions)
|
| 906 |
+
|
| 907 |
+
|
| 908 |
+
def create_annotator_messages(document_text: str, max_content_chars: int = 50_000) -> list[dict]:
    """Build the system/user chat messages for the annotator prompt variant.

    Args:
        document_text: Raw document text to annotate.
        max_content_chars: Character limit passed to `truncate_content` before the
            text is inserted into the user prompt.

    Returns:
        A two-element list: the system message carrying the module-level
        `annotator_system_prompt`, followed by the user message carrying
        `ANNOTATOR_USER_PROMPT` formatted with the (possibly truncated) document.
    """
    clipped_text = truncate_content(document_text, max_content_chars)

    user_turn = {"role": "user", "content": ANNOTATOR_USER_PROMPT.format(content=clipped_text)}
    system_turn = {"role": "system", "content": annotator_system_prompt}
    return [system_turn, user_turn]
|
property_descriptions.md
ADDED
|
@@ -0,0 +1,1182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Detailed Property Descriptions & Annotation Guidelines
|
| 2 |
+
|
| 3 |
+
### Core Content Properties
|
| 4 |
+
|
| 5 |
+
#### 1. Content Integrity
|
| 6 |
+
|
| 7 |
+
**What we're measuring**: Completeness and technical quality of the content itself, regardless of navigation ratio.
|
| 8 |
+
|
| 9 |
+
##### Values & Criteria:
|
| 10 |
+
|
| 11 |
+
**`complete`** - Full, intact content as intended
|
| 12 |
+
- Content appears complete with proper beginning, middle, and end
|
| 13 |
+
- All essential elements present (introduction, body, conclusion where appropriate)
|
| 14 |
+
- No obvious truncation or missing sections
|
| 15 |
+
- Example: Complete articles, full tutorials, intact documents
|
| 16 |
+
|
| 17 |
+
**`mostly_complete`** - Minor elements missing but core content intact
|
| 18 |
+
- Core content is complete but some secondary elements may be missing
|
| 19 |
+
- Minor truncation that doesn't affect main message
|
| 20 |
+
- Example: Article with truncated comments, missing sidebar content, partial author bio
|
| 21 |
+
|
| 22 |
+
**`fragment`** - Incomplete content, missing significant portions
|
| 23 |
+
- Missing introduction, conclusion, or substantial middle sections
|
| 24 |
+
- Truncated mid-sentence or mid-paragraph
|
| 25 |
+
- Content feels incomplete or cut off
|
| 26 |
+
- Example: Search result snippets, article excerpts, broken crawls, partial downloads
|
| 27 |
+
|
| 28 |
+
**`severely_degraded`** - Broken, unreadable, or corrupted content
|
| 29 |
+
- Encoding errors, scrambled text, missing characters
|
| 30 |
+
- Severely malformed HTML rendering as gibberish
|
| 31 |
+
- Technical corruption making content unreadable
|
| 32 |
+
- Example: �&$^%*@# characters, completely broken formatting, corrupted files
|
| 33 |
+
|
| 34 |
+
##### Key Decision Points:
|
| 35 |
+
- **Content completeness**: Does the content feel like a complete unit of information?
|
| 36 |
+
- **Technical integrity**: Is the content technically readable and properly formatted?
|
| 37 |
+
- **Fragment vs. complete**: Independent of navigation - is the actual content complete?
|
| 38 |
+
- **Degraded vs. fragment**: Degraded has technical issues; fragment is just incomplete
|
| 39 |
+
|
| 40 |
+
**Note**: Documents may end with the special tag `<truncated_content>`, indicating upstream length-based truncation due to processing constraints. Do not penalize Content Integrity due to this truncation signal; assess integrity based on the visible content's coherence and technical readability, ignoring the artificial cutoff.
|
| 41 |
+
|
| 42 |
+
---
|
| 43 |
+
|
| 44 |
+
#### 2. Content Ratio
|
| 45 |
+
|
| 46 |
+
**What we're measuring**: How much of the document is actual content vs. navigation, UI elements, and structural markup.
|
| 47 |
+
|
| 48 |
+
##### Values & Criteria:
|
| 49 |
+
|
| 50 |
+
**`complete_content`** - 90-100% meaningful content
|
| 51 |
+
- Full articles, papers, tutorials with minimal navigation
|
| 52 |
+
- Clean text with proper paragraphs and structure
|
| 53 |
+
- Example: A Wikipedia article, academic paper, complete blog post
|
| 54 |
+
|
| 55 |
+
**`mostly_content`** - 70-89% meaningful content
|
| 56 |
+
- Complete documents with some navigation elements (header, footer, sidebar)
|
| 57 |
+
- Minor UI elements that don't disrupt reading
|
| 58 |
+
- Example: News articles with standard website navigation
|
| 59 |
+
|
| 60 |
+
**`mixed_content`** - 40-69% meaningful content
|
| 61 |
+
- Significant navigation mixed throughout content
|
| 62 |
+
- Multiple sidebars, ads, or UI elements interrupting text
|
| 63 |
+
- Example: E-commerce product pages with reviews mixed with purchase options
|
| 64 |
+
|
| 65 |
+
**`mostly_navigation`** - 10-39% meaningful content
|
| 66 |
+
- Predominantly menus, links, headers, footers
|
| 67 |
+
- Content overwhelmed by structural elements
|
| 68 |
+
- Example: Site maps, navigation pages, heavily UI-focused pages
|
| 69 |
+
|
| 70 |
+
**`minimal_content`** - 0-9% meaningful content
|
| 71 |
+
- Almost entirely navigation, UI elements, or structural markup
|
| 72 |
+
- Very little readable content present
|
| 73 |
+
- Example: Empty pages, pure navigation menus, error pages with minimal text
|
| 74 |
+
|
| 75 |
+
##### Key Decision Points:
|
| 76 |
+
- Focus on the **ratio of readable text to navigation/UI elements**
|
| 77 |
+
- **Count only substantive content**, ignore boilerplate and structural elements
|
| 78 |
+
- **Mixed vs. mostly_navigation**: Can you read it as coherent content despite distractions?
|
| 79 |
+
|
| 80 |
+
---
|
| 81 |
+
|
| 82 |
+
#### 3. Content Length
|
| 83 |
+
|
| 84 |
+
**What we're measuring**: Amount of substantive content, ignoring navigation and boilerplate.
|
| 85 |
+
|
| 86 |
+
##### Values & Criteria:
|
| 87 |
+
|
| 88 |
+
**`substantial`** - 2,000+ words of meaningful content
|
| 89 |
+
- Long-form, comprehensive content that provides in-depth coverage of a topic
|
| 90 |
+
- Typically includes detailed analysis, multiple sections or chapters, extensive research, or thorough exploration of complex subjects
|
| 91 |
+
- Examples: White papers, research reports, e-books, long-form journalism
|
| 92 |
+
|
| 93 |
+
**`moderate`** - 500–1,999 words of meaningful content
|
| 94 |
+
- Standard-length content that offers meaningful coverage while remaining focused and digestible
|
| 95 |
+
- Balances depth with accessibility; provides enough detail to be informative without overwhelming readers
|
| 96 |
+
- Examples: Typical blog posts, news articles, product reviews, how-to guides
|
| 97 |
+
|
| 98 |
+
**`brief`** - 100–499 words of meaningful content
|
| 99 |
+
- Short, focused content that delivers key information quickly and efficiently
|
| 100 |
+
- Gets straight to the point while still providing value and context
|
| 101 |
+
- Examples: News briefs, product descriptions, FAQs, short blog posts
|
| 102 |
+
|
| 103 |
+
**`minimal`** - Under 100 words of meaningful content
|
| 104 |
+
- Very short content that provides only essential information or serves as a quick reference
|
| 105 |
+
- Designed for rapid consumption or specific micro-purposes
|
| 106 |
+
- Examples: Social media posts, announcements, abstracts, snippets, navigation pages
|
| 107 |
+
|
| 108 |
+
##### Measurement Tips:
|
| 109 |
+
- **Count only readable content of value**: include article body and substantive headings/captions; exclude headers/footers, menus/sidebars, related links, share/consent UI, pagination, ads, and boilerplate.
|
| 110 |
+
- **Focus on substantive information**, not filler words
|
| 111 |
+
- **Complete thoughts matter more than exact word counts**
|
| 112 |
+
- **Contextual adjustment**: Thresholds are guidelines and can be adjusted based on specific use cases and typical content. Academic contexts may shift ranges upward, while social media contexts may shift them downward.
|
| 113 |
+
|
| 114 |
+
---
|
| 115 |
+
|
| 116 |
+
### Content Classification
|
| 117 |
+
|
| 118 |
+
#### 4. One-Sentence Description
|
| 119 |
+
|
| 120 |
+
**What we're looking for**: A very short, neutral description of what the document contains.
|
| 121 |
+
|
| 122 |
+
##### Field:
|
| 123 |
+
|
| 124 |
+
**`one_sentence_description`**
|
| 125 |
+
- Ultra-short neutral description of the document
|
| 126 |
+
- Exactly one sentence
|
| 127 |
+
- Target length: <100 characters
|
| 128 |
+
- Focus on the main topic and, if useful, the document’s function
|
| 129 |
+
- Examples of functions: tutorial, policy, news report, product page, navigation page
|
| 130 |
+
- Neutral, descriptive tone (no hype or marketing language)
|
| 131 |
+
|
| 132 |
+
##### To Avoid:
|
| 133 |
+
- Boilerplate intros: "This document...", "This article...", "In this guide..."
|
| 134 |
+
- Calls to action: "Learn how to...", "Discover...", "Find out..."
|
| 135 |
+
- User-facing phrasing: "You will learn...", "How do I..."
|
| 136 |
+
- Non-essential details (dates, numbers) unless central to the topic
|
| 137 |
+
|
| 138 |
+
##### Examples:
|
| 139 |
+
- "Beginner tutorial on React hooks and basic state management."
|
| 140 |
+
- "News report on European Central Bank interest rate decisions."
|
| 141 |
+
- "Internal policy for customer data retention and deletion."
|
| 142 |
+
- "API reference for payment processing endpoints and error codes."
|
| 143 |
+
- "Research paper analyzing housing price trends in major US cities."
|
| 144 |
+
- "FAQ answering common questions about employee parental leave."
|
| 145 |
+
- "Opinion essay arguing for stricter international climate change legislation."
|
| 146 |
+
|
| 147 |
+
##### Examples for low-quality or problematic documents (still annotate):
|
| 148 |
+
- "Fragment of article discussing proposed changes to European data privacy laws."
|
| 149 |
+
- "Keyword-stuffed promotional page about cheap car insurance quotes."
|
| 150 |
+
- "Website navigation page listing links to product categories and help pages."
|
| 151 |
+
- "Error page explaining that the requested resource could not be found."
|
| 152 |
+
- "Affiliate landing page promoting multiple online casino bonus offers."
|
| 153 |
+
- "Corrupted text with no identifiable topic or meaningful content."
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
#### 5. Content Type
|
| 157 |
+
|
| 158 |
+
**What we're measuring**: The functional structure and purpose of content.
|
| 159 |
+
|
| 160 |
+
**Multi-type content**: Content can be assigned multiple type labels if it genuinely serves multiple purposes. Choose ALL applicable types rather than forcing a single primary choice. Always output an array for this property, even if only one type applies.
|
| 161 |
+
|
| 162 |
+
##### Values & Criteria:
|
| 163 |
+
|
| 164 |
+
**`analytical`** - In-depth analysis, research, and critical examination
|
| 165 |
+
- Provides detailed analysis or research on a topic
|
| 166 |
+
- Develops arguments, evaluates evidence, or presents findings
|
| 167 |
+
- Example: Research analysis, investigative reports, academic articles, expert commentary
|
| 168 |
+
|
| 169 |
+
**`instructional`** - Teaching and how-to content
|
| 170 |
+
- Explicitly teaches skills, concepts, or procedures
|
| 171 |
+
- Step-by-step guidance or educational explanations
|
| 172 |
+
- Example: Tutorials, how-to guides, educational content, training materials
|
| 173 |
+
|
| 174 |
+
**`reference`** - Lookup materials, definitions, specifications
|
| 175 |
+
- Designed for looking up specific information rather than reading through
|
| 176 |
+
- Often organized alphabetically, categorically, or as lists
|
| 177 |
+
- Example: Dictionaries, encyclopedias, API references, product catalogs
|
| 178 |
+
|
| 179 |
+
**`procedural`** - Step-by-step processes and procedures
|
| 180 |
+
- Sequential instructions or workflows
|
| 181 |
+
- Process documentation with clear steps
|
| 182 |
+
- Example: Recipes, installation guides, standard operating procedures, workflows
|
| 183 |
+
|
| 184 |
+
**`qa_structured`** - Structured question-answer content
|
| 185 |
+
- Formal Q&A format with clear questions and answers
|
| 186 |
+
- Often expert responses to specific questions
|
| 187 |
+
- Example: Stack Overflow, FAQ sections, structured Q&A sites
|
| 188 |
+
|
| 189 |
+
**`conversational`** - Multi-party or turn-based dialogues (humans, bots, or both)
|
| 190 |
+
- Casual or structured conversations between two or more participants
|
| 191 |
+
- May include human–AI chats, forum threads, or comment chains
|
| 192 |
+
- Example: Reddit threads, forum discussions, support chats, assistant chat logs
|
| 193 |
+
|
| 194 |
+
**`creative`** - Entertainment, artistic, fictional content
|
| 195 |
+
- Primary purpose is entertainment or artistic expression
|
| 196 |
+
- Not primarily informational or instructional
|
| 197 |
+
- Example: Short stories, poems, movie reviews, game content, fiction
|
| 198 |
+
|
| 199 |
+
**`transactional`** - Commercial, shopping, service-oriented
|
| 200 |
+
- Primary purpose is to facilitate a transaction or service
|
| 201 |
+
- Focuses on products, services, or business processes
|
| 202 |
+
- Example: Product listings, service descriptions, checkout pages
|
| 203 |
+
|
| 204 |
+
**`boilerplate`** - Legal, policy, standard template text
|
| 205 |
+
- Standard legal or policy language
|
| 206 |
+
- Often repeated across multiple sites with minimal variation
|
| 207 |
+
- Example: Terms of service, privacy policies, disclaimers, cookie banners, standard notices
|
| 208 |
+
|
| 209 |
+
**`news_report`** - Straight reporting of events with minimal analysis
|
| 210 |
+
- Describes events or facts in a neutral, descriptive tone
|
| 211 |
+
- Time-bound news, updates, or reports
|
| 212 |
+
- Example: Wire-service news articles, breaking-news updates
|
| 213 |
+
|
| 214 |
+
**`opinion_editorial`** - Persuasive/opinionated commentary or editorials
|
| 215 |
+
- Expresses a stance or argument; aims to persuade
|
| 216 |
+
- May cite evidence but prioritizes viewpoint
|
| 217 |
+
- Example: Op-eds, opinion columns, personal essays with clear stance
|
| 218 |
+
|
| 219 |
+
**`review_critique`** - Evaluative reviews of products, media, or services
|
| 220 |
+
- Provides judgments, ratings, or critiques
|
| 221 |
+
- May include pros/cons, scoring systems
|
| 222 |
+
- Example: Product reviews, film/book critiques, app store reviews (long-form)
|
| 223 |
+
|
| 224 |
+
**`technical_documentation`** - Manuals, API docs, developer guides, READMEs
|
| 225 |
+
- Primary goal is to instruct usage of software/hardware/APIs
|
| 226 |
+
- Includes reference sections, examples, parameters, version notes
|
| 227 |
+
- Example: API reference, library README, user manual
|
| 228 |
+
|
| 229 |
+
**`specification_standard`** - Normative standards and formal specifications
|
| 230 |
+
- Defines requirements, must/shall language, compliance criteria
|
| 231 |
+
- Maintained by standards bodies or authoritative groups
|
| 232 |
+
- Example: RFCs, ISO standards, formal protocol specs
|
| 233 |
+
|
| 234 |
+
**`legal_document`** - Statutes, case law, contracts, regulatory texts
|
| 235 |
+
- Binding or authoritative legal content
|
| 236 |
+
- Formal legal language and structure
|
| 237 |
+
- Example: Court opinions, legislation, contracts, regulatory rules
|
| 238 |
+
|
| 239 |
+
**`press_release`** - Organization-issued announcements and PR materials
|
| 240 |
+
- Promotional announcements framed as information
|
| 241 |
+
- Quotes from executives, product/service announcements
|
| 242 |
+
- Example: Company press releases, launch announcements
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
**`structured_data`** - Tables, datasets, indices, catalogs with minimal prose
|
| 246 |
+
- Predominantly tabular/listed data meant for lookup
|
| 247 |
+
- Minimal narrative or explanatory text
|
| 248 |
+
- Example: Product catalogs, schedules, statistical tables
|
| 249 |
+
|
| 250 |
+
**`source_code`** - Code listings as primary content
|
| 251 |
+
- Dominant content is program source code or scripts
|
| 252 |
+
- May include lightweight comments or snippets without narrative
|
| 253 |
+
- Example: Code files, gist-like pages, competitive programming solutions
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
##### Multi-Type Examples:
|
| 257 |
+
- **Tutorial that analyzes different approaches** → `["instructional", "analytical"]`
|
| 258 |
+
- **Educational reference manual** → `["instructional", "reference"]`
|
| 259 |
+
- **Research paper with step-by-step methodology** → `["analytical", "procedural"]`
|
| 260 |
+
- **Q&A site with analytical responses** → `["qa_structured", "analytical"]`
|
| 261 |
+
- **API guide with examples** → `["technical_documentation", "reference", "instructional"]`
|
| 262 |
+
- **RFC with rationale** → `["specification_standard", "analytical"]`
|
| 263 |
+
- **Film review with interview snippets** → `["review_critique", "conversational"]`
|
| 264 |
+
- **Helpdesk chat with an AI** → `["conversational", "transactional"]`
|
| 265 |
+
- **Breaking news explainer** → `["news_report", "analytical"]`
|
| 266 |
+
|
| 267 |
+
---
|
| 268 |
+
|
| 269 |
+
#### 6. Business Sector
|
| 270 |
+
|
| 271 |
+
**What we're measuring**: Business sector(s) or industry domain(s) for training sector-specific LLMs.
|
| 272 |
+
|
| 273 |
+
**Multi-sector content**: Content can be assigned multiple sector labels if it genuinely spans multiple industries. Choose ALL applicable sectors rather than forcing a single primary choice or using "other". Always output an array for this property, even if only one sector applies.
|
| 274 |
+
|
| 275 |
+
##### Values & Criteria:
|
| 276 |
+
|
| 277 |
+
**`academic_research`** - Scholarly and research content
|
| 278 |
+
- Peer-reviewed publications, academic papers
|
| 279 |
+
- University-affiliated research and scholarship
|
| 280 |
+
- Formal academic discourse and methodology
|
| 281 |
+
- Example: Journal articles, conference papers, academic books, dissertations
|
| 282 |
+
|
| 283 |
+
**`education_sector`** - Educational institutions and pedagogy
|
| 284 |
+
- K-12 education, higher education administration
|
| 285 |
+
- Educational technology, curriculum development
|
| 286 |
+
- Teaching methodologies and educational policy
|
| 287 |
+
- Example: School curricula, educational policy papers, teaching resources, edtech content
|
| 288 |
+
|
| 289 |
+
**`technology_software`** - Software and information technology
|
| 290 |
+
- Software development, programming, IT services
|
| 291 |
+
- Digital products, platforms, and technology companies
|
| 292 |
+
- Computer science and software engineering
|
| 293 |
+
- Example: Software documentation, tech company content, programming guides, IT industry analysis
|
| 294 |
+
|
| 295 |
+
**`hardware_electronics`** - Hardware devices and electronics industry
|
| 296 |
+
- Semiconductors, consumer electronics, embedded systems, hardware design
|
| 297 |
+
- Electronics manufacturing and supply chains
|
| 298 |
+
- Example: Chip design docs, hardware datasheets, device manuals
|
| 299 |
+
|
| 300 |
+
**`healthcare_medical`** - Healthcare and medical sector
|
| 301 |
+
- Medical research, clinical practice, healthcare delivery
|
| 302 |
+
- Hospitals, medical devices, healthcare policy
|
| 303 |
+
- Public health and wellness
|
| 304 |
+
- Example: Medical journals, clinical guidelines, healthcare administration, wellness content
|
| 305 |
+
|
| 306 |
+
**`pharmaceutical_biotech`** - Pharmaceutical and biotechnology
|
| 307 |
+
- Drug development, clinical trials, biotech research
|
| 308 |
+
- Pharmaceutical industry, biotechnology companies
|
| 309 |
+
- Life sciences and molecular biology applications
|
| 310 |
+
- Example: Drug research papers, clinical trial reports, biotech industry analysis
|
| 311 |
+
|
| 312 |
+
**`financial_services`** - Banking and financial services
|
| 313 |
+
- Banking, investment, insurance, financial planning
|
| 314 |
+
- Financial markets, fintech, payment systems
|
| 315 |
+
- Asset management and financial advisory
|
| 316 |
+
- Example: Financial analysis, banking documentation, investment guides
|
| 317 |
+
|
| 318 |
+
**`legal_services`** - Legal sector and jurisprudence
|
| 319 |
+
- Law firms, legal practice, court systems
|
| 320 |
+
- Legal education, regulatory compliance
|
| 321 |
+
- Litigation, contracts, legal advisory
|
| 322 |
+
- Example: Legal briefs, court opinions, legal analysis, compliance guides
|
| 323 |
+
|
| 324 |
+
**`government_public`** - Government and public administration
|
| 325 |
+
- Government agencies, public policy, civic services
|
| 326 |
+
- Regulatory bodies, public administration
|
| 327 |
+
- Political institutions and governance
|
| 328 |
+
- Example: Government reports, policy documents, regulatory filings, civic information
|
| 329 |
+
|
| 330 |
+
**`manufacturing_industrial`** - Manufacturing and heavy industry
|
| 331 |
+
- Industrial production, manufacturing processes
|
| 332 |
+
- Supply chain, logistics, industrial equipment
|
| 333 |
+
- Factory operations and industrial engineering
|
| 334 |
+
- Example: Manufacturing specs, industrial reports, supply chain analysis, production guides
|
| 335 |
+
|
| 336 |
+
**`mining_resources`** - Mining and natural resources
|
| 337 |
+
- Exploration, extraction, and processing of minerals and resources
|
| 338 |
+
- Resource markets and operations (metals, rare earths)
|
| 339 |
+
- Example: Mining reports, resource exploration docs, commodity operations
|
| 340 |
+
|
| 341 |
+
**`chemicals_materials`** - Chemicals and advanced materials
|
| 342 |
+
- Petrochemicals, specialty chemicals, polymers, composites, advanced materials
|
| 343 |
+
- Safety data sheets (SDS), process chemistry, materials science
|
| 344 |
+
- Example: Material datasheets, REACH documentation, chemical process guides
|
| 345 |
+
|
| 346 |
+
**`energy_utilities`** - Energy and utilities sector
|
| 347 |
+
- Power generation, renewable energy, oil and gas
|
| 348 |
+
- Electric utilities, water services, waste management
|
| 349 |
+
- Energy infrastructure and grid management
|
| 350 |
+
- Example: Energy industry reports, utility regulations, renewable energy research
|
| 351 |
+
|
| 352 |
+
**`retail_commerce`** - Retail and e-commerce
|
| 353 |
+
- Retail operations, e-commerce platforms
|
| 354 |
+
- Consumer goods distribution, merchandising
|
| 355 |
+
- Retail technology and customer experience
|
| 356 |
+
- Example: Retail industry analysis, e-commerce guides, merchandising strategies
|
| 357 |
+
|
| 358 |
+
**`wholesale_distribution`** - Wholesale trade and distribution
|
| 359 |
+
- B2B wholesale, distributors, procurement, inventory and fulfillment
|
| 360 |
+
- Supply relationships between manufacturers and retailers
|
| 361 |
+
- Example: Distributor catalogs, wholesale operations, procurement guides
|
| 362 |
+
|
| 363 |
+
**`real_estate_construction`** - Real estate and construction
|
| 364 |
+
- Property development, construction industry
|
| 365 |
+
- Real estate markets, property management
|
| 366 |
+
- Architecture and building services
|
| 367 |
+
- Example: Real estate analysis, construction specifications, property guides
|
| 368 |
+
|
| 369 |
+
**`transportation_logistics`** - Transportation and logistics
|
| 370 |
+
- Airlines, shipping, freight, public transit
|
| 371 |
+
- Logistics operations, supply chain transportation
|
| 372 |
+
- Vehicle fleet management, transportation infrastructure
|
| 373 |
+
- Example: Logistics guides, transportation planning, shipping documentation
|
| 374 |
+
|
| 375 |
+
**`travel_aviation`** - Travel industry and commercial aviation
|
| 376 |
+
- Airlines, airports, OTA platforms, hospitality travel operations
|
| 377 |
+
- Route planning, airline commercial, loyalty, IATA regulations
|
| 378 |
+
- Example: Airline scheduling, fare rules, OTA partner docs
|
| 379 |
+
|
| 380 |
+
**`automotive_industry`** - Automotive manufacturing and services
|
| 381 |
+
- Vehicle manufacturers, automotive suppliers
|
| 382 |
+
- Automotive technology, electric vehicles
|
| 383 |
+
- Dealerships and automotive services
|
| 384 |
+
- Example: Automotive engineering docs, vehicle technology papers, industry analysis
|
| 385 |
+
|
| 386 |
+
**`telecommunications`** - Telecommunications industry
|
| 387 |
+
- Telecom operators, network infrastructure
|
| 388 |
+
- Mobile services, broadband, satellite communications
|
| 389 |
+
- Telecommunications equipment and technology
|
| 390 |
+
- Example: Telecom industry reports, network specifications, 5G technology papers
|
| 391 |
+
|
| 392 |
+
**`media_entertainment`** - Media and entertainment industry
|
| 393 |
+
- Film, television, music, gaming industries
|
| 394 |
+
- Publishing, news media, content creation
|
| 395 |
+
- Streaming services and digital media
|
| 396 |
+
- Example: Entertainment industry analysis, media studies, content strategy
|
| 397 |
+
|
| 398 |
+
**`gaming_industry`** - Video games and interactive entertainment
|
| 399 |
+
- Game development, studios, engines, esports, live ops
|
| 400 |
+
- Monetization models, community management, platform ecosystems
|
| 401 |
+
- Example: Patch notes, game design docs, esports operations
|
| 402 |
+
|
| 403 |
+
**`gambling_betting`** - Gambling, betting, and online casinos
|
| 404 |
+
- Sportsbooks, casino games, lotteries, poker rooms
|
| 405 |
+
- Affiliate landing pages, bonus/promotions, tipster content
|
| 406 |
+
- Often high commercial bias and promotional framing
|
| 407 |
+
|
| 408 |
+
**`advertising_marketing`** - Advertising, marketing, and PR
|
| 409 |
+
- Brand strategy, campaign planning, performance marketing, martech
|
| 410 |
+
- Agencies, in-house marketing, PR communications
|
| 411 |
+
- Example: Campaign briefs, media plans, PR strategies
|
| 412 |
+
|
| 413 |
+
**`hospitality_tourism`** - Hospitality and tourism sector
|
| 414 |
+
- Hotels, restaurants, travel services
|
| 415 |
+
- Tourism industry, destination management
|
| 416 |
+
- Event planning and hospitality services
|
| 417 |
+
- Example: Tourism studies, hospitality management, travel industry reports
|
| 418 |
+
|
| 419 |
+
**`food_beverage_hospitality`** - Food & beverage and restaurant operations
|
| 420 |
+
- Restaurant ops, menu engineering, supply chain, QSR/fast casual
|
| 421 |
+
- Food safety, compliance, procurement for F&B
|
| 422 |
+
- Example: Restaurant training manuals, HACCP docs, vendor specs
|
| 423 |
+
|
| 424 |
+
**`agriculture_food`** - Agriculture and food production
|
| 425 |
+
- Farming, agricultural technology, food processing
|
| 426 |
+
- Agricultural supply chain, food safety
|
| 427 |
+
- Agribusiness and agricultural policy
|
| 428 |
+
- Example: Agricultural research, food industry reports, farming guides
|
| 429 |
+
|
| 430 |
+
**`environmental_services`** - Environmental and sustainability services
|
| 431 |
+
- Environmental consulting, ESG reporting, sustainability programs
|
| 432 |
+
- Waste management services, remediation, impact assessments
|
| 433 |
+
- Example: ESG reports, environmental impact assessments, sustainability frameworks
|
| 434 |
+
|
| 435 |
+
**`aerospace_defense`** - Aerospace and defense industry
|
| 436 |
+
- Aircraft manufacturing, space technology
|
| 437 |
+
- Defense contractors, military systems
|
| 438 |
+
- Aviation and space exploration
|
| 439 |
+
- Example: Aerospace engineering papers, defense industry analysis, aviation guides
|
| 440 |
+
|
| 441 |
+
**`insurance_industry`** - Insurance sector
|
| 442 |
+
- Life, health, property, and casualty insurance
|
| 443 |
+
- Reinsurance, actuarial science, risk assessment
|
| 444 |
+
- Insurance technology and underwriting
|
| 445 |
+
- Example: Actuarial studies, insurance policy analysis, risk management guides
|
| 446 |
+
|
| 447 |
+
**`nonprofit_ngo`** - Nonprofit and NGO sector
|
| 448 |
+
- Charitable organizations, international development
|
| 449 |
+
- Social services, humanitarian organizations
|
| 450 |
+
- Foundations and philanthropic institutions
|
| 451 |
+
- Example: NGO reports, nonprofit management, development studies
|
| 452 |
+
|
| 453 |
+
**`consulting_professional`** - Professional services and consulting
|
| 454 |
+
- Management consulting, accounting firms
|
| 455 |
+
- Business advisory, professional services firms
|
| 456 |
+
- Corporate strategy and business transformation
|
| 457 |
+
- Example: Consulting reports, professional services guides, business strategy papers
|
| 458 |
+
|
| 459 |
+
**`human_resources`** - HR and people operations
|
| 460 |
+
- Talent acquisition, compensation & benefits, performance management, L&D
|
| 461 |
+
- HR tech, workforce planning, organizational development
|
| 462 |
+
- Example: HR policy docs, job frameworks, talent strategy
|
| 463 |
+
|
| 464 |
+
**`security_cyber`** - Security and cybersecurity
|
| 465 |
+
- Information security, threat intelligence, risk management, compliance (e.g., SOC2)
|
| 466 |
+
- Physical security operations and incident response
|
| 467 |
+
- Example: Security guidelines, incident playbooks, vulnerability reports
|
| 468 |
+
|
| 469 |
+
**`consumer_goods`** - Consumer products and CPG
|
| 470 |
+
- Fast-moving consumer goods, household products
|
| 471 |
+
- Personal care, food and beverage brands
|
| 472 |
+
- Consumer product development and marketing
|
| 473 |
+
- Example: CPG industry analysis, product development docs, consumer research
|
| 474 |
+
|
| 475 |
+
**`general_interest`** - General audience content
|
| 476 |
+
- Content for broad audiences without sector focus
|
| 477 |
+
- General knowledge and miscellaneous topics
|
| 478 |
+
- Cross-sector or sector-agnostic content
|
| 479 |
+
- Example: General magazines, broad interest content, lifestyle articles
|
| 480 |
+
|
| 481 |
+
**`other`** - Highly specialized or unclassifiable
|
| 482 |
+
- Highly specialized niches not covered by existing sectors
|
| 483 |
+
- Content with genuinely unclear sector classification
|
| 484 |
+
- Unique content types that don't map to any defined sector
|
| 485 |
+
- Example: Highly specialized technical niches, unique content formats
|
| 486 |
+
|
| 487 |
+
##### Multi-Sector Examples:
|
| 488 |
+
- **Medical device regulations** → `["healthcare_medical", "pharmaceutical_biotech", "government_public"]`
|
| 489 |
+
- **Fintech software documentation** → `["financial_services", "technology_software"]`
|
| 490 |
+
- **Agricultural biotechnology research** → `["agriculture_food", "pharmaceutical_biotech"]`
|
| 491 |
+
|
| 492 |
+
---
|
| 493 |
+
|
| 494 |
+
#### 7. Technical Content
|
| 495 |
+
|
| 496 |
+
**What we're measuring**: Type and intensity of specialized technical knowledge.
|
| 497 |
+
|
| 498 |
+
**Multi-technical content**: Content can be assigned multiple technical content labels if it genuinely combines multiple technical domains. Choose ALL applicable technical types rather than forcing a single primary choice. Always output an array for this property, even if only one technical type applies.
|
| 499 |
+
|
| 500 |
+
##### Values & Criteria:
|
| 501 |
+
|
| 502 |
+
**`code_heavy`** - Significant programming content
|
| 503 |
+
- Multiple code examples, algorithms, or implementations
|
| 504 |
+
- Technical programming concepts and methodologies
|
| 505 |
+
- Software development focus
|
| 506 |
+
- Example: Programming tutorials, API documentation, software guides
|
| 507 |
+
|
| 508 |
+
**`math_heavy`** - Substantial mathematical content
|
| 509 |
+
- Mathematical equations, proofs, or statistical analysis
|
| 510 |
+
- Quantitative analysis and mathematical reasoning
|
| 511 |
+
- Mathematical concepts and methodologies
|
| 512 |
+
- Example: Mathematical papers, statistical analysis, quantitative research
|
| 513 |
+
|
| 514 |
+
**`scientific`** - Research and scientific methodology content
|
| 515 |
+
- Scientific research findings, experimental data
|
| 516 |
+
- Scientific methodology and analysis
|
| 517 |
+
- Peer-reviewed research content
|
| 518 |
+
- Example: Research papers, scientific studies, experimental reports
|
| 519 |
+
|
| 520 |
+
**`data_heavy`** - Substantial datasets, tables, and data analysis
|
| 521 |
+
- Contains significant data tables, charts, or datasets
|
| 522 |
+
- Focus on data interpretation and analysis
|
| 523 |
+
- Statistical content with data presentations
|
| 524 |
+
- Example: Research data, statistical reports, data analysis, survey results
|
| 525 |
+
|
| 526 |
+
**`engineering`** - Engineering and applied technical content
|
| 527 |
+
- Engineering design, systems, and applied technical solutions
|
| 528 |
+
- Technical specifications for physical systems
|
| 529 |
+
- Non-software engineering disciplines
|
| 530 |
+
- Example: Mechanical engineering, civil engineering, technical specifications, design documents
|
| 531 |
+
|
| 532 |
+
**`basic_technical`** - Some technical elements but not dominant
|
| 533 |
+
- Light technical content mixed with general explanations
|
| 534 |
+
- Technical concepts explained for general audience
|
| 535 |
+
- Example: Technology articles for general audience, basic technical explanations
|
| 536 |
+
|
| 537 |
+
**`non_technical`** - No significant technical content
|
| 538 |
+
- General audience content without specialized technical knowledge
|
| 539 |
+
- No programming, mathematical, engineering, or scientific focus
|
| 540 |
+
- Example: General articles, humanities content, basic informational content
|
| 541 |
+
|
| 542 |
+
##### Multi-Technical Examples:
|
| 543 |
+
- **Data science tutorial with code examples** → `["code_heavy", "math_heavy", "data_heavy"]`
|
| 544 |
+
- **Engineering research with statistical analysis** → `["engineering", "scientific", "data_heavy"]`
|
| 545 |
+
- **Computational biology paper** → `["code_heavy", "scientific"]`
|
| 546 |
+
|
| 547 |
+
---
|
| 548 |
+
|
| 549 |
+
### Quality and Value Assessment
|
| 550 |
+
|
| 551 |
+
#### 8. Content Quality
|
| 552 |
+
|
| 553 |
+
**What we're measuring**: Overall quality of content considering writing excellence, substantive value, and presentation quality regardless of authorship origin.
|
| 554 |
+
|
| 555 |
+
##### Values & Criteria:
|
| 556 |
+
|
| 557 |
+
**`excellent`** - Outstanding quality across all dimensions
|
| 558 |
+
- Sophisticated writing with varied sentence structures and engaging style
|
| 559 |
+
- Rich, appropriate vocabulary with error-free grammar and punctuation
|
| 560 |
+
- High substantive value with clear insights or information
|
| 561 |
+
- Professional presentation and formatting
|
| 562 |
+
- Natural flow and logical organization
|
| 563 |
+
- Example: High-quality publications, expert analyses, polished educational content, well-crafted professional documents
|
| 564 |
+
|
| 565 |
+
**`good`** - High quality with minor imperfections
|
| 566 |
+
- Grammatically correct with proper sentence structure
|
| 567 |
+
- Appropriate vocabulary and tone for content type
|
| 568 |
+
- Solid substantive value and clear information
|
| 569 |
+
- Good organization and readable flow
|
| 570 |
+
- Only occasional minor issues (1-2 typos per section)
|
| 571 |
+
- Example: Quality journalism, professional websites, well-written blog posts, solid educational materials
|
| 572 |
+
|
| 573 |
+
**`adequate`** - Acceptable quality for most purposes
|
| 574 |
+
- Generally clear and understandable writing
|
| 575 |
+
- Some grammatical errors but meaning remains clear
|
| 576 |
+
- Reasonable substantive value though may lack depth
|
| 577 |
+
- Basic organization and structure present
|
| 578 |
+
- Minor formatting or presentation issues
|
| 579 |
+
- Example: Casual blogs, user reviews, basic informational content, simple guides
|
| 580 |
+
|
| 581 |
+
**`poor`** - Significant quality issues impacting utility
|
| 582 |
+
- Multiple errors affecting comprehension or credibility
|
| 583 |
+
- Unclear expression, confusing organization, or awkward phrasing
|
| 584 |
+
- Limited substantive value or questionable information
|
| 585 |
+
- Major formatting problems or unprofessional presentation
|
| 586 |
+
- Difficult to extract reliable information
|
| 587 |
+
- Example: Low-quality web content, poorly edited materials, confusing instructions
|
| 588 |
+
|
| 589 |
+
**`unacceptable`** - Quality too low for productive use
|
| 590 |
+
- Severely impaired communication with major errors
|
| 591 |
+
- Incoherent, nonsensical, or corrupted content
|
| 592 |
+
- No reliable substantive value
|
| 593 |
+
- Broken formatting or technical corruption
|
| 594 |
+
- Cannot determine intended meaning or extract useful information
|
| 595 |
+
- Example: Corrupted text, severe translation errors, spam content, SEO content, completely broken formatting
|
| 596 |
+
|
| 597 |
+
##### Quality Assessment Guidelines:
|
| 598 |
+
- **Comprehension**: Can the intended message be clearly understood?
|
| 599 |
+
- **Substantive value**: Does the content provide useful information or insights?
|
| 600 |
+
- **Technical presentation**: Is the content properly formatted and readable?
|
| 601 |
+
- **Error impact**: Do errors significantly impede understanding or credibility?
|
| 602 |
+
- **Professional standards**: Does the content meet basic standards for its intended purpose?
|
| 603 |
+
|
| 604 |
+
**Language-Specific Quality Indicators:**
|
| 605 |
+
- For non-Latin scripts (Arabic, Chinese, Japanese): Check for proper character encoding
|
| 606 |
+
- For agglutinative languages (Turkish, Finnish): Adjust expectations for word count/density
|
| 607 |
+
- For languages with different formality levels (Japanese, Korean): Assess appropriate register
|
| 608 |
+
- Mixed-language documents: Evaluate code-switching quality and appropriateness
|
| 609 |
+
---
|
| 610 |
+
|
| 611 |
+
#### 9. Information Density
|
| 612 |
+
|
| 613 |
+
**What we're measuring**: Ratio of valuable information to redundancy, padding, and repetition.
|
| 614 |
+
|
| 615 |
+
##### Values & Criteria:
|
| 616 |
+
|
| 617 |
+
**`dense`** - Efficient, information-packed content
|
| 618 |
+
- Every sentence adds new information or insight
|
| 619 |
+
- Minimal redundancy or unnecessary elaboration
|
| 620 |
+
- Little to no repetition of the same concepts
|
| 621 |
+
- Example: Technical specifications, concise academic writing, quality reference material
|
| 622 |
+
|
| 623 |
+
**`adequate`** - Good information content with reasonable elaboration
|
| 624 |
+
- Most content adds value with some acceptable elaboration
|
| 625 |
+
- Minimal repetition within the document
|
| 626 |
+
- Good balance of information and explanation
|
| 627 |
+
- Example: Well-written articles, good tutorials with examples
|
| 628 |
+
|
| 629 |
+
**`moderate`** - Mixed substantive content with noticeable padding
|
| 630 |
+
- Some valuable information mixed with unnecessary elaboration
|
| 631 |
+
- Noticeable repetition of key points for emphasis
|
| 632 |
+
- Some sections feel padded or verbose
|
| 633 |
+
- Example: Blog posts with some fluff, articles with repetitive conclusions
|
| 634 |
+
|
| 635 |
+
**`thin`** - Low information content with significant problems
|
| 636 |
+
- Much content doesn't add new information
|
| 637 |
+
- High internal repetition and excessive redundancy
|
| 638 |
+
- Significant padding to reach desired length
|
| 639 |
+
- Example: SEO-optimized content, poorly edited writing
|
| 640 |
+
|
| 641 |
+
**`empty`** - Dominated by repetition and meaningless content
|
| 642 |
+
- Minimal actual information value
|
| 643 |
+
- Dominated by repetition and copy-paste artifacts
|
| 644 |
+
- Same ideas repeated multiple times without development
|
| 645 |
+
- Example: Spam content, template-filled pages, keyword-stuffed articles
|
| 646 |
+
|
| 647 |
+
##### Common Repetition Patterns to Watch For:
|
| 648 |
+
- **Same phrases repeated throughout** (especially in SEO content)
|
| 649 |
+
- **Identical paragraphs** or sections (copy-paste errors)
|
| 650 |
+
- **Circular reasoning** (saying the same thing in different ways)
|
| 651 |
+
- **Template artifacts** (repeated boilerplate mixed with content)
|
| 652 |
+
|
| 653 |
+
---
|
| 654 |
+
|
| 655 |
+
#### 10. Educational Value
|
| 656 |
+
|
| 657 |
+
**What we're measuring**: Potential for teaching, learning, and knowledge transfer.
|
| 658 |
+
|
| 659 |
+
##### Values & Criteria:
|
| 660 |
+
|
| 661 |
+
**`high`** - Clear instructional design and learning objectives
|
| 662 |
+
- Explicitly teaches concepts or skills
|
| 663 |
+
- Progressive skill building from basic to advanced
|
| 664 |
+
- Clear learning objectives and outcomes
|
| 665 |
+
- Comprehensive explanations with examples
|
| 666 |
+
- Example: Quality tutorials, textbooks, structured courses, educational guides
|
| 667 |
+
|
| 668 |
+
**`moderate`** - Good instructional value with some learning potential
|
| 669 |
+
- Some instructional elements present
|
| 670 |
+
- Explanations help build understanding
|
| 671 |
+
- Transferable knowledge to other contexts
|
| 672 |
+
- Good examples or illustrations
|
| 673 |
+
- Example: How-to articles, explanatory content, informative guides
|
| 674 |
+
|
| 675 |
+
**`basic`** - Limited educational content
|
| 676 |
+
- Some explanations but not systematically instructional
|
| 677 |
+
- Basic explanations of concepts
|
| 678 |
+
- Limited learning potential or skill building
|
| 679 |
+
- Example: Basic explanations, simple informational content
|
| 680 |
+
|
| 681 |
+
**`minimal`** - Little educational value
|
| 682 |
+
- Primarily informational rather than instructional
|
| 683 |
+
- No clear learning objectives or skill building
|
| 684 |
+
- Entertainment or commercial focus
|
| 685 |
+
- Example: Entertainment content, basic news, commercial content
|
| 686 |
+
|
| 687 |
+
**`none`** - No educational content
|
| 688 |
+
- No instructional value or learning potential
|
| 689 |
+
- Purely transactional, entertainment, or administrative
|
| 690 |
+
- No knowledge transfer potential
|
| 691 |
+
- Example: Pure entertainment, transactions, legal boilerplate
|
| 692 |
+
|
| 693 |
+
##### Disambiguation tips
|
| 694 |
+
- Explanatory vs Educational: explanations alone ≠ educational design; require intent to teach plus scaffolding for Basic+
|
| 695 |
+
- Reference docs: typically Minimal; promote to Basic/Moderate when guided “how-to” segments or curated examples exist
|
| 696 |
+
- Reviews/op-eds: None/Minimal unless they include actionable how-to guidance designed for learning
|
| 697 |
+
|
| 698 |
+
##### Automation heuristics
|
| 699 |
+
- Keywords: Objectives/Outcomes, Lesson, Exercise/Quiz, Homework, Assessment, Syllabus, Module, Unit, Learning Goals
|
| 700 |
+
- Structure: numbered steps + prerequisites/requirements → Basic; add practice tasks/solutions → Moderate; syllabus/modules/assessments → High
|
| 701 |
+
- Signals of non-edu mix: heavy CTAs/ads or product pitches → cap at Minimal unless clear instructional scaffolding
|
| 702 |
+
|
| 703 |
+
##### Quick decision tree
|
| 704 |
+
- Are there explicit learning goals or a syllabus? → High
|
| 705 |
+
- Else, are there step-by-step instructions with examples/exercises? → Moderate
|
| 706 |
+
- Else, are there explanatory sections intended to teach basics? → Basic
|
| 707 |
+
- Else, is there any minor instructional element? → Minimal
|
| 708 |
+
- Otherwise → None
|
| 709 |
+
|
| 710 |
+
##### Borderline examples
|
| 711 |
+
- API reference with examples but no guidance → Minimal to Basic (depending on clarity/examples)
|
| 712 |
+
- Blog post explaining concept with analogies and one example → Basic
|
| 713 |
+
- Tutorial with tasks, checkpoints, and solutions → High
|
| 714 |
+
- Product documentation with “Getting Started” and “How-To” flows → Moderate
|
| 715 |
+
|
| 716 |
+
##### Educational Indicators:
|
| 717 |
+
- **Learning objectives**: Clear goals for what reader should learn
|
| 718 |
+
- **Skill progression**: Builds from basic to advanced concepts
|
| 719 |
+
- **Examples and practice**: Provides concrete examples or exercises
|
| 720 |
+
- **Knowledge transfer**: Concepts applicable beyond immediate context
|
| 721 |
+
|
| 722 |
+
---
|
| 723 |
+
|
| 724 |
+
#### 11. Reasoning Indicators
|
| 725 |
+
|
| 726 |
+
**What we're measuring**: Presence and quality of logical reasoning, analysis, and explanatory content.
|
| 727 |
+
|
| 728 |
+
##### Values & Criteria:
|
| 729 |
+
|
| 730 |
+
**`analytical`** - Complex reasoning and systematic analysis
|
| 731 |
+
- Multi-step arguments with logical progression
|
| 732 |
+
- Cause-effect analysis and systematic thinking
|
| 733 |
+
- Considers multiple perspectives or variables
|
| 734 |
+
- Draws conclusions from evidence and reasoning
|
| 735 |
+
- Example: Research analysis, complex problem-solving, systematic evaluations
|
| 736 |
+
|
| 737 |
+
**`explanatory`** - Clear explanations with logical flow
|
| 738 |
+
- Explains how or why things work
|
| 739 |
+
- Shows cause-effect relationships clearly
|
| 740 |
+
- Educational reasoning that builds understanding
|
| 741 |
+
- Logical connections between concepts
|
| 742 |
+
- Example: Good tutorials, educational content, how-to explanations
|
| 743 |
+
|
| 744 |
+
**`basic_reasoning`** - Simple logical connections
|
| 745 |
+
- Some logical connections between ideas
|
| 746 |
+
- Basic explanations of concepts or processes
|
| 747 |
+
- Elementary analytical thinking
|
| 748 |
+
- Simple cause-effect relationships
|
| 749 |
+
- Example: Basic explanations, simple arguments, elementary analysis
|
| 750 |
+
|
| 751 |
+
**`minimal`** - Limited reasoning, mostly descriptive
|
| 752 |
+
- Primarily describes what rather than why or how
|
| 753 |
+
- Few logical connections between ideas
|
| 754 |
+
- Mostly factual statements without analysis
|
| 755 |
+
- Little explanatory content
|
| 756 |
+
- Example: Basic descriptions, simple factual content, minimal analysis
|
| 757 |
+
|
| 758 |
+
**`none`** - No clear reasoning present
|
| 759 |
+
- Purely descriptive content
|
| 760 |
+
- Simple factual listing without connections
|
| 761 |
+
- Narrative content without analysis
|
| 762 |
+
- No logical argumentation or explanation
|
| 763 |
+
- Example: Simple lists, basic narratives, pure description
|
| 764 |
+
|
| 765 |
+
##### Thinking-trace signals (what to look for)
|
| 766 |
+
- Stepwise structure: numbered steps in proofs/derivations/solutions; “First… therefore… hence… so…”
|
| 767 |
+
- Hypothesis and test: assumptions, intermediate results, counterexamples, sanity checks
|
| 768 |
+
- Tool- or method-calls: named algorithms, theorems, lemmas, or procedures invoked and justified
|
| 769 |
+
- Error analysis or reflection: “we tried X, failed because Y, so we…”, “limitations,” “edge cases”
|
| 770 |
+
- Intermediate artifacts: scratch calculations, partial code reasoning, sub-problems and sub-claims
|
| 771 |
+
|
| 772 |
+
##### Disambiguation rules
|
| 773 |
+
- Explanatory vs Analytical: explanations tell how; analytical shows multi-step inference with evidence and intermediate claims
|
| 774 |
+
- Worked example vs Mere answer: worked examples expose steps and justification; mere answers without steps are not reasoning-rich
|
| 775 |
+
- Procedural vs Reasoning: procedural lists actions; reasoning links actions via logic, evidence, or constraints
|
| 776 |
+
|
| 777 |
+
##### Automation heuristics
|
| 778 |
+
- Lexical cues: because, therefore, thus, hence, suppose/assume, we conclude, by induction, lemma/theorem/proof, O(n), hypothesis, counterexample
|
| 779 |
+
- Structure cues: presence of proof blocks, derivations (e.g., “Proof.”, “QED”, TeX environments), multi-step numeric calculations
|
| 780 |
+
- Program reasoning: code comments like “// invariant”, “// complexity”, pre/post-conditions, test reasoning
|
| 781 |
+
- Thresholding: count reasoning cues per 1k tokens; with ≥2 structural cues or ≥5 lexical cues → at least explanatory; proofs/derivations → analytical
|
| 782 |
+
|
| 783 |
+
##### Quick decision tree
|
| 784 |
+
- Is there a proof/derivation or multi-step argument with intermediate claims? → analytical
|
| 785 |
+
- Else, does it explain why/how with cause-effect and logical links? → explanatory
|
| 786 |
+
- Else, are there simple logical connections or one-step justifications? → basic_reasoning
|
| 787 |
+
- Else, does it mostly describe without connecting ideas? → minimal if a few logical connections remain; none if purely descriptive
|
| 788 |
+
|
| 789 |
+
##### Borderline examples
|
| 790 |
+
- Answer-only solutions (final numeric result without steps) → minimal
|
| 791 |
+
- Step-by-step math solution with intermediate equations → analytical
|
| 792 |
+
- “How it works” article connecting 2–3 causal steps without data → explanatory
|
| 793 |
+
- Troubleshooting log with attempts and justifications → analytical if causal chain is explicit; otherwise explanatory
|
| 794 |
+
|
| 795 |
+
##### Key Reasoning Patterns to Identify:
|
| 796 |
+
- **Cause-effect**: "Because X, therefore Y"
|
| 797 |
+
- **Problem-solution**: Identifies problems and proposes solutions
|
| 798 |
+
- **Comparison**: Analyzes similarities and differences
|
| 799 |
+
- **Logical progression**: Ideas build on previous ideas
|
| 800 |
+
- **Evidence-based conclusions**: Draws conclusions from presented evidence
|
| 801 |
+
|
| 802 |
+
---
|
| 803 |
+
|
| 804 |
+
### Audience and Purpose
|
| 805 |
+
|
| 806 |
+
#### 12. Audience Level
|
| 807 |
+
|
| 808 |
+
**What we're measuring**: Intended sophistication level and background knowledge assumptions of the target audience.
|
| 809 |
+
|
| 810 |
+
##### Values & Criteria:
|
| 811 |
+
|
| 812 |
+
**`expert`** - Highly specialized professional/academic content
|
| 813 |
+
- Assumes deep domain expertise and advanced training
|
| 814 |
+
- Uses technical terminology without explanation
|
| 815 |
+
- Content for practitioners actively working in specialized fields
|
| 816 |
+
- Example: Climate modeling methodology in Nature Climate Change, research papers, technical specifications, expert-to-expert communications
|
| 817 |
+
|
| 818 |
+
**`advanced`** - Educated adult audience with analytical skills
|
| 819 |
+
- Assumes higher education and critical thinking ability
|
| 820 |
+
- Explains specialized concepts but uses sophisticated language
|
| 821 |
+
- Intellectually challenging but accessible to educated generalists
|
| 822 |
+
- Example: Complex climate change analysis in The Atlantic, quality journalism, policy analysis, advanced general interest content
|
| 823 |
+
|
| 824 |
+
**`general`** - General adult audience
|
| 825 |
+
- Accessible to most educated adults without specialized background
|
| 826 |
+
- Explains technical concepts when introduced
|
| 827 |
+
- Uses clear language while maintaining intellectual substance
|
| 828 |
+
- Example: Quality journalism, general interest articles, accessible explanations of complex topics
|
| 829 |
+
|
| 830 |
+
**`beginner`** - Introductory level with minimal prerequisites
|
| 831 |
+
- Explains basic concepts and terminology
|
| 832 |
+
- Builds up from fundamental principles
|
| 833 |
+
- Assumes minimal prior knowledge of the subject area
|
| 834 |
+
- Example: Introductory tutorials, beginner guides, basic explanations, getting-started content
|
| 835 |
+
|
| 836 |
+
**`youth`** - Targeted at teenagers and young adults (ages 13-19)
|
| 837 |
+
- Age-appropriate complexity with contemporary cultural references
|
| 838 |
+
- Sophisticated enough for developing critical thinking but accessible
|
| 839 |
+
- May address topics relevant to adolescent experiences and concerns
|
| 840 |
+
- Example: High school educational content, young adult literature, teen-focused explanations, college prep materials
|
| 841 |
+
|
| 842 |
+
**`children`** - Designed specifically for children
|
| 843 |
+
- Simple language and concepts appropriate for young readers
|
| 844 |
+
- Educational content designed for elementary/middle school levels
|
| 845 |
+
- Age-appropriate topics and complexity
|
| 846 |
+
- Example: Children's educational content, elementary school materials, simple explanations for young learners
|
| 847 |
+
|
| 848 |
+
##### Assessment Guidelines:
|
| 849 |
+
- **Professional context**: Is this content designed for workplace use or for general learning?
|
| 850 |
+
- **Terminology density**: How much specialized vocabulary is used without explanation?
|
| 851 |
+
- **Concept complexity**: How sophisticated are the ideas and their development?
|
| 852 |
+
- **Background assumptions**: What education level and domain knowledge does the author assume?
|
| 853 |
+
|
| 854 |
+
**Cross-Linguistic Considerations:**
|
| 855 |
+
- Expert terminology density varies by language (German allows more compound terms)
|
| 856 |
+
- Formality markers differ across cultures
|
| 857 |
+
- Educational level assumptions vary by country's education system
|
| 858 |
+
- Age-appropriate content differs across cultures
|
| 859 |
+
|
| 860 |
+
---
|
| 861 |
+
|
| 862 |
+
#### 13. Commercial Bias
|
| 863 |
+
|
| 864 |
+
**What we're measuring**: How much commercial interests influence the objectivity and informational value of content.
|
| 865 |
+
|
| 866 |
+
##### Values & Criteria:
|
| 867 |
+
|
| 868 |
+
**`none`** - No commercial influence detected
|
| 869 |
+
- Objective, informational presentation
|
| 870 |
+
- No promotional language or commercial agenda
|
| 871 |
+
- Focus purely on informing or educating
|
| 872 |
+
- Example: Academic papers, objective journalism, educational content
|
| 873 |
+
|
| 874 |
+
**`minimal`** - Slight commercial context but maintains objectivity
|
| 875 |
+
- May mention products/services but in informational context
|
| 876 |
+
- Maintains balanced, objective tone
|
| 877 |
+
- Commercial mentions serve informational purpose
|
| 878 |
+
- Example: Product reviews with balanced analysis, informational articles mentioning relevant products
|
| 879 |
+
|
| 880 |
+
**`moderate`** - Some commercial influence on content
|
| 881 |
+
- Mix of informational and promotional content
|
| 882 |
+
- Some promotional language but still provides useful information
|
| 883 |
+
- Commercial interests somewhat visible but not dominant
|
| 884 |
+
- Example: Company blogs with useful information, sponsored content with actual value
|
| 885 |
+
|
| 886 |
+
**`heavy`** - Strong commercial bias throughout
|
| 887 |
+
- Primarily promotional with some informational elements
|
| 888 |
+
- Heavy use of marketing language and persuasive techniques
|
| 889 |
+
- Clear commercial agenda affects content objectivity
|
| 890 |
+
- Example: Marketing articles disguised as information, heavily biased product comparisons
|
| 891 |
+
|
| 892 |
+
**`pure_marketing`** - Entirely commercial/promotional content
|
| 893 |
+
- No genuine informational value beyond promotion
|
| 894 |
+
- Pure marketing copy or advertising material
|
| 895 |
+
- Designed solely to drive sales or conversions
|
| 896 |
+
- Example: Sales pages, pure advertising copy, promotional brochures
|
| 897 |
+
|
| 898 |
+
##### Key Indicators:
|
| 899 |
+
- **Language tone**: Objective vs. promotional language
|
| 900 |
+
- **Primary purpose**: Inform vs. persuade/sell
|
| 901 |
+
- **Balance**: Are alternatives/drawbacks mentioned?
|
| 902 |
+
- **Call-to-action**: Subtle information vs. obvious sales pitch
|
| 903 |
+
|
| 904 |
+
---
|
| 905 |
+
|
| 906 |
+
#### 14. Time-Sensitivity
|
| 907 |
+
|
| 908 |
+
**What we're measuring**: How time-sensitive the content is - whether its value degrades over time or remains stable.
|
| 909 |
+
|
| 910 |
+
##### Values & Criteria:
|
| 911 |
+
|
| 912 |
+
**`evergreen`** - Content remains valuable indefinitely
|
| 913 |
+
- Fundamental concepts, principles, theories
|
| 914 |
+
- Historical information and established facts
|
| 915 |
+
- Skills and techniques that don't change
|
| 916 |
+
- Reference materials with lasting value
|
| 917 |
+
- Example: Mathematical proofs, language grammar guides, classical literature analysis, basic cooking techniques
|
| 918 |
+
|
| 919 |
+
**`slowly_changing`** - Content remains valuable for years
|
| 920 |
+
- Best practices that evolve slowly
|
| 921 |
+
- Technical content that updates every few years
|
| 922 |
+
- Cultural and social topics with gradual change
|
| 923 |
+
- Example: Programming language tutorials, academic textbooks, industry standards, educational curricula
|
| 924 |
+
|
| 925 |
+
**`regularly_updating`** - Content valuable for months to a year
|
| 926 |
+
- Industry trends and market analysis
|
| 927 |
+
- Technology reviews and comparisons
|
| 928 |
+
- Policy discussions and current research
|
| 929 |
+
- Example: Software framework guides, business strategies, product reviews, research summaries
|
| 930 |
+
|
| 931 |
+
**`time_sensitive`** - Content value degrades quickly
|
| 932 |
+
- News and current events
|
| 933 |
+
- Time-bound information (prices, schedules, availability)
|
| 934 |
+
- Temporary situations or short-term trends
|
| 935 |
+
- Real-time data and statistics
|
| 936 |
+
- Example: Stock prices, weather reports, breaking news, event announcements, sales/promotions
|
| 937 |
+
|
| 938 |
+
##### Key Decision Points:
|
| 939 |
+
- **Core question**: If someone reads this in 2 years, will it still be valuable?
|
| 940 |
+
- **Update frequency**: How often does this type of information typically change?
|
| 941 |
+
- **Temporal references**: Does the content heavily reference "now," "recently," "currently"?
|
| 942 |
+
- **Subject matter stability**: Is this about unchanging principles or evolving situations?
|
| 943 |
+
|
| 944 |
+
---
|
| 945 |
+
|
| 946 |
+
### Safety and Compliance
|
| 947 |
+
|
| 948 |
+
#### 15. Content Safety
|
| 949 |
+
|
| 950 |
+
**What we're measuring**: Presence of inappropriate, harmful, or legally problematic content.
|
| 951 |
+
|
| 952 |
+
##### Values & Criteria:
|
| 953 |
+
|
| 954 |
+
**`safe`** - Appropriate for all contexts
|
| 955 |
+
- No concerning content of any type
|
| 956 |
+
- Professional, appropriate language throughout
|
| 957 |
+
- Suitable for general audiences including workplace settings
|
| 958 |
+
|
| 959 |
+
**`mild_concerns`** - Minor issues that don't constitute major problems
|
| 960 |
+
- Occasional mild profanity in context
|
| 961 |
+
- Brief mentions of sensitive topics handled appropriately
|
| 962 |
+
- Minor concerns that don't affect overall suitability
|
| 963 |
+
- Example: Historical discussions of sensitive topics, professional content with mild language
|
| 964 |
+
|
| 965 |
+
**`nsfw`** - Not safe for work or general audiences
|
| 966 |
+
- Explicit sexual content or graphic descriptions
|
| 967 |
+
- Adult themes requiring content warnings
|
| 968 |
+
- Graphic violence or disturbing imagery descriptions
|
| 969 |
+
- Example: Adult content, graphic medical descriptions, explicit violence
|
| 970 |
+
|
| 971 |
+
**`harmful`** - Potentially harmful content requiring careful handling
|
| 972 |
+
- Content promoting dangerous activities or self-harm
|
| 973 |
+
- Hate speech targeting individuals or groups
|
| 974 |
+
- Violent content glorifying harm to others
|
| 975 |
+
- Example: Self-harm content, hate speech, dangerous "how-to" guides
|
| 976 |
+
|
| 977 |
+
**`illegal`** - Illegal content requiring immediate rejection
|
| 978 |
+
- Content promoting clearly illegal activities
|
| 979 |
+
- Material that violates laws in major jurisdictions
|
| 980 |
+
- Example: Terrorist content, child exploitation
|
| 981 |
+
|
| 982 |
+
##### Safety Assessment Guidelines:
|
| 983 |
+
- **Context matters**: Medical/educational discussions of sensitive topics may be appropriate
|
| 984 |
+
- **Intent matters**: Discussing harmful topics for educational purposes vs. promoting them
|
| 985 |
+
- **Audience consideration**: Content appropriate for experts may not be safe for general audiences
|
| 986 |
+
|
| 987 |
+
---
|
| 988 |
+
|
| 989 |
+
#### 16. PII Presence
|
| 990 |
+
|
| 991 |
+
**What we're measuring**: Whether the content contains personally identifiable information that could identify private individuals.
|
| 992 |
+
|
| 993 |
+
##### Values & Criteria:
|
| 994 |
+
|
| 995 |
+
**`no_pii`** - No personal information detected
|
| 996 |
+
- No names of private individuals
|
| 997 |
+
- No contact information (emails, phones, addresses)
|
| 998 |
+
- No identification numbers
|
| 999 |
+
- Public figures and officials mentioned by name are acceptable
|
| 1000 |
+
- Example: News articles about politicians, technical documentation, general information
|
| 1001 |
+
|
| 1002 |
+
**`contains_pii`** - Contains potentially identifiable information
|
| 1003 |
+
- Names of private individuals (non-public figures)
|
| 1004 |
+
- Email addresses, phone numbers, physical addresses
|
| 1005 |
+
- ID numbers (SSN, passport, driver's license, employee IDs)
|
| 1006 |
+
- Medical information about identifiable individuals
|
| 1007 |
+
- Financial account information
|
| 1008 |
+
- Example: Personal blogs with full names, leaked databases, medical case studies with identifying info
|
| 1009 |
+
|
| 1010 |
+
##### Key Decision Points:
|
| 1011 |
+
- **Public vs. Private figures**: Politicians, celebrities, CEOs = public (no PII flag); private citizens = PII
|
| 1012 |
+
- **Context matters**: Academic paper authors and their institutional emails = typically no PII; personal emails in forums = PII
|
| 1013 |
+
- **Aggregated vs. Individual**: Statistical data = no PII; individual records = PII
|
| 1014 |
+
|
| 1015 |
+
---
|
| 1016 |
+
|
| 1017 |
+
### Geographic Relevance
|
| 1018 |
+
|
| 1019 |
+
#### 17. Regional Relevance
|
| 1020 |
+
|
| 1021 |
+
**What we're measuring**: Primary regional, cultural, or geopolitical sphere(s) that the content relates to, regardless of language used.
|
| 1022 |
+
|
| 1023 |
+
**Multi-regional content**: Content can be assigned multiple regional labels if it genuinely spans multiple regions. Choose ALL applicable regions rather than forcing a single primary choice. Always output an array for this property, even if only one region applies.
|
| 1024 |
+
|
| 1025 |
+
##### Values & Criteria:
|
| 1026 |
+
|
| 1027 |
+
**`european`** - European context (EU and broader Europe)
|
| 1028 |
+
- Content about European countries, EU policies, or pan-European topics
|
| 1029 |
+
- European cultural perspectives, social systems, or business practices
|
| 1030 |
+
- References to European cities, institutions, companies, or regulations
|
| 1031 |
+
- Includes: EU member states, UK, Switzerland, Norway, Balkans, etc.
|
| 1032 |
+
- Example: GDPR compliance, European Parliament elections, Schengen area travel, European football leagues
|
| 1033 |
+
|
| 1034 |
+
**`north_american`** - North American context
|
| 1035 |
+
- Content about US, Canada, or Mexico
|
| 1036 |
+
- North American cultural perspectives, USMCA/NAFTA region topics
|
| 1037 |
+
- References to North American institutions, companies, or issues
|
| 1038 |
+
- Example: FDA regulations, Silicon Valley tech, NHL, US constitutional law, Canadian healthcare
|
| 1039 |
+
|
| 1040 |
+
**`east_asian`** - East Asian context
|
| 1041 |
+
- Content about China, Japan, Korea (North/South), Taiwan, Mongolia
|
| 1042 |
+
- East Asian cultural perspectives, Confucian-influenced societies
|
| 1043 |
+
- References to East Asian economic models, companies, or social systems
|
| 1044 |
+
- Example: Gaokao exams, K-pop, Shenzhen tech hub, Japanese work culture, Taiwan semiconductor industry
|
| 1045 |
+
|
| 1046 |
+
**`south_asian`** - South Asian context
|
| 1047 |
+
- Content about India, Pakistan, Bangladesh, Sri Lanka, Nepal, Bhutan, Afghanistan, Maldives
|
| 1048 |
+
- South Asian cultural perspectives, subcontinental issues
|
| 1049 |
+
- References to South Asian institutions, economies, or social structures
|
| 1050 |
+
- Example: IIT entrance exams, Bollywood, cricket leagues, monsoon impacts, caste system discussions
|
| 1051 |
+
|
| 1052 |
+
**`southeast_asian`** - Southeast Asian context
|
| 1053 |
+
- Content about ASEAN countries (Indonesia, Thailand, Vietnam, Philippines, Malaysia, Singapore, etc.)
|
| 1054 |
+
- Southeast Asian regional perspectives and economic integration
|
| 1055 |
+
- References to ASEAN policies, regional companies, or cultural phenomena
|
| 1056 |
+
- Example: ASEAN economic community, Indonesian elections, Singapore financial sector, Thai tourism
|
| 1057 |
+
|
| 1058 |
+
**`middle_eastern`** - Middle Eastern and North African context
|
| 1059 |
+
- Content about Arab states, Iran, Turkey, Israel, North Africa (MENA region)
|
| 1060 |
+
- Middle Eastern cultural perspectives, Islamic finance, regional conflicts
|
| 1061 |
+
- References to Middle Eastern institutions, oil economies, or geopolitics
|
| 1062 |
+
- Example: Gulf Cooperation Council, OPEC decisions, Middle East peace process, Islamic banking
|
| 1063 |
+
|
| 1064 |
+
**`sub_saharan_african`** - Sub-Saharan African context
|
| 1065 |
+
- Content about African countries south of the Sahara
|
| 1066 |
+
- African Union topics, sub-Saharan development issues
|
| 1067 |
+
- References to African institutions, economies, or cultural topics
|
| 1068 |
+
- Example: M-Pesa mobile banking, African Union policies, safari tourism, ubuntu philosophy
|
| 1069 |
+
|
| 1070 |
+
**`latin_american`** - Latin American context
|
| 1071 |
+
- Content about Central and South America, Caribbean
|
| 1072 |
+
- Latin American cultural perspectives, regional integration (Mercosur, etc.)
|
| 1073 |
+
- References to Latin American institutions, economies, or social movements
|
| 1074 |
+
- Example: Mercosur trade, telenovelas, Amazon rainforest, Latin American revolutions
|
| 1075 |
+
|
| 1076 |
+
**`oceanian`** - Oceanian context
|
| 1077 |
+
- Content about Australia, New Zealand, Pacific Island nations
|
| 1078 |
+
- Oceanian perspectives, Pacific regional issues
|
| 1079 |
+
- References to Oceanian institutions, companies, or cultural topics
|
| 1080 |
+
- Example: ANZAC relations, Pacific Island climate change, Australian mining, Māori culture
|
| 1081 |
+
|
| 1082 |
+
**`central_asian`** - Central Asian context
|
| 1083 |
+
- Content about Kazakhstan, Uzbekistan, Turkmenistan, Tajikistan, Kyrgyzstan
|
| 1084 |
+
- Central Asian perspectives, post-Soviet regional dynamics
|
| 1085 |
+
- Silk Road region, resource economies, nomadic heritage
|
| 1086 |
+
- Example: Silk Road initiatives, Caspian Sea resources, post-Soviet transitions
|
| 1087 |
+
|
| 1088 |
+
**`russian_sphere`** - Russian/Post-Soviet context
|
| 1089 |
+
- Content about Russia, Belarus, and strong Russian influence areas
|
| 1090 |
+
- Post-Soviet perspectives, CIS (Commonwealth of Independent States) topics
|
| 1091 |
+
- Russian language content about regional (not global) topics
|
| 1092 |
+
- Example: Russian federal politics, CIS integration, post-Soviet economic transitions
|
| 1093 |
+
|
| 1094 |
+
**`global`** - Genuinely international or universal
|
| 1095 |
+
- Content with truly global scope or application
|
| 1096 |
+
- International organizations, worldwide phenomena, global comparisons
|
| 1097 |
+
- Topics that transcend regional boundaries
|
| 1098 |
+
- Example: UN reports, climate change (global perspective), international standards, pandemic response
|
| 1099 |
+
|
| 1100 |
+
**`culturally_neutral`** - No clear regional focus
|
| 1101 |
+
- Abstract, theoretical, or technical content without regional markers
|
| 1102 |
+
- Universal scientific, mathematical, or philosophical content
|
| 1103 |
+
- Content that could apply equally anywhere without modification
|
| 1104 |
+
- Example: Mathematical proofs, chemical formulas, abstract philosophy, programming concepts
|
| 1105 |
+
|
| 1106 |
+
**`indeterminate`** - Cannot determine regional relevance
|
| 1107 |
+
- Insufficient content to identify regional focus
|
| 1108 |
+
- Mixed or contradictory regional signals
|
| 1109 |
+
- Fragment or corrupted content lacking regional context
|
| 1110 |
+
- Example: Technical specifications without context, isolated data tables
|
| 1111 |
+
|
| 1112 |
+
##### Multi-Regional Examples:
|
| 1113 |
+
- **EU-China trade relations** → `["european", "east_asian"]`
|
| 1114 |
+
- **NAFTA/USMCA impact on Mexican agriculture** → `["north_american", "latin_american"]`
|
| 1115 |
+
- **Indian diaspora in the Gulf states** → `["south_asian", "middle_eastern"]`
|
| 1116 |
+
- **Comparative study of healthcare systems globally** → `["global"]`
|
| 1117 |
+
|
| 1118 |
+
##### Regional Identification Guidelines:
|
| 1119 |
+
|
| 1120 |
+
**Primary indicators:**
|
| 1121 |
+
- **Geographic references**: Countries, cities, regions, landmarks mentioned
|
| 1122 |
+
- **Institutional references**: Governments, companies, universities, organizations specific to region
|
| 1123 |
+
- **Cultural markers**: Holidays, customs, cultural phenomena, sports, entertainment
|
| 1124 |
+
- **Political/economic systems**: References to regional political structures, economic blocs
|
| 1125 |
+
- **Legal/regulatory frameworks**: Region-specific laws, regulations, standards
|
| 1126 |
+
- **Language context**: While not determinative, language can provide regional hints
|
| 1127 |
+
|
| 1128 |
+
**Important distinctions:**
|
| 1129 |
+
- **Language ≠ Region**: Spanish content about East Asian markets = `["east_asian"]`, not `["latin_american"]`
|
| 1130 |
+
- **Company origin vs. topic**: Apple (US company) operating in India = consider actual content focus
|
| 1131 |
+
- **Historical vs. current**: Historical content about ancient Rome = `["european"]` if discussing modern implications
|
| 1132 |
+
- **Diaspora content**: Content about diaspora communities should include both origin and current regions
|
| 1133 |
+
|
| 1134 |
+
**Quality checks:**
|
| 1135 |
+
- If content is in a non-English language but discusses global topics → still mark as `["global"]`
|
| 1136 |
+
- If content compares multiple regions → mark all regions discussed substantially
|
| 1137 |
+
- If content is about a specific place but has universal applications → consider both regional and global tags
|
| 1138 |
+
---
|
| 1139 |
+
|
| 1140 |
+
#### 18. Country Relevance
|
| 1141 |
+
|
| 1142 |
+
**What we're measuring**: Which specific country or countries (if any), anywhere in the world, the content is relevant to.
|
| 1143 |
+
|
| 1144 |
+
**Note**: Always output an array of country names for this property (even when only a single country applies). Use standard country names from any region worldwide (e.g., "germany", "france", "united_states", "united_kingdom", "china", "japan", "brazil", "india", "south_africa", "australia", "canada", etc.). The array may also contain the special values `supranational` or `none`.
|
| 1145 |
+
|
| 1146 |
+
##### Values & Criteria:
|
| 1147 |
+
|
| 1148 |
+
**`{COUNTRY_NAME}`** - Content specifically relevant to a single country
|
| 1149 |
+
- Content explicitly about that country's politics, culture, institutions, or regulations
|
| 1150 |
+
- Content written from that country's cultural perspective
|
| 1151 |
+
- Content addressing that country's specific issues, regulations, or cultural phenomena
|
| 1152 |
+
- Content about that country's cities, companies, institutions, or country-specific topics
|
| 1153 |
+
- Example: For "germany" → German election coverage, Bundesliga content, German legal analysis
|
| 1154 |
+
- Example: For "united_states" → US election coverage, NFL content, US legal analysis
|
| 1155 |
+
- Example: For "japan" → Japanese politics, J-League content, Japanese cultural analysis
|
| 1156 |
+
- Only use country names listed in ISO-3166. Use "united_kingdom" instead of "england", "wales", etc.
|
| 1157 |
+
|
| 1158 |
+
**`supranational`** - For content focused on supranational entities or regions
|
| 1159 |
+
- International organizations, regional blocs, global institutions
|
| 1160 |
+
- Content about supranational policies, international organizations, global governance
|
| 1161 |
+
- Pan-regional analysis that transcends individual countries
|
| 1162 |
+
- Multi-continental or global institutional content
|
| 1163 |
+
- Example: UN resolutions, NATO discussions, EU policy analysis, ASEAN agreements, WTO trade rules
|
| 1164 |
+
|
| 1165 |
+
**`none`** - For content not specifically relevant to any country
|
| 1166 |
+
- Abstract, theoretical, or universal content without geographic specificity
|
| 1167 |
+
- Technical/scientific content that applies globally without country focus
|
| 1168 |
+
- Content that doesn't reference specific countries, cultures, or national contexts
|
| 1169 |
+
- Example: Mathematical proofs, universal scientific principles, abstract philosophical discussions
|
| 1170 |
+
|
| 1171 |
+
|
| 1172 |
+
##### Country Identification Criteria:
|
| 1173 |
+
- **Political content**: Elections, government policies, political parties, political figures specific to the country
|
| 1174 |
+
- **Cultural content**: National traditions, cultural phenomena, historical events specific to the country
|
| 1175 |
+
- **Institutional references**: Government bodies, national companies, universities specific to the country
|
| 1176 |
+
- **Geographic focus**: Cities, regions, landmarks within the country as primary subjects
|
| 1177 |
+
- **Legal/regulatory**: Laws, regulations, legal frameworks specific to the country
|
| 1178 |
+
- **Economic content**: National economic policies, country-specific market analysis
|
| 1179 |
+
- **Sports/media**: National sports leagues, national teams, country-specific media outlets
|
| 1180 |
+
- **Social issues**: Social policies, demographic topics, social movements specific to the country
|
| 1181 |
+
|
| 1182 |
+
---
|
res/bf16_vs_fp8.png
ADDED
|
Git LFS Details
|
res/eu_cofunding.png
ADDED
|
Git LFS Details
|
res/overall_scores_by_model.png
ADDED
|
Git LFS Details
|
res/per_property_scores_by_model.png
ADDED
|
Git LFS Details
|
res/propella_logo.svg
ADDED
|
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<|im_start|>",
|
| 4 |
+
"<|im_end|>",
|
| 5 |
+
"<|object_ref_start|>",
|
| 6 |
+
"<|object_ref_end|>",
|
| 7 |
+
"<|box_start|>",
|
| 8 |
+
"<|box_end|>",
|
| 9 |
+
"<|quad_start|>",
|
| 10 |
+
"<|quad_end|>",
|
| 11 |
+
"<|vision_start|>",
|
| 12 |
+
"<|vision_end|>",
|
| 13 |
+
"<|vision_pad|>",
|
| 14 |
+
"<|image_pad|>",
|
| 15 |
+
"<|video_pad|>"
|
| 16 |
+
],
|
| 17 |
+
"eos_token": {
|
| 18 |
+
"content": "<|im_end|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
},
|
| 24 |
+
"pad_token": {
|
| 25 |
+
"content": "<|endoftext|>",
|
| 26 |
+
"lstrip": false,
|
| 27 |
+
"normalized": false,
|
| 28 |
+
"rstrip": false,
|
| 29 |
+
"single_word": false
|
| 30 |
+
}
|
| 31 |
+
}
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
|
| 3 |
+
size 11422654
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
},
|
| 181 |
+
"151665": {
|
| 182 |
+
"content": "<tool_response>",
|
| 183 |
+
"lstrip": false,
|
| 184 |
+
"normalized": false,
|
| 185 |
+
"rstrip": false,
|
| 186 |
+
"single_word": false,
|
| 187 |
+
"special": false
|
| 188 |
+
},
|
| 189 |
+
"151666": {
|
| 190 |
+
"content": "</tool_response>",
|
| 191 |
+
"lstrip": false,
|
| 192 |
+
"normalized": false,
|
| 193 |
+
"rstrip": false,
|
| 194 |
+
"single_word": false,
|
| 195 |
+
"special": false
|
| 196 |
+
},
|
| 197 |
+
"151667": {
|
| 198 |
+
"content": "<think>",
|
| 199 |
+
"lstrip": false,
|
| 200 |
+
"normalized": false,
|
| 201 |
+
"rstrip": false,
|
| 202 |
+
"single_word": false,
|
| 203 |
+
"special": false
|
| 204 |
+
},
|
| 205 |
+
"151668": {
|
| 206 |
+
"content": "</think>",
|
| 207 |
+
"lstrip": false,
|
| 208 |
+
"normalized": false,
|
| 209 |
+
"rstrip": false,
|
| 210 |
+
"single_word": false,
|
| 211 |
+
"special": false
|
| 212 |
+
}
|
| 213 |
+
},
|
| 214 |
+
"additional_special_tokens": [
|
| 215 |
+
"<|im_start|>",
|
| 216 |
+
"<|im_end|>",
|
| 217 |
+
"<|object_ref_start|>",
|
| 218 |
+
"<|object_ref_end|>",
|
| 219 |
+
"<|box_start|>",
|
| 220 |
+
"<|box_end|>",
|
| 221 |
+
"<|quad_start|>",
|
| 222 |
+
"<|quad_end|>",
|
| 223 |
+
"<|vision_start|>",
|
| 224 |
+
"<|vision_end|>",
|
| 225 |
+
"<|vision_pad|>",
|
| 226 |
+
"<|image_pad|>",
|
| 227 |
+
"<|video_pad|>"
|
| 228 |
+
],
|
| 229 |
+
"bos_token": null,
|
| 230 |
+
"clean_up_tokenization_spaces": false,
|
| 231 |
+
"eos_token": "<|im_end|>",
|
| 232 |
+
"errors": "replace",
|
| 233 |
+
"extra_special_tokens": {},
|
| 234 |
+
"model_max_length": 1010000,
|
| 235 |
+
"pad_token": "<|endoftext|>",
|
| 236 |
+
"split_special_tokens": false,
|
| 237 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 238 |
+
"unk_token": null
|
| 239 |
+
}
|
vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|