Skip to content

Implementation of msgcat and msgmerge utilities from GNU gettext #1161

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 23 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
3ae7cbd
Add definitions for MessageConcatenation and MessageMerge classes in …
soft-suroleb Dec 9, 2024
246671a
Implement basic logic for concatenating catalogs
soft-suroleb Dec 9, 2024
9a217e2
Add options: unique, less-than, more-than, no-wrap, and width
soft-suroleb Dec 9, 2024
a1bf8d4
Implement basic msgmerge logic for working with a compendium
soft-suroleb Dec 9, 2024
a5458fb
Write tests for msgcat
soft-suroleb Dec 9, 2024
6020107
Write tests for msgmerge
soft-suroleb Dec 9, 2024
cb71c93
Add options update, backup, and c_overwrite for a different compendiu…
soft-suroleb Dec 9, 2024
99ac987
Add test for msgmerge compendium overwrite mode with no comments
soft-suroleb Dec 9, 2024
7228cea
Refactor test for msgmerge with compendium-overwrite option
soft-suroleb Dec 9, 2024
2dababc
Create a catalog without fuzzy by default, remove add-location
soft-suroleb Dec 9, 2024
888cdd0
Add tests for using msgcat with plural message forms
soft-suroleb Dec 9, 2024
5202291
Rename msgmerge to merge and msgcat to concat
soft-suroleb Dec 9, 2024
efe2502
Add description to all options
soft-suroleb Dec 9, 2024
133c8db
Merge branch 'master' into master
soft-suroleb Mar 23, 2025
4cbe604
Ability to specify multiple compendiums
soft-suroleb Mar 2, 2025
8568e90
Marking conflicting messages
soft-suroleb Mar 2, 2025
3f37414
Fix PR issues
soft-suroleb Mar 23, 2025
1817bce
Add info about pybabel concat and pybabel merge into docs
soft-suroleb Apr 7, 2025
dd44348
Add usage documentation for pybabel concat and merge commands
soft-suroleb Apr 21, 2025
0a6388d
Fix PR issues
soft-suroleb Apr 27, 2025
5828c13
Add '_conflicts' getter in catalog
soft-suroleb Apr 27, 2025
bbba96e
Rework tests from unittest to pytest
soft-suroleb Apr 27, 2025
01f3793
Merge branch 'master' into master
soft-suroleb Apr 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion babel/messages/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@

import datetime
import re
import os
from collections.abc import Iterable, Iterator
from copy import copy
from difflib import SequenceMatcher
from email import message_from_string
from heapq import nlargest
from string import Formatter
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, TypedDict

from babel import __version__ as VERSION
from babel.core import Locale, UnknownLocaleError
Expand Down Expand Up @@ -338,6 +339,13 @@ def _force_text(s: str | bytes, encoding: str = 'utf-8', errors: str = 'strict')
return str(s)


class ConflictInfo(TypedDict):
    """One conflicting definition of a message, as stored by ``Catalog.add_conflict``."""

    # The message object whose definition conflicts with another one.
    message: Message
    # Name of the file the message came from (``ConcatenateCatalog.run``
    # passes the base name of the input PO file).
    file_name: str
    # Project name supplied by the caller (``ConcatenateCatalog.run`` passes
    # the ``project`` of the catalog the message was read from).
    project: str
    # Version supplied by the caller (``ConcatenateCatalog.run`` passes the
    # ``version`` of the catalog the message was read from).
    version: str


class Catalog:
"""Representation of a message catalog."""

Expand Down Expand Up @@ -381,6 +389,7 @@ def __init__(
self.locale = locale
self._header_comment = header_comment
self._messages: dict[str | tuple[str, str], Message] = {}
self._conflicts: dict[str | tuple[str, str], list[ConflictInfo]] = {}

self.project = project or 'PROJECT'
self.version = version or 'VERSION'
Expand Down Expand Up @@ -747,6 +756,19 @@ def __setitem__(self, id: _MessageID, message: Message) -> None:
f"Expected sequence but got {type(message.string)}"
self._messages[key] = message

def add_conflict(self, message: Message, file_name: str, project: str, version: str) -> None:
    """Record a conflicting definition of ``message`` and mark it as fuzzy.

    A new entry is appended to the list kept in ``self._conflicts`` under
    ``message.id``; the list is created on first use. The message itself is
    modified in place: ``'fuzzy'`` is added to its flags.

    :param message: the message whose definition conflicts with another one
    :param file_name: name of the file the message was read from
    :param project: project name associated with that file
    :param version: project version associated with that file
    """
    # NOTE(review): conflicts are keyed by the raw ``message.id``, so messages
    # that differ only by context share one list -- confirm this is intended.
    conflict: ConflictInfo = {
        'message': message,
        'file_name': file_name,
        'project': project,
        'version': version,
    }
    self._conflicts.setdefault(message.id, []).append(conflict)
    message.flags.add('fuzzy')

def add(
self,
id: _MessageID,
Expand Down
248 changes: 247 additions & 1 deletion babel/messages/frontend.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,15 @@
import sys
import tempfile
import warnings
from collections import defaultdict
from configparser import RawConfigParser
from io import StringIO
from typing import BinaryIO, Iterable, Literal

from babel import Locale, localedata
from babel import __version__ as VERSION
from babel.core import UnknownLocaleError
from babel.messages.catalog import DEFAULT_HEADER, Catalog
from babel.messages.catalog import DEFAULT_HEADER, Catalog, ConflictInfo
from babel.messages.extract import (
DEFAULT_KEYWORDS,
DEFAULT_MAPPING,
Expand Down Expand Up @@ -852,6 +853,247 @@ def run(self):
return


class ConcatenateCatalog(CommandMixin):
description = 'concatenates the specified PO files into single one'
user_options = [
('input-files', None, 'input files'),
('output-file=', 'o', 'write output to specified file'),
('less-than=', '<', 'print messages with less than this many'
'definitions, defaults to infinite if not set '),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
'definitions, defaults to infinite if not set '),
'definitions, defaults to infinite if not set'),

('more-than=', '>', 'print messages with more than this many '
'definitions, defaults to 0 if not set'),
('unique', 'u', 'shorthand for --less-than=2, requests '
'that only unique messages be printed'),
('use-first', None, 'use first available translation for each '
'message, don\'t merge several translations'),
('no-location', None, 'do not write \'#: filename:line\' lines'),
('width=', 'w', 'set output page width'),
('no-wrap', None, 'do not break long message lines, longer than '
'the output page width, into several lines'),
('sort-output', 's', 'generate sorted output'),
('sort-by-file', 'F', 'sort output by file location'),
Comment on lines +869 to +874
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's use the same help texts here too as in the pre-existing tools, for consistency.

]

as_args = 'input-files'

# Options that are plain on/off flags and take no argument. Each name must
# have a matching entry in ``user_options``; 'strict' had none (and no
# attribute in ``initialize_options``), so it is not listed.
boolean_options = [
    'unique',
    'use-first',
    'no-location',
    'no-wrap',
    'sort-output',
    'sort-by-file',
]

def initialize_options(self):
    """Set every command option to its default value.

    Options that take a value default to ``None`` (unset), except
    ``more_than`` which defaults to ``0``; all on/off flags default to
    ``False``.
    """
    # Options that take a value.
    self.input_files = None
    self.output_file = None
    self.less_than = None
    self.more_than = 0
    self.width = None

    # On/off flags. ``no_location`` used to default to ``None`` while every
    # other flag was ``False``; both are falsy, so only the type changes.
    self.unique = False
    self.use_first = False
    self.no_location = False
    self.no_wrap = False
    self.sort_output = False
    self.sort_by_file = False

def finalize_options(self):
if not self.input_files:
raise OptionError('you must specify the input files')
if not self.output_file:
raise OptionError('you must specify the output file')
Comment on lines +905 to +906
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this not output to stdout?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you want the result to be output to stdout or what? It's just that each command in frontend has a similar exception for the output_file

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I meant that it would probably be useful for this command to be able to output to stdout (if no specific output file is set).

msgcat supports this:

The results are written to standard output if no output file is specified or if it is ‘-’.


if self.no_wrap and self.width:
raise OptionError("'--no-wrap' and '--width' are mutually exclusive")
if not self.no_wrap and not self.width:
self.width = 76
elif self.width is not None:
self.width = int(self.width)

if self.more_than is None:
self.more_than = 0
else:
self.more_than = int(self.more_than)
if self.less_than is not None:
self.less_than = int(self.less_than)
if self.unique:
self.less_than = 2
Comment on lines +921 to +922
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess unique, more_than and less_than are all mutually exclusive?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not quite so. More_than and less_than are like bounds. Unique means less_than=2 but boolean

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean... does it make sense to be able to set more_than and less_than along with unique?


def _prepare(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to do a bit more than just preparation... maybe a better name?

templates: list[tuple[str, Catalog]] = []
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like it could be a dict (they are ordered these days, anyway, if the order makes a difference).

message_info = {}

for filename in self.input_files:
with open(filename, 'r') as pofile:
template = read_po(pofile)
for message in template:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should probably also filter out messages with a nullish id.

if message.id not in message_info:
message_info[message.id] = {
'count': 0,
'strings': set(),
}
message_info[message.id]['count'] += 1
message_info[message.id]['strings'].add(message.string if isinstance(message.string, str) else tuple(message.string))
templates.append((filename, template, ))

return templates, message_info
Comment on lines +925 to +941
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would probably end up being simpler without using dicts in message_info, too?

e.g.

Suggested change
templates: list[tuple[str, Catalog]] = []
message_info = {}
for filename in self.input_files:
with open(filename, 'r') as pofile:
template = read_po(pofile)
for message in template:
if message.id not in message_info:
message_info[message.id] = {
'count': 0,
'strings': set(),
}
message_info[message.id]['count'] += 1
message_info[message.id]['strings'].add(message.string if isinstance(message.string, str) else tuple(message.string))
templates.append((filename, template, ))
return templates, message_info
templates: dict[[str, Catalog]] = {}
message_counts = collections.Counter()
message_strings = collections.defaultdict(set)
for filename in self.input_files:
with open(filename, 'r') as pofile:
template = read_po(pofile)
templates[filename] = template
for message in template:
message_counts[message.id] += 1
message_string_tuple = message.string if isinstance(message.string, str) else tuple(message.string)
message_strings[message.id].add(message_string_tuple)
return templates, message_counts, message_strings


def run(self):
    """Concatenate the input catalogs and write the result to ``output_file``.

    Messages are taken from every catalog returned by ``self._prepare()``.
    A message is skipped when the number of its definitions is not above
    ``more_than`` or, if ``less_than`` is set, not below ``less_than``.
    Unless ``use_first`` is set, a message defined more than once with
    differing strings is registered as a conflict on the output catalog.
    """
    merged = Catalog(fuzzy=False)
    sources, stats = self._prepare()

    for source_path, source_catalog in sources:
        # Take the locale from the input catalogs as long as none is set yet.
        if merged.locale is None:
            merged.locale = source_catalog.locale

        for entry in source_catalog:
            if not entry.id:
                continue

            info = stats[entry.id]
            occurrences = info['count']
            distinct_strings = len(info['strings'])

            too_few = occurrences <= self.more_than
            too_many = self.less_than is not None and occurrences >= self.less_than
            if too_few or too_many:
                continue

            has_conflict = occurrences > 1 and distinct_strings > 1
            if has_conflict and not self.use_first:
                merged.add_conflict(
                    entry,
                    os.path.basename(source_path),
                    source_catalog.project,
                    source_catalog.version,
                )

            merged[entry.id] = entry

    # The output catalog is fuzzy as soon as any of its messages is.
    merged.fuzzy = any(entry.fuzzy for entry in merged)

    write_options = {
        'width': self.width,
        'sort_by_file': self.sort_by_file,
        'sort_output': self.sort_output,
        'no_location': self.no_location,
    }
    with open(self.output_file, 'wb') as outfile:
        write_po(outfile, merged, **write_options)


class MergeCatalog(CommandMixin):
description='updates translation PO file by merging them with updated template POT file with using compendium'
user_options=[
('input-files', None, 'def.po (obsolete translations) ref.pot (actual template)'),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
('input-files', None, 'def.po (obsolete translations) ref.pot (actual template)'),
('input-files', None, 'exactly two input files: def.po (obsolete translations); ref.pot (current template)'),

I assume "actual" means "current" here (common false friend) :)

('compendium=', 'C', 'additional library of message translations, may be specified more than once'),
('compendium-overwrite', '', 'overwrite mode of compendium'),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This help text doesn't really help much... 😄

('no-compendium-comment', '', ''),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this have a help text?

('update', 'U', 'pdate def.po, do nothing if def.po already up to date'),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
('update', 'U', 'pdate def.po, do nothing if def.po already up to date'),
('update', 'U', 'update def.po, do nothing if def.po already up to date'),

('output-file=', 'o', 'write output to specified file, the results are written '
'to standard output if no output file is specified'),
Comment on lines +987 to +988
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"the results are written to standard output if no output file is specified" – does that currently work?

('backup', None, 'make a backup of def.po'),
('suffix=', None, 'override the usual backup suffix'),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What "usual backup suffix"? 🤔

('no-fuzzy-matching', 'N', 'do not use fuzzy matching'),
('no-location', None, 'suppress \'#: filename:line\' lines'),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's use the same text as in the other tools' no-location option:

Suggested change
('no-location', None, 'suppress \'#: filename:line\' lines'),
('no-location', None, 'do not include location comments with filename and line number'),

('width=', 'w', 'set output page width'),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's use the same text as in the other tools' width option:

Suggested change
('width=', 'w', 'set output page width'),
('width=', 'w', 'set output line width (default 76)'),

('no-wrap', None, 'do not break long message lines, longer '
'than the output page width, into several lines'),
('sort-output', 's', 'generate sorted output'),
('sort-by-file', 'F', 'sort output by file location'),
]

as_args = 'input-files'

multiple_value_options = (
'compendium'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
'compendium'
'compendium',

)

boolean_options = [
'compendium-overwrite',
'no-compendium-comment',
'update',
'backup',
'no-fuzzy-matching',
'no-location',
'no-wrap',
'sort-output',
'sort-by-file',
]

def initialize_options(self):
    """Reset all command options to their defaults.

    Value options start unset (``None``), flags start disabled (``False``)
    and the backup suffix starts as ``'~'``.
    """
    # Options that carry a value and have no default.
    self.input_files = self.compendium = self.output_file = self.width = None

    # Suffix appended to the definition file name when a backup is made.
    self.suffix = '~'

    # On/off flags.
    self.compendium_overwrite = self.no_compendium_comment = False
    self.update = self.backup = False
    self.no_fuzzy_matching = self.no_location = self.no_wrap = False
    self.sort_output = self.sort_by_file = False

def finalize_options(self):
if not self.input_files or len(self.input_files) != 2:
raise OptionError('must be two po files')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like the order of the files has some semantics to it? definition file, reference file..?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the first one is a file with obsolete translations, the second one is new actual .pot file

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would be useful information here in the error message too.

if not self.output_file and not self.update:
raise OptionError('you must specify the output file or update existing')

if self.no_wrap and self.width:
raise OptionError("'--no-wrap' and '--width' are mutually exclusive")
if not self.no_wrap and not self.width:
self.width = 76
elif self.width is not None:
self.width = int(self.width)

def _get_message_from_compendium(self, compendium):
for file_path in compendium:
Comment on lines +1047 to +1048
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even if the CLI argument is in the singular, it would be nicer to read here in the plural.

Also, this clearly returns things in the plural:

Suggested change
def _get_message_from_compendium(self, compendium):
for file_path in compendium:
def _get_messages_from_compendiums(self, compendium_paths):
if not compendium_paths:
return
for file_path in compendium_paths:

with open(file_path, 'r') as pofile:
catalog = read_po(pofile)
for message in catalog:
yield message, file_path

def run(self):
def_file, ref_file = self.input_files

with open(def_file, 'r') as pofile:
catalog = read_po(pofile)
with open(ref_file, 'r') as pofile:
ref_catalog = read_po(pofile)
catalog.update(
ref_catalog,
no_fuzzy_matching=self.no_fuzzy_matching
)

if self.compendium:
for message, compendium_path in self._get_message_from_compendium(self.compendium):
current = catalog[message.id]
if message.id in catalog and (not current.string or current.fuzzy or self.compendium_overwrite):
Comment on lines +1068 to +1069
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a good place to use the walrus operator – the unlearned reader doesn't know that catalog.__getitem__ can return Nones, so it looks weird that you'd first grab a thing from a dict-like object, and only then check if it is in there.

Suggested change
current = catalog[message.id]
if message.id in catalog and (not current.string or current.fuzzy or self.compendium_overwrite):
if (current := catalog.get(message.id)) and (not current.string or current.fuzzy or self.compendium_overwrite):

if self.compendium_overwrite and not current.fuzzy and current.string:
catalog.obsolete[message.id] = current.clone()

current.string = message.string
if current.fuzzy:
current.flags.remove('fuzzy')

if not self.no_compendium_comment:
current.auto_comments.append(compendium_path)
Comment on lines +1066 to +1078
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could be unnested, and get_messages_from_compendiums could early-return if the passed argument is falsy.

Suggested change
if self.compendium:
for message, compendium_path in self._get_message_from_compendium(self.compendium):
current = catalog[message.id]
if message.id in catalog and (not current.string or current.fuzzy or self.compendium_overwrite):
if self.compendium_overwrite and not current.fuzzy and current.string:
catalog.obsolete[message.id] = current.clone()
current.string = message.string
if current.fuzzy:
current.flags.remove('fuzzy')
if not self.no_compendium_comment:
current.auto_comments.append(compendium_path)
for message, compendium_path in self._get_message_from_compendium(self.compendium):
current = catalog[message.id]
if message.id in catalog and (not current.string or current.fuzzy or self.compendium_overwrite):
if self.compendium_overwrite and not current.fuzzy and current.string:
catalog.obsolete[message.id] = current.clone()
current.string = message.string
if current.fuzzy:
current.flags.remove('fuzzy')
if not self.no_compendium_comment:
current.auto_comments.append(compendium_path)


catalog.fuzzy = any(message.fuzzy for message in catalog)
output_path = def_file if self.update else self.output_file

if self.update and self.backup:
shutil.copy(def_file, def_file + self.suffix)

with open(output_path, 'wb') as outfile:
write_po(
outfile,
catalog,
no_location=self.no_location,
width=self.width,
sort_by_file=self.sort_by_file,
sort_output=self.sort_output,
)


class CommandLineInterface:
"""Command-line interface.

Expand All @@ -866,13 +1108,17 @@ class CommandLineInterface:
'extract': 'extract messages from source files and generate a POT file',
'init': 'create new message catalogs from a POT file',
'update': 'update existing message catalogs from a POT file',
'concat': 'concatenates and merges the specified PO files',
'merge': 'combines two Uniforum-style PO files into one',
}

command_classes = {
'compile': CompileCatalog,
'extract': ExtractMessages,
'init': InitCatalog,
'update': UpdateCatalog,
'concat': ConcatenateCatalog,
'merge': MergeCatalog,
}

log = None # Replaced on instance level
Expand Down
Loading