class BeamSearchOnlineMixin(ABC):
"""online serving for beam search"""
renderer: BaseRenderer
engine_client: EngineClient
async def beam_search(
self,
prompt: EngineInput,
request_id: str,
params: BeamSearchParams,
lora_request: LoRARequest | None = None,
trace_headers: Mapping[str, str] | None = None,
) -> AsyncGenerator[RequestOutput, None]:
beam_width = params.beam_width
max_tokens = params.max_tokens
ignore_eos = params.ignore_eos
temperature = params.temperature
length_penalty = params.length_penalty
include_stop_str_in_output = params.include_stop_str_in_output
tokenizer = self.renderer.get_tokenizer()
eos_token_id = tokenizer.eos_token_id
sort_beams_key = create_sort_beams_key_function(eos_token_id, length_penalty)
if prompt["type"] == "embeds":
raise NotImplementedError("Embedding prompt not supported for beam search")
# Extract prompt tokens and text based on model type
decoder_prompt = (
prompt if prompt["type"] != "enc_dec" else prompt["decoder_prompt"]
)
prompt_text = decoder_prompt.get("prompt")
prompt_token_ids = decoder_prompt["prompt_token_ids"]
tokenized_length = len(prompt_token_ids)
logprobs_num = 2 * beam_width
sampling_params = SamplingParams(
logprobs=logprobs_num,
max_tokens=1,
temperature=temperature,
)
all_beams = [
BeamSearchSequence(
orig_prompt=prompt,
tokens=prompt_token_ids,
cum_logprob=0,
logprobs=[],
lora_request=lora_request,
)
]
completed = []
for _ in range(max_tokens):
tasks = []
request_id_batch = f"{request_id}-{random_uuid()}"
for i, beam in enumerate(all_beams):
prompt_item = beam.get_prompt()
lora_request_item = beam.lora_request
request_id_item = f"{request_id_batch}-beam-{i}"
task = asyncio.create_task(
collect_from_async_generator(
self.engine_client.generate(
prompt_item,
sampling_params,
request_id_item,
lora_request=lora_request_item,
trace_headers=trace_headers,
)
)
)
tasks.append(task)
output = [x[0] for x in await asyncio.gather(*tasks)]
new_beams = []
# Store all new tokens generated by beam
all_beams_token_id = []
# Store the cumulative probability of all tokens
# generated by beam search
all_beams_logprob = []
# Iterate through all beam inference results
for i, result in enumerate(output):
current_beam = all_beams[i]
# check for error finish reason and abort beam search
if result.outputs[0].finish_reason == "error":
# yield error output and terminate beam search
yield RequestOutput(
request_id=request_id,
prompt=prompt_text,
outputs=[
CompletionOutput(
index=0,
text="",
token_ids=[],
cumulative_logprob=None,
logprobs=None,
finish_reason="error",
)
],
finished=True,
prompt_token_ids=prompt_token_ids,
prompt_logprobs=None,
)
return
if result.outputs[0].logprobs is not None:
logprobs = result.outputs[0].logprobs[0]
all_beams_token_id.extend(list(logprobs.keys()))
all_beams_logprob.extend(
[
current_beam.cum_logprob + obj.logprob
for obj in logprobs.values()
]
)
# Handle the token for the end of sentence (EOS)
all_beams_token_id = np.array(all_beams_token_id)
all_beams_logprob = np.array(all_beams_logprob)
if not ignore_eos:
# Get the index position of eos token in all generated results
eos_idx = np.where(all_beams_token_id == eos_token_id)[0]
for idx in eos_idx:
current_beam = all_beams[idx // logprobs_num]
result = output[idx // logprobs_num]
assert result.outputs[0].logprobs is not None
logprobs_entry = result.outputs[0].logprobs[0]
completed.append(
BeamSearchSequence(
orig_prompt=prompt,
tokens=current_beam.tokens + [eos_token_id]
if include_stop_str_in_output
else current_beam.tokens,
logprobs=current_beam.logprobs + [logprobs_entry],
cum_logprob=float(all_beams_logprob[idx]),
finish_reason="stop",
stop_reason=eos_token_id,
)
)
# After processing, set the log probability of the eos condition
# to negative infinity.
all_beams_logprob[eos_idx] = -np.inf
# Processing non-EOS tokens
# Get indices of the top beam_width probabilities
topn_idx = np.argpartition(np.negative(all_beams_logprob), beam_width)[
:beam_width
]
for idx in topn_idx:
current_beam = all_beams[idx // logprobs_num]
result = output[idx // logprobs_num]
token_id = int(all_beams_token_id[idx])
assert result.outputs[0].logprobs is not None
logprobs_entry = result.outputs[0].logprobs[0]
new_beams.append(
BeamSearchSequence(
orig_prompt=prompt,
tokens=current_beam.tokens + [token_id],
logprobs=current_beam.logprobs + [logprobs_entry],
lora_request=current_beam.lora_request,
cum_logprob=float(all_beams_logprob[idx]),
)
)
all_beams = new_beams
completed.extend(all_beams)
sorted_completed = sorted(completed, key=sort_beams_key, reverse=True)
best_beams = sorted_completed[:beam_width]
for beam in best_beams:
if beam.tokens[-1] == eos_token_id and not ignore_eos:
# Skip the eos token in the text.
tokens = beam.tokens[tokenized_length:-1]
else:
tokens = beam.tokens[tokenized_length:]
beam.text = tokenizer.decode(tokens)
yield RequestOutput(
request_id=request_id,
prompt=prompt_text,
outputs=[
CompletionOutput(
text=beam.text, # type: ignore
cumulative_logprob=beam.cum_logprob,
token_ids=beam.tokens[tokenized_length:],
index=i,
logprobs=beam.logprobs,
finish_reason=beam.finish_reason
if beam.finish_reason is not None
else "length",
stop_reason=beam.stop_reason,
)
for (i, beam) in enumerate(best_beams)
],
finished=True,
prompt_token_ids=prompt_token_ids,
prompt_logprobs=None,
)